* Stata Code: phrasefreq

/* NOTES:
   Usage: phrasefreq n text1.txt text2.txt ...
   where n is the maximum phrase length (in words) to count
*/

* version 7.0
#delimit ;

program define phrasefreq, rclass;
  /* phrasefreq n file1.txt file2.txt ...
     n      maximum phrase length in words, passed on to stackphrases
     files  text files, each read with infile as one str30 token per
            observation

     For every file the tokens are cleaned (cleanwords), expanded into
     phrases joined by "_" (stackphrases) and counted.  The per-file counts
     are merged on phrase, so the data left in memory hold one row per
     phrase, one count column per file (named t + the file name with .txt
     removed) and ntuple, the number of words in the phrase.  Rows are
     sorted by descending total count.  Replaces the data in memory.
     Declared rclass but nothing is returned in r(). */
  confirm integer number `1';
  local phrlen = `1';
  display in text "  Starting PHRASEFREQ (length=`phrlen')...";
  macro shift;
  if "`1'" == "" {;
    display in red "phrasefreq: no input files specified";
    exit 198;
  };

  /* A tempfile replaces the fixed wc_temp.dta scratch file.  A copy left
     behind by an aborted run can then never be merged into the counts,
     and nothing in the working directory is overwritten or erased. */
  tempfile scratch;
  tempvar tot;
  local nfiles = 0;
  while "`1'" ~= "" {;
    quietly infile str30 phrase using "`1'", automatic clear;
    cleanwords phrase;
    stackphrases `phrlen' phrase;

    /* the count column is named after the file */
    local colname = subinstr("`1'",".txt","",.);
    local colname = "t" + "`colname'";
    quietly gen `colname' = 1;
    display in text "    `1' --> `colname'";
    quietly collapse (count) `colname', by(phrase);

    /* from the second file on, fold in the counts gathered so far */
    if `nfiles' > 0 {;
      quietly sort phrase;
      quietly merge phrase using "`scratch'";
      quietly drop _merge;
    };
    quietly sort phrase;
    quietly save "`scratch'", replace;
    local nfiles = `nfiles' + 1;
    macro shift;
  };

  display in text "    ...cleaning up...";
  /* a phrase absent from a file has a missing count after the merge */
  quietly mvencode _all, mv(0);
  /* A tempvar holds the row total so that it cannot collide with a count
     column and is not itself matched by t*. */
  quietly egen `tot' = rsum(t*);
  quietly gsort -`tot';
  quietly drop `tot';

  /* Words per phrase = number of "_" separators + 1.  This gives the same
     1, 2, 3 as before for short phrases and is also right for n > 3. */
  quietly gen ntuple = 1 + length(phrase) - length(subinstr(phrase,"_","",.));
  display in text "  Finished.";
end;


program define cleanwords, rclass;
  /* cleanwords varname
     Cleans the string variable named by the first argument, in place:
     trims and lower-cases it, removes punctuation, drops observations
     that are empty or a lone symbol, then runs splitnc on it and restacks
     the pieces under the original variable name.  May replace the data in
     memory (stack, clear).

     NOTE(review): splitnc is not defined in this file.  The code below
     assumes it creates <varname>1, <varname>2, ... - confirm.

     char(96) is the left single quote and char(39) the apostrophe.  A
     left quote typed literally inside a program body starts a macro
     reference, so it is written as char(96) here. */
  args word;
  confirm variable `word';
  quietly replace `word' = trim(`word');
  quietly replace `word' = lower(`word');

  /* remove all of the following characters wherever they occur
     (apostrophes inside a word are deliberately kept) */
  quietly replace `word' = subinstr(`word',"(","",.);
  quietly replace `word' = subinstr(`word',")","",.);
  quietly replace `word' = subinstr(`word',";","",.);
  quietly replace `word' = subinstr(`word',":","",.);
  quietly replace `word' = subinstr(`word',",","",.);
  quietly replace `word' = subinstr(`word',"*","",.);
  quietly replace `word' = subinstr(`word',"^","",.);
  quietly replace `word' = subinstr(`word',"%","",.);
  quietly replace `word' = subinstr(`word',"{","",.);
  quietly replace `word' = subinstr(`word',"}","",.);
  quietly replace `word' = subinstr(`word',"[","",.);
  quietly replace `word' = subinstr(`word',"]","",.);
  quietly replace `word' = subinstr(`word',"....","",.);
  quietly replace `word' = subinstr(`word',"...","",.);
  quietly replace `word' = subinstr(`word',"..","",.);
  quietly replace `word' = subinstr(`word',char(96),"",.);

  /* Remove these characters only if they are the last character of the
     word.  The last character is tested directly, so a word such as
     u.s. loses only its final period. */
  quietly replace `word' = substr(`word',1,length(`word')-1)
          if substr(`word',length(`word'),1)==".";
  quietly replace `word' = substr(`word',1,length(`word')-1)
          if substr(`word',length(`word'),1)=="?";
  quietly replace `word' = substr(`word',1,length(`word')-1)
          if substr(`word',length(`word'),1)=="!";

  /* Strip one leading and one trailing apostrophe and leave interior
     ones alone.  No separate leading left-quote rule is needed because
     every left quote was already removed above. */
  quietly replace `word' = substr(`word',2,.)
          if substr(`word',1,1)==char(39);
  quietly replace `word' = substr(`word',1,length(`word')-1)
          if substr(`word',length(`word'),1)==char(39);

  /* drop observations that are empty or consist of a lone symbol */
  quietly drop if `word' == "";
  quietly drop if `word' == "%";
  quietly drop if `word' == "+";
  quietly drop if `word' == "-";
  quietly drop if `word' == "!";
  quietly drop if `word' == "/";
  quietly drop if `word' == "@";
  quietly drop if `word' == "~";
  quietly drop if `word' == "&";

  quietly splitnc `word';
  quietly drop `word';
  capture confirm variable `word'2;
  if _rc==0 {;
     /* more than one piece: restack all pieces into a single column */
     quietly stack `word'*, into(`word') clear;
     quietly drop if `word'=="";
     quietly drop _stack;
  };
  else {;
     /* a single piece: give it the original name back */
     quietly rename `word'1 `word';
  };
end;


program define stackphrases, rclass;
  /* stackphrases len varname
     len      longest phrase to build, in words
     varname  string variable holding one word per observation, in order

     For every i from 2 to len this creates varname<i>, holding the i
     consecutive words that start at each observation joined by "_".  It
     then stacks varname and every varname<i> into one variable called
     varname and drops empty rows.  Replaces the data in memory.
     Phrase variables are created as str80. */
  args len word;
  confirm variable `word';
  local i = 2;
  while `i' <= `len' {;
      /* start with the first two words, then append words 3 to i */
      quietly gen str80 `word'`i' = `word'[_n] + "_" + `word'[_n+1];
      local j = 3;
      while `j' <= `i' {;
          quietly replace `word'`i' = `word'`i' + "_" + `word'[_n+`j'-1];
          local j = `j' + 1;
      };
      /* The last i-1 observations cannot start a complete i-word phrase.
         An if-condition is used rather than an in-range so that data
         with fewer than i-1 observations do not raise an
         out-of-range error. */
      quietly replace `word'`i' = "" if _n > _N - `i' + 1;
      local i = `i' + 1;
  };
  quietly compress `word'*;
  quietly stack `word'*, into(`word') clear;
  quietly drop _stack;
  /* Where the first apostrophe of a phrase is its first or its last
     character, the underscores are removed from that phrase.
     NOTE(review): the reason is not evident from this file.  Behaviour
     kept unchanged. */
  quietly replace `word' = subinstr(`word',"_","",.)
          if index(`word',"'")==(length(`word')) | index(`word',"'")==1;
  quietly drop if `word' == "";
end;