* Stata Code: wordfreq

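/* NOTES:
   - Should also check that each filename argument exists before reading it.
   - Support for filename wildcard characters (* and ?) would be nice.
   - Open question: how to eliminate quote characters from words?
   - Known problem: only a .txt extension is stripped when the count
     variable name is built, so filenames with another extension
     (e.g. .asc) do not yield a valid variable name.
*/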

* version 7.0

program define wordfreq, rclass
  display in text "  Starting WORDFREQ ..."
  while "`1'" ~= "" {
    quietly infile str30 word using `1', automatic clear
    cleanwords word
    local tempname = subinstr("`1'",".txt","",.)
    local tempname = "t" + "`tempname'"
    quietly gen `tempname' = 1
    display in text "    `1' --> `tempname'"
    quietly collapse (count) `tempname', by(word)
    capture confirm file wc_temp.dta
    if _rc==0 {
        quietly sort word
	quietly merge word using wc_temp
	quietly drop _merge
	}
    quietly sort word
    quietly save wc_temp, replace
    macro shift 
  }
  erase wc_temp.dta
  display in text "    ...cleaning up..."
  quietly mvencode _all, mv(0)
  quietly egen tot = rsum(t*)
  quietly gsort -tot
  quietly drop tot
  display in text "  Finished."
end


program define cleanwords, rclass
  args word
  confirm variable `word'
  quietly replace `word' = trim(`word')
  quietly replace `word' = lower(`word')
  quietly /* remove all of the following characters */
  quietly replace `word' = subinstr(`word',"(","",.)
  quietly replace `word' = subinstr(`word',")","",.)
  quietly replace `word' = subinstr(`word',";","",.)
  quietly replace `word' = subinstr(`word',":","",.)
  /* replace `word' = subinstr(`word',"'","",.) */
  quietly replace `word' = subinstr(`word',",","",.)
  quietly replace `word' = subinstr(`word',"*","",.)
  quietly replace `word' = subinstr(`word',"^","",.)
  quietly replace `word' = subinstr(`word',"%","",.)
  quietly replace `word' = subinstr(`word',"{","",.)
  quietly replace `word' = subinstr(`word',"}","",.)
  quietly replace `word' = subinstr(`word',"[","",.)
  quietly replace `word' = subinstr(`word',"]","",.)
  quietly replace `word' = subinstr(`word',"....","",.)
  quietly replace `word' = subinstr(`word',"...","",.)
  quietly replace `word' = subinstr(`word',"..","",.)
  quietly replace `word' = subinstr(`word',"`","",.)
  quietly replace `word' = subinstr(`word',"&","",.)
        
  /* remove these characters only if at the end of a word */
  quietly replace `word' = subinstr(`word',".","",.) if index(`word',".")==(length(`word'))
  quietly replace `word' = subinstr(`word',"?","",.) if index(`word',"?")==(length(`word'))
  quietly replace `word' = subinstr(`word',"!","",.) if index(`word',"!")==(length(`word'))

  quietly replace `word' = subinstr(`word',"'","",.) if index(`word',"'")==1 | index(`word',"'")==(length(`word'))
  quietly replace `word' = subinstr(`word',"`","",.) if index(`word',"'")==1
  quietly replace `word' = subinstr(`word',"-","",.) if index(`word',"-")==1 | index(`word',"-")==(length(`word'))
  quietly replace `word' = subinstr(`word',"_","",.) if index(`word',"_")==1 | index(`word',"_")==(length(`word'))

  /* remove these characters if they stand alone */
  quietly drop if `word' == ""
  quietly drop if `word' == "%"
  quietly drop if `word' == "+"
  quietly drop if `word' == "-"
  quietly drop if `word' == "!"
  quietly drop if `word' == "/"
  quietly drop if `word' == "@"
  quietly drop if `word' == "~"
  quietly drop if `word' == "&"
  quietly sort `word'
  quietly splitnc `word'
  quietly drop `word'
  capture confirm variable `word'2
  if _rc==0 {
     quietly stack `word'*, into(`word') clear
     quietly drop if `word'==""
     quietly drop _stack
     } 
   else quietly rename `word'1 `word'
end

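/**** Disabled earlier draft of cleanwords that uses ; as the statement terminator; kept for reference only.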
program define cleanwords, rclass;
  args wrd;
  display "in cleanwords";
  confirm variable `wrd';
  quietly replace `wrd' = trim(`wrd');
  quietly replace `wrd' = lower(`wrd');
  quietly /* remove all of the following characters */
  quietly replace `wrd' = subinstr(`wrd',"(","",.);
  quietly replace `wrd' = subinstr(`wrd',")","",.);
  quietly replace `wrd' = subinstr(`wrd',";","",.);
  quietly replace `wrd' = subinstr(`wrd',":","",.);
  /* replace `wrd' = subinstr(`wrd',"'","",.); */
  quietly replace `wrd' = subinstr(`wrd',",","",.);
  quietly replace `wrd' = subinstr(`wrd',"*","",.);
  quietly replace `wrd' = subinstr(`wrd',"^","",.);
  quietly replace `wrd' = subinstr(`wrd',"%","",.);
  quietly replace `wrd' = subinstr(`wrd',"{","",.);
  quietly replace `wrd' = subinstr(`wrd',"}","",.);
  quietly replace `wrd' = subinstr(`wrd',"[","",.);
  quietly replace `wrd' = subinstr(`wrd',"]","",.);
  quietly replace `wrd' = subinstr(`wrd',"....","",.);
  quietly replace `wrd' = subinstr(`wrd',"...","",.);
  quietly replace `wrd' = subinstr(`wrd',"..","",.);
  quietly replace `wrd' = subinstr(`wrd',"`","",.);
  quietly replace `wrd' = subinstr(`wrd',"&","",.);
        
  /* remove these characters only if at the end of a word */
  quietly replace `wrd' = subinstr(`wrd',".","",.) if index(`wrd',".")==(length(`wrd'));
  quietly replace `wrd' = subinstr(`wrd',"?","",.) if index(`wrd',"?")==(length(`wrd'));
  quietly replace `wrd' = subinstr(`wrd',"!","",.) if index(`wrd',"!")==(length(`wrd'));

  quietly replace `wrd' = subinstr(`wrd',"'","",.) 
         if index(`wrd',"'")==1 | index(`wrd',"'")==(length(`wrd'));
  quietly replace `wrd' = subinstr(`wrd',"`","",.) if index(`wrd',"'")==1;
  quietly replace `wrd' = subinstr(`wrd',"-","",.) 
         if index(`wrd',"-")==1 | index(`wrd',"-")==(length(`wrd'));
  quietly replace `wrd' = subinstr(`wrd',"_","",.) 
         if index(`wrd',"_")==1 | index(`wrd',"_")==(length(`wrd'));

  /* remove these characters if they stand alone */
  quietly drop if `wrd' == "";
  quietly drop if `wrd' == "%";
  quietly drop if `wrd' == "+";
  quietly drop if `wrd' == "-";
  quietly drop if `wrd' == "!";
  quietly drop if `wrd' == "/";
  quietly drop if `wrd' == "@";
  quietly drop if `wrd' == "~";
  quietly drop if `wrd' == "&";
  quietly sort `wrd';
  quietly splitnc `wrd';
  quietly drop `wrd';
  capture confirm variable `wrd'2;
  if _rc==0 {
     quietly stack `wrd'*, into(`wrd') clear;
     quietly drop if `wrd'=="";
     quietly drop _stack;
     }; else 
  quietly rename `wrd'1 `wrd';
end;

***/