# Written by C. Hafner, Spring 2007 for NLP class (modernized to Python 3).
# Demonstration program for computing frequency counts of the first 1,000 lines
# of the tagged Brown corpus. Each line has 3 data elements: word, tag, location.
import os
from collections import Counter


def count_words(lines):
    """Count word frequencies over an iterable of corpus lines.

    Each line is expected to start with a word token (whitespace-separated);
    a single leading '*' is stripped from the word, matching the original
    script's behavior. Blank lines are skipped (the original crashed on them
    with an IndexError). Returns a collections.Counter mapping word -> count.
    """
    counts = Counter()
    for line in lines:
        fields = line.split()  # default split skips all whitespace
        if not fields:  # robustness: tolerate blank/whitespace-only lines
            continue
        word = fields[0]
        if word[0] == "*":
            word = word[1:]  # remove exactly one leading * (as the original did)
        counts[word] += 1
    return counts


def main():
    """Read startbrown.txt, skip its 4-line header, print the 20 most frequent words."""
    # os.path.join is portable; the original concatenated '\startbrown.txt',
    # which only worked by accident ('\s' is not a recognized escape).
    datapath = os.path.join(os.getcwd(), "startbrown.txt")
    with open(datapath) as brown:  # 'with' guarantees the file is closed
        counts = count_words(brown.readlines()[4:])
    # Sort by count descending, breaking ties by word descending — the same
    # ordering the original produced via its pair-reversal trick.
    ranked = sorted(counts.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    for word, n in ranked[:20]:
        print(f"{word} occurs {n} times.")
    input("Press any key to exit")  # keep the console window open on double-click


if __name__ == "__main__":
    main()