# Written by C. Hafner, Spring 2007 for NLP class (modernized to Python 3).
# Demonstration program for computing frequency counts of the first 1,000 lines
# of the tagged Brown corpus. Each line has 3 data elements: word, tag, location.
import os
from collections import Counter


def count_words(lines):
    """Count word frequencies over an iterable of corpus lines.

    Each line is expected to start with a word token (whitespace-separated);
    a single leading '*' is stripped from the word, matching the original
    script's behavior. Blank lines are skipped (the original crashed on them
    with an IndexError). Returns a collections.Counter mapping word -> count.
    """
    counts = Counter()
    for line in lines:
        fields = line.split()  # default split skips all whitespace
        if not fields:  # robustness: tolerate blank/whitespace-only lines
            continue
        word = fields[0]
        if word[0] == "*":
            word = word[1:]  # remove exactly one leading * (as the original did)
        counts[word] += 1
    return counts


def main():
    """Read startbrown.txt, skip its 4-line header, print the 20 most frequent words."""
    # os.path.join is portable; the original concatenated '\startbrown.txt',
    # which only worked by accident ('\s' is not a recognized escape).
    datapath = os.path.join(os.getcwd(), "startbrown.txt")
    with open(datapath) as brown:  # 'with' guarantees the file is closed
        counts = count_words(brown.readlines()[4:])
    # Sort by count descending, breaking ties by word descending — the same
    # ordering the original produced via its pair-reversal trick.
    ranked = sorted(counts.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    for word, n in ranked[:20]:
        print(f"{word} occurs {n} times.")
    input("Press any key to exit")  # keep the console window open on double-click


if __name__ == "__main__":
    main()