noun

import re
import operator
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# Tallies declared by the original script. Nothing in the visible code reads
# or updates them; they are kept in case code outside this view relies on them.
res = []
notres = []
resmin = 1000000
resmax = 0
ressum = 0
notresmin = 100000
notresmax = 0
notressum = 0

# Input files are named "txt <n>.txt" for n in [FIRST_FILE_NO, LAST_FILE_NO].
FIRST_FILE_NO = 241
LAST_FILE_NO = 277

# A URL ends at whitespace, a quote or an angle bracket, so that neither the
# text following the URL nor an enclosing HTML tag is swallowed with it.
URL_RE = re.compile(r'https?://[^\s<>"\']+')
WWW_RE = re.compile(r'www\.[^\s<>"\']+')
TAG_RE = re.compile('<.*?>')

# Substrings that are replaced by a single space before tokenising.
# NOTE(review): '2' is in the list, so every digit 2 is dropped from the
# text -- confirm this is intentional.
DELIMITERS = [',', '?', '/', '//', '!', '\\', '[', ']', '&', '-', ':', ';',
              '@', '...', '>', '<', '=', '****', '.', ')', '(', '*', '|',
              '2', "'"]

# Part-of-speech tags that count as nouns for the frequency table.
NOUN_TAGS = ("NN", "NNS", "NNP")


def _read_lowercased(path):
    """Return the whole content of *path*, lower-cased."""
    # The context manager closes the handle even if read() raises.
    with open(path, "r") as handle:
        return handle.read().lower()


def _split_heading(text):
    """Split *text* into (first line, everything after the first newline).

    When *text* contains no newline, find() returns -1 and the slice starts
    at 0, so the body is the whole text (the heading is not removed).
    """
    heading = text.split('\n', 1)[0]
    body = text[text.find('\n') + 1:]
    return heading, body


def _clean_markup(text):
    """Remove http(s) URLs, www. addresses and HTML tags from *text*."""
    text = URL_RE.sub('', text)
    text = WWW_RE.sub('', text)
    return TAG_RE.sub('', text)


def _strip_delimiters(text):
    """Replace every delimiter with a space and collapse whitespace runs."""
    for delimiter in DELIMITERS:
        text = text.replace(delimiter, ' ')
    return ' '.join(text.split())


def _extract_nouns(text, stop_words):
    """Return the tokens of *text* that are not stop words and are tagged as nouns."""
    tokens = [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]
    return [word for word, tag in nltk.pos_tag(tokens) if tag in NOUN_TAGS]


def main():
    """Print the heading, cleaned text and ten most common nouns of each input file.

    Raises whatever open() raises when one of the numbered files is missing.
    """
    # The stop-word set is the same for every file, so build it once.
    stop_words = set(stopwords.words('english'))

    for file_no in range(FIRST_FILE_NO, LAST_FILE_NO + 1):
        print("\n")
        text = _read_lowercased("txt " + str(file_no) + ".txt")
        print("FILE NO: " + str(file_no))

        heading, body = _split_heading(text)
        print(heading)
        print("\n")

        # The text is shown after URL/tag removal but before the delimiters
        # are stripped.
        body = _clean_markup(body)
        print(body)
        print('\n')

        nouns = _extract_nouns(_strip_delimiters(body), stop_words)
        print(nltk.FreqDist(nouns).most_common(10))
        print("\n")
        print("\n")


if __name__ == "__main__":
    main()

Be the first to comment