import re
import operator
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# For each file "txt 241.txt" .. "txt 277.txt": print its heading and its
# cleaned body text, then print the ten most frequent nouns in the body.

# Running tallies. Nothing in the visible part of this script reads or updates
# them; they are kept unchanged in case code outside this view relies on them.
res = []
notres = []
resmin = 1000000
resmax = 0
ressum = 0
notresmin = 100000
notresmax = 0
notressum = 0

# --- Loop-invariant setup: built once instead of once per file -------------

# NOTE(review): both URL patterns are greedy -- '.*' runs to the LAST '/', 'm'
# or 'l' on the line, so any text that follows a URL on the same line is
# removed as well. The patterns are kept exactly as they were so that the
# results do not change.
url_http = re.compile(r'https?:\/\/.*[//ml]', flags=re.MULTILINE)
url_www = re.compile(r'www.?.*[//ml]', flags=re.MULTILINE)
cleanr = re.compile('<.*?>')  # matches HTML tags

# Substrings replaced by a single space, applied in this order.
# NOTE(review): '2' is in the list, so every digit 2 is blanked out, and '//'
# can never match because '/' has already been replaced -- confirm intent.
delim = [',','?','/','//','!','\\','[',']','&','-',':',';','@','...','>','<','=','****','.',')','(','*','|','2',"'"]

stop_words = set(stopwords.words('english'))

# POS tags that are counted as nouns (plural proper nouns, "NNPS", are not).
noun_tags = ("NN", "NNS", "NNP")

# print(...) with a single argument prints the same text under Python 2 and
# is also valid Python 3.
for i in range(241, 278):
    print("\n")
    num = i  # not read again in the visible code
    name = "txt " + str(i) + ".txt"

    # Read the file once and close it straight away; previously two handles
    # were opened per file and never closed.
    with open(name, "r") as mod:
        print("FILE NO: " + str(i))
        sr = mod.read().lower()

    # First line (the heading) versus the rest of the text.
    heading, newline, body = sr.partition('\n')
    # Tokenised heading; not read again in the visible code.
    start = nltk.word_tokenize(heading + newline)
    # With no newline at all the whole text is kept as the body, as before.
    stri = body if newline else sr
    print(heading)
    print("\n")

    stri = url_http.sub('', stri)  # removes url
    stri = url_www.sub('', stri)  # removes url
    stri = cleanr.sub('', stri)  # removes html tags
    print(stri)
    print('\n')

    # Removes delimiters. The loop variable is deliberately NOT named `i`,
    # so the file number is no longer overwritten by a punctuation string.
    new_s = stri
    for mark in delim:
        new_s = new_s.replace(mark, ' ')
    stri = ' '.join(new_s.split())  # collapse runs of whitespace

    # Tokenise and drop stopwords.
    filtered_sentence = [w for w in nltk.word_tokenize(stri) if w not in stop_words]
    tokens = filtered_sentence

    # Keep only the words tagged as nouns and report the ten most frequent.
    ab = nltk.pos_tag(tokens)
    noun = [item[0] for item in ab if item[1] in noun_tags]
    freq = nltk.FreqDist(noun)
    print(freq.most_common(10))
    print("\n")
    print("\n")
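# NOTE: the two lines below are web-page boilerplate captured along with the
# script, not Python code; they are commented out so the file can be parsed.
# Be the first to comment
# You can use [html][/html], [css][/css], [php][/php] and more to embed the code. Urls are automatically hyperlinked. Line breaks and paragraphs are automatically generated.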