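"""Build a per-author corpus of Quora answers.

Reads writer profile slugs from writers.txt, fetches each writer's
answer RSS feed, strips the markup, and saves every sufficiently long
answer both as a plain-text file under corpus/<writer>/ and inside a
pickled list of dicts under pickle/.
"""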
import math
import os
import pickle
import shutil

import feedparser
from bs4 import BeautifulSoup

# Start from a clean slate; ignore_errors so the first run (when the
# directories do not exist yet) does not crash.
shutil.rmtree('corpus', ignore_errors=True)
shutil.rmtree('pickle', ignore_errors=True)
os.mkdir("corpus")
os.mkdir("pickle")

author_no = -1
lis = []  # accumulates one dict per saved answer, across all authors
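# writers.txt is assumed to hold one Quora profile slug per line,
# e.g. "Barack-Obama" (the part of the profile URL after quora.com/,
# matching the feed URL built below).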
with open('writers.txt', 'r') as writers:
    for line in writers:
        line = line.strip()
        os.mkdir("corpus/" + line)
        link = "https://www.quora.com/" + line + "/answers/rss"
        author_no += 1
        print(author_no)

        # Fetch and parse this writer's answer feed.
        feed = feedparser.parse(link)
        print(line + " answers are being parsed")

        for i, entry in enumerate(feed.entries):
            answers = {}
            html = entry['description']
            soup = BeautifulSoup(html, 'html.parser')

            # Drop embedded user-profile links so only the answer body remains.
            for a in soup.find_all('a', class_='user'):
                a.extract()
            text = soup.get_text()

            # Strip the fixed-length footer Quora appends to each feed item.
            text = text[:-22]

            # Skip very short answers; they carry too little stylistic signal.
            if len(text) <= 500:
                continue

            # Keep ASCII only so downstream tools need not handle Unicode.
            text = text.encode('ascii', 'ignore').decode('ascii')
            # (Disabled) An earlier version split answers longer than 5000
            # characters in half at a word boundary, appended each sentence
            # of the first half to sentence.csv, and stored word-count,
            # average-word-length, and standard-deviation features for it.
            fil = "corpus/" + line + "/Answer" + str(i + 1) + ".txt"
            answers['text'] = text
            answers['author_no'] = author_no

            words = text.split(" ")
            sentences = text.split(". ")
            # (Disabled) Sentence rows were previously appended to
            # sentence.csv, and sentence/word counts stored per answer.

            # Average word length and its standard deviation are computed
            # here but not stored at the moment (see the commented keys).
            word_lengths = [len(w) for w in words]
            avg = sum(word_lengths) / len(word_lengths)
            # answers['avg_word_length'] = avg
            std = math.sqrt(sum((l - avg) ** 2 for l in word_lengths) / len(word_lengths))
            # answers['standard_dev'] = std

            lis.append(answers)
            with open(fil, 'w') as fe:
                fe.write(text)

        # Note: lis is cumulative, so each writer's pickle also contains
        # every earlier writer's answers; the last file holds the full corpus.
        with open("pickle/list_" + line + ".b", "wb") as pf:
            pickle.dump(lis, pf)
        print(lis)
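# Example (sketch): reloading one writer's pickled answers later, assuming
# the directory layout produced above; "Barack-Obama" is a placeholder slug.
#
#     with open("pickle/list_Barack-Obama.b", "rb") as fp:
#         answer_dicts = pickle.load(fp)
#     print(len(answer_dicts), "answers loaded")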