forked from ahmedshabib/twit-miner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTweetsProcessor.py
79 lines (63 loc) · 2.13 KB
/
TweetsProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import csv
import re
# Code to process tweets and extract their word vectors.
def main():
    """Script entry point.

    Currently a placeholder: it performs no work and returns None. The
    previous body only assigned an unused local variable, which has been
    removed.
    """
    return None
class TweetProcessor:
    """Normalise raw tweet text and turn it into word / bigram features.

    Attributes:
        stopwords: list of words that getwords() discards. It is loaded
            from the stop-word file when the instance is created.
    """

    # Patterns are raw strings (no invalid-escape warnings) and are compiled
    # once here instead of being re-parsed on every processTweet() call.

    # "www.<something>" or "http(s)://<something>" up to the next whitespace.
    _URL_RE = re.compile(r'(?:www\.|https?://)\S+')
    # ":)" and ":-)", wherever they occur (including the end of the text).
    _SMILEY_RE = re.compile(r':-?\)')
    # Literal backslash-u escapes such as "\u00e9"; the text is already
    # lower-cased when this is applied, so a-f covers the hex digits.
    _UNICODE_ESCAPE_RE = re.compile(r'\\u[0-9a-f]{4}')
    # Whitespace followed by a number made of at least two digit characters,
    # optionally containing one decimal point (e.g. " 42", " 3.14").
    _NUMBER_RE = re.compile(r'\s[0-9]+\.?[0-9]+')
    # HTML entities such as "&amp;".
    _ENTITY_RE = re.compile(r'&[a-zA-Z]+;')
    # Runs of punctuation that are blanked out. Square brackets are listed on
    # purpose: the earlier escaped-quote patterns removed them as a side
    # effect and that behaviour is kept.
    _PUNCTUATION_RE = re.compile(r'''["'?!\\()\[\].,:]+''')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, stopwords_path='stopwords'):
        """Create a processor and load its stop words.

        Args:
            stopwords_path: path of a text file holding one stop word per
                line. Defaults to 'stopwords' in the current directory.

        Raises:
            IOError / OSError: if the stop-word file cannot be opened.
        """
        self.stopwords = []
        self.readStopWords(stopwords_path)

    def readStopWords(self, path='stopwords'):
        """Load the stop-word list from *path* into self.stopwords.

        Args:
            path: text file with one stop word per line.

        Raises:
            IOError / OSError: if the file cannot be opened.
        """
        # The context manager guarantees the handle is closed, even on error.
        with open(path, 'r') as handle:
            self.stopwords = handle.read().splitlines()

    def processTweet(self, tweet):
        """Return a cleaned, lower-cased version of *tweet*.

        Steps, in order: lower-case; delete URLs; replace ":)" / ":-)" with
        "SMILEY"; blank out literal "\\uXXXX" escapes; replace numbers that
        follow whitespace with "NUMBER"; blank out HTML entities and
        punctuation; collapse whitespace runs; strip both ends.

        Args:
            tweet: the raw tweet text.

        Returns:
            The normalised text. The "SMILEY" and "NUMBER" placeholders are
            upper-case because they are inserted after lower-casing.
        """
        tweet = tweet.lower()
        # URLs are removed outright rather than replaced by a placeholder.
        tweet = self._URL_RE.sub('', tweet)
        tweet = self._SMILEY_RE.sub(' SMILEY ', tweet)
        tweet = self._UNICODE_ESCAPE_RE.sub(' ', tweet)
        # Must run before punctuation removal so "3.14" is still one token.
        tweet = self._NUMBER_RE.sub(' NUMBER ', tweet)
        tweet = self._ENTITY_RE.sub(' ', tweet)
        tweet = self._PUNCTUATION_RE.sub(' ', tweet)
        # Every replacement above may leave extra blanks; squeeze them and
        # drop the ones left at either end.
        tweet = self._WHITESPACE_RE.sub(' ', tweet)
        return tweet.strip()

    def getwords(self, sentence):
        """Return the distinct words and bigrams of *sentence*.

        Tokens of one or two characters and tokens found in self.stopwords
        are dropped. Bigrams are built from neighbouring surviving tokens,
        so a bigram may span a removed token.

        Args:
            sentence: whitespace-separated text, typically the output of
                processTweet().

        Returns:
            A list without duplicates: single words first, then bigrams,
            each in order of first appearance.
        """
        # A set makes each stop-word check O(1) instead of scanning the list.
        stopwords = set(self.stopwords)
        tokens = [
            token for token in sentence.split()
            if len(token) > 2 and token not in stopwords
        ]
        bigrams = [
            first + ' ' + second
            for first, second in zip(tokens, tokens[1:])
        ]
        # De-duplicate while keeping first-seen order so the result is
        # reproducible from run to run (a bare set() has arbitrary order).
        seen = set()
        unique = []
        for term in tokens + bigrams:
            if term not in seen:
                seen.add(term)
                unique.append(term)
        return unique

    def sortdict(self, catdict, limit=1000000):
        """Return a new dict with the highest-valued entries of *catdict*.

        Args:
            catdict: mapping whose values can be compared with each other.
            limit: maximum number of entries to keep. Defaults to 1000000;
                zero or a negative value yields an empty dict.

        Returns:
            A new dict holding at most *limit* entries, inserted from the
            highest value to the lowest (dicts keep insertion order on
            Python 3.7+). *catdict* itself is not modified.
        """
        ranked = sorted(catdict, key=catdict.get, reverse=True)
        # max() stops a negative limit from slicing relative to the end.
        kept = ranked[:max(limit, 0)]
        return {key: catdict[key] for key in kept}
# Entry point: invoke main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()