-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfarmer.py
145 lines (62 loc) · 1.6 KB
/
farmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""Build a TF-IDF vocabulary from the 'Review' column of sw_good.csv."""

import string

import nltk
import pandas as pd
import scipy
import sklearn.feature_extraction.text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the reviews; the preprocessing loop further down reads the
# 'Review' column of this frame.
data = pd.read_csv("sw_good.csv")

# Vectorizer that is fitted on the collected words at the end of the script.
vectorizer = TfidfVectorizer()

# Accumulates every significant word found across all reviews.
vocab_list = []
# In[ ]:
"""Prepare the text for analysis."""
# In[32]:
# The punctuation-stripping table and the stop-word set do not depend on the
# review being processed, so build them once instead of once per review.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

for item in data['Review']:
    # Skip cells that are not text (pandas yields NaN floats for empty CSV
    # cells); they cannot be tokenized and carry no words anyway.
    if not isinstance(item, str):
        continue
    # split into words and convert to lowercase
    tokens = [w.lower() for w in word_tokenize(item)]
    # remove punctuation, then drop anything that is not purely alphabetic
    # (this also discards tokens that became empty after stripping)
    stripped = [w.translate(table) for w in tokens]
    # filter out stop words; words is now a list of the significant words
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    vocab_list.extend(words)
# Report how many significant words were collected in total.
vocab_size = len(vocab_list)
print(vocab_size)

# Show the first collected word as a quick sanity check.
first_word = vocab_list[0]
print(first_word)
# Fit the vectorizer on the collected words.
"""
In Jupyter notebook the fitting had to be done to vocab_list[0] but
it gives me an error in Python when I try to use it that way.
"""
# NOTE(review): vocab_list holds one word per element, so fit() sees every
# word as its own document. The learned vocabulary is unaffected, but the
# IDF weights are presumably not what a per-review fit would give -- confirm
# whether whole (cleaned) reviews should be passed here instead.
vectorizer.fit(vocab_list)

# Summarize: show the term -> column-index mapping and how many terms it has.
vector_export = vectorizer.vocabulary_
print(vector_export)
print(len(vector_export))

# TODO: transform the reviews with the fitted vectorizer (not implemented).