-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
237 lines (184 loc) · 8.69 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import plotly.express as px
from wordcloud import WordCloud, get_single_color_func
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
import os
# For gathering data from reddit
import praw # Python Reddit API Wrapper
from praw.models import MoreComments
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.downloader.download('vader_lexicon')
import en_core_web_sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# for visualization
# Global seaborn theme for any matplotlib-based plots in this module.
sns.set(style='whitegrid', palette='Dark2')
# Load Reddit API credentials from a local .env file into the environment.
load_dotenv()
# Build a read-only PRAW client from the env vars; no request is made until
# the client is actually used. NOTE(review): missing env vars surface later
# as PRAW auth errors, not here.
reddit = praw.Reddit(client_id=os.getenv("CLIENT_ID"),
client_secret=os.getenv("CLIENT_SECRET"),
user_agent=os.getenv("USER_AGENT"))
# Search target: r/all, i.e. every public subreddit.
subreddit = reddit.subreddit('all')
# function to get comments
def get_comments(search_term, no_of_posts=10, no_of_comments=20, no_of_top_comments=5, include_replies=False):
    """Fetch comments (and post bodies) for a search term across reddit.

    Args:
        search_term (str): query string to search on reddit
        no_of_posts (int, optional): maximum number of posts to search. Defaults to 10.
        no_of_comments (int, optional): stop once at least this many comments
            are collected. Defaults to 20.
        no_of_top_comments (int, optional): how many highest-scored comments
            to report. Defaults to 5.
        include_replies (bool, optional): when True, also collect nested
            replies. Defaults to False.

    Returns:
        comments_body: list of all comment texts plus the post selftexts
        top_comments_body: dict mapping top comment bodies to their upvote count
        len(post_texts): number of posts fetched
        len(collected): number of comments fetched (excluding post texts)
    """
    post_texts = {}
    collected = []
    for submission in subreddit.search(search_term, limit=no_of_posts, sort='relevance'):
        # Resolve every "load more comments" placeholder up front so the
        # forest contains real Comment objects only.
        submission.comments.replace_more(limit=None)
        batch = submission.comments.list() if include_replies else submission.comments
        # Stickied (pinned/announcement) posts are skipped entirely.
        if submission.stickied:
            continue
        post_texts[submission.id] = submission.selftext
        collected.extend(batch)
        # Enough comments gathered -- no need to fetch further posts.
        if len(collected) >= no_of_comments:
            break
    # Highest-scored comments first.
    collected.sort(key=lambda c: c.score, reverse=True)
    top_comments_body = {c.body: c.score for c in collected[:no_of_top_comments]}
    # Comment texts first, then the original post texts appended at the end.
    comments_body = [c.body for c in collected]
    comments_body.extend(post_texts.values())
    return comments_body, top_comments_body, len(post_texts), len(collected)
# Pre-processing comments
def pre_process_comments(comments):
    """Tokenize, lower-case, remove stop words and lemmatize comments.

    Args:
        comments (list): a list of comments (text)

    Returns:
        lemmatized_tokens: a list of pre-processed tokens
        len(lemmatized_tokens): number of pre-processed tokens
    """
    # Coerce every comment to str, then join with ' , ' so token boundaries
    # between comments are preserved.
    strings_all = [str(i) for i in comments]
    strings_uncleaned = ' , '.join(strings_all)
    # Raw string avoids the invalid-escape SyntaxWarning on Python 3.12+.
    # The URL alternative must come FIRST: regex alternation is ordered, so
    # with '\w+' first it would consume 'http' as a plain word and the
    # 'http\S+' branch could never match a whole URL.
    tokenizer = RegexpTokenizer(r'http\S+|\w+|\$[\d\.]+')
    tokenized_string = tokenizer.tokenize(strings_uncleaned)
    # Normalize case before stop-word filtering (spaCy's list is lowercase).
    tokenized_string_lower = [word.lower() for word in tokenized_string]
    # Remove stop words using spaCy's default English stop-word set.
    nlp = en_core_web_sm.load()
    all_stopwords = nlp.Defaults.stop_words
    tokens_without_stopwords = [
        word for word in tokenized_string_lower if word not in all_stopwords]
    # Lemmatize rather than stem: stemming may return a root that is not an
    # actual word, whereas lemmatizing returns a real dictionary word.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w)
                         for w in tokens_without_stopwords]
    return lemmatized_tokens, len(lemmatized_tokens)
# function to apply sentiment analyzer and get a data frame with scores and labels (based on threshold score) for each word
def apply_sentiment_analyzer(lemmatized_tokens, threshold=0.10):
    """Score each token with VADER and attach a discrete sentiment label.

    Args:
        lemmatized_tokens (list): a list of tokens
        threshold (float, optional): compound-score cutoff used to call a
            word positive or negative. Defaults to 0.10.

    Returns:
        df: DataFrame with columns neg, neu, pos, compound (VADER polarity
        scores), words (the token) and label (0 neutral, 1 positive,
        -1 negative)
    """
    sia = SIA()
    # One record per token: the VADER score dict plus the token itself.
    records = [{**sia.polarity_scores(token), 'words': token}
               for token in lemmatized_tokens]
    df = pd.DataFrame.from_records(records)
    # 'compound' is VADER's normalized one-dimensional sentiment measure;
    # bucket it into -1/0/1 around the symmetric threshold.
    df['label'] = 0  # neutral by default
    df.loc[df['compound'] > threshold, 'label'] = 1  # positive
    df.loc[df['compound'] < -threshold, 'label'] = -1  # negative
    return df
# class for assigning colors to positive and negative words
class SimpleGroupedColorFunc(object):
    """Color function for WordCloud that paints words by sentiment group.

    Words listed under a color in the mapping get that color; every other
    word falls back to the default (neutral) color.
    """

    def __init__(self, color_to_words, default_color):
        """
        Args:
            color_to_words (dict): maps a color to the list of words that
                should be drawn in it (e.g. positive/negative word groups)
            default_color (string): color used for words in no group
        """
        # Invert the color -> words mapping into a flat word -> color lookup.
        lookup = {}
        for color, words in color_to_words.items():
            for word in words:
                lookup[word] = color
        self.word_to_color = lookup
        self.default_color = default_color

    def __call__(self, word, **kwargs):
        # WordCloud invokes this once per rendered word.
        return self.word_to_color.get(word, self.default_color)
# generate bar charts for frequent words
def bar_chart_for_freq_words(words_dict, title, color, no_of_words=20):
    """Build a plotly bar chart of the most frequent words.

    Args:
        words_dict (FreqDist): words and counts,
            e.g. FreqDist({'happy': 1, 'bad': -1, 'fan': 1})
        title (string): title for the chart
        color (string): color for the bars
        no_of_words (int, optional): number of most common words to display.
            Defaults to 20.

    Returns:
        fig: a plotly.graph_objs._figure.Figure
    """
    x_col = "Bar Graph of Frequent Words"
    y_col = "Count"
    # most_common() yields (word, count) pairs -> two unnamed columns 0 and 1.
    freq_df = pd.DataFrame(words_dict.most_common(no_of_words))
    freq_df = freq_df.rename(columns={0: x_col, 1: y_col}, inplace=False)
    fig = px.bar(freq_df, x=x_col, y=y_col, title=title, text_auto='.2s')
    # Count labels drawn outside the bars, in the caller-chosen bar color.
    fig.update_traces(textfont_size=12, textangle=0,
                      textposition="outside", cliponaxis=False,
                      marker_color=color)
    return fig
# generate bar chart for percentage of postive, negative and neutral words
def bar_chart_for_sentiment(perc_of_pos_words, perc_of_neg_words):
    """Build a bar chart of the overall positive/negative word percentages.

    Args:
        perc_of_pos_words (int): proportion of positive words
        perc_of_neg_words (int): proportion of negative words

    Returns:
        fig: a plotly.graph_objs._figure.Figure
    """
    # Fixed color convention: green for positive, red for negative.
    bar_colors = ['green', 'red']
    bar_labels = [f'{perc_of_pos_words}%', f'{perc_of_neg_words}%']
    fig = px.bar(x=['positive', 'negative'],
                 y=[perc_of_pos_words, perc_of_neg_words],
                 labels={'x': 'Sentiment', 'y': 'Percentage'},
                 text_auto='.2s',
                 text=bar_labels,
                 title="Percentage of positive and negative words")
    fig.update_traces(textfont_size=12, textangle=0,
                      textposition="outside", cliponaxis=False,
                      marker_color=bar_colors)
    return fig