-
Notifications
You must be signed in to change notification settings - Fork 1
/
detector.py
247 lines (204 loc) · 8.49 KB
/
detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# Author: Mantresh Khurana
import os
import pickle
import tkinter as tk
from tkinter import Menu, PhotoImage, ttk
from tkinter import messagebox
import webbrowser
import tweepy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from dotenv import load_dotenv
import urllib.request
# Check that the internet is reachable before going any further; the Twitter
# API calls made later in this script cannot work offline.
try:
    # The timeout keeps start-up from hanging on a stalled connection, and the
    # context manager closes the HTTP response instead of leaking it.
    with urllib.request.urlopen('http://www.google.com', timeout=10):
        pass
except OSError:
    # urllib.error.URLError and socket timeouts are both OSError subclasses.
    # Catching OSError (rather than a bare except) lets Ctrl-C and
    # SystemExit propagate normally.
    messagebox.showerror("Error", "Internet connection not available.")
    # Non-zero status so callers/shells can tell start-up failed.
    raise SystemExit(1)
# The credentials are expected in a .env file in the working directory;
# without one, tell the user what to do and stop.
if not os.path.exists(".env"):
    messagebox.showerror(
        "Error",
        "Please create a .env file with the required environment variables, check .env.example for more info",
    )
    exit()

# The file exists: load its variables into the process environment.
load_dotenv()
# --------------APIs & auth--------------- # DO NOT UNCOMMENT THIS SECTION
# Credentials are read from environment variables; os.getenv returns None for
# any variable that is not set.
consumer_key = os.getenv("CONSUMER_KEY")
consumer_secret = os.getenv("CONSUMER_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")
# --------------APIs & auth END--------------- # DO NOT UNCOMMENT THIS SECTION

# Fail early with the friendly error if any credential is missing or empty,
# instead of handing None to tweepy.
if not all((consumer_key, consumer_secret, access_token, access_token_secret)):
    messagebox.showerror(
        "Error", "Error during authentication, check your .env file.")
    raise SystemExit(1)

try:
    # Building the handler is kept inside the try so that any error raised
    # while setting up or verifying the credentials is reported the same way.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    api.verify_credentials()
except Exception:
    # Start-up boundary: any failure here means the credentials are unusable.
    # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
    messagebox.showerror(
        "Error", "Error during authentication, check your .env file.")
    raise SystemExit(1)

print("Authentication Working.")
# Load the labelled dataset. ***YOU CAN USE YOUR OWN DATASET HERE, REMEMBER TO
# CHANGE THE COLUMN NAMES USED FOR X & Y BELOW***
df = pd.read_csv('datasets/hate_speech.csv')

# Features: token-count vectors built from the 'text' column.
# Target: the 'is_toxic' column.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['text'])
y = df['is_toxic']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Fit a logistic regression classifier on the training portion.
model = LogisticRegression()
model.fit(x_train, y_train)

# Accuracy is the fraction of held-out rows whose predicted label equals the
# true label.
predictions = model.predict(x_test)
accuracy = sum(predictions == y_test) / len(y_test)
print("Model Accuracy(from datasets/hate_speech.csv):", accuracy * 100, "%\n")
# Tweets whose predicted probability (in percent) reaches this value are shown
# in red; everything below it is shown in green.
TOXICITY_THRESHOLD = 65.00


def _format_count(count):
    """Return *count* abbreviated with a B/M/K suffix, e.g. 1500 -> "1.5K".

    Counts below 1000 are returned unabbreviated.
    """
    for divisor, suffix in ((1000000000, "B"), (1000000, "M"), (1000, "K")):
        if count >= divisor:
            return f"{round((count / divisor), 1)}{suffix}"
    return f"{count}"


def _add_tweet_widget(parent, tweet):
    """Add a card for *tweet* to *parent* and colour it by predicted toxicity."""
    tweet_widget = ttk.Frame(parent, padding=10, relief="groove")
    tweet_widget.grid(sticky="ew", pady=10)
    tweet_widget.columnconfigure(1, weight=1)

    user_label = ttk.Label(
        tweet_widget, text=f"{tweet.user.name} (@{tweet.user.screen_name})")
    user_label.grid(row=0, column=0, sticky="w")

    text_label = ttk.Label(tweet_widget, text=tweet.text,
                           wraplength=400, justify="left")
    text_label.grid(row=1, column=0, columnspan=2, sticky="w")

    created_label = ttk.Label(
        tweet_widget, text=tweet.created_at.strftime("%B %d, %Y at %I:%M %p"))
    created_label.grid(row=2, column=0, sticky="w")

    # Likes and retweets share row 2 / column 1, pinned to opposite edges.
    favorite_label = ttk.Label(
        tweet_widget, text=f"❤ {_format_count(tweet.favorite_count)}")
    favorite_label.grid(row=2, column=1, sticky="e")
    retweet_label = ttk.Label(
        tweet_widget, text=f"♺ {tweet.retweet_count}")
    retweet_label.grid(row=2, column=1, sticky="w")

    # Score the tweet: vectorise its text with the fitted vectorizer and take
    # the model's probability at index 1, expressed as a percentage.
    # NOTE(review): index 1 is presumably the "toxic" class -- confirm against
    # the label values in the dataset.
    features = vectorizer.transform([tweet.text])
    percentage = round((model.predict_proba(features)[0][1] * 100), 2)
    print(f"Toxicity: {percentage}%")

    colour = "red" if percentage >= TOXICITY_THRESHOLD else "green"
    text_label.config(foreground=colour)
    percentage_label = ttk.Label(
        tweet_widget, text=f"{percentage}% Toxic", foreground=colour, font="bold")
    percentage_label.grid(row=3, column=0, sticky="w")


def search_tweets():
    """Fetch a user's recent tweets and display each with a toxicity score.

    Reads the screen name from ``username_entry`` and the number of tweets
    from ``count_entry``, retrieves the timeline through ``api``, and rebuilds
    the contents of ``tweet_frame`` as a scrollable list of tweet cards.
    Invalid input or a failed API call is reported in an error dialog and
    leaves the currently displayed results untouched.
    """
    username = username_entry.get().strip()

    # The count comes from a free-text entry, so it may not be a number.
    try:
        count = int(count_entry.get())
    except ValueError:
        count = 0
    if count < 1:
        messagebox.showerror(
            "Error", "Posts must be a positive whole number.")
        return

    # Retrieve the tweets before touching the UI so that a failure (unknown
    # user, rate limit, network error, ...) keeps the previous results.
    try:
        tweets = api.user_timeline(screen_name=username, count=count)
    except Exception as error:
        # Callback boundary: report any API failure instead of letting Tk
        # print a traceback to the console.
        messagebox.showerror(
            "Error", f"Could not retrieve tweets for @{username}: {error}")
        return

    # Clear any existing tweet widgets.
    for widget in tweet_frame.winfo_children():
        widget.destroy()

    # Canvas + scrollbar make the list of tweets scrollable.
    canvas = tk.Canvas(tweet_frame)
    canvas.pack(side="left", fill="both", expand=True)
    scrollbar = ttk.Scrollbar(
        tweet_frame, orient="vertical", command=canvas.yview)
    scrollbar.pack(side="right", fill="y")
    canvas.config(yscrollcommand=scrollbar.set)
    canvas.bind("<Configure>", lambda event: canvas.configure(
        scrollregion=canvas.bbox("all")))
    # Bound application-wide so the wheel scrolls the list wherever the
    # pointer is.
    canvas.bind_all("<MouseWheel>", lambda event: canvas.yview_scroll(
        int(-1 * (event.delta / 120)), "units"))

    # Frame inside the canvas that actually holds the tweet cards.
    canvas_frame = ttk.Frame(canvas)
    canvas.create_window((0, 0), window=canvas_frame, anchor="nw")
    canvas_frame.columnconfigure(0, weight=1)

    for tweet in tweets:
        _add_tweet_widget(canvas_frame, tweet)

    # Save the model once per search; it does not change between tweets.
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)

    # Update the canvas so the scroll region covers all the new cards.
    canvas_frame.update_idletasks()
    canvas.config(scrollregion=canvas_frame.bbox("all"))
# Main application window: fixed 490x700 size, with the project logo as icon.
root = tk.Tk()
root.geometry("490x700")
root.resizable(False, False)
# Keep a module-level reference to the image so it stays alive while in use.
app_icon = PhotoImage(file='images/logo.png')
root.iconphoto(False, app_icon)
# Show the model accuracy in the title bar. A window title is a single line,
# so no trailing newline is appended.
root.title(f"Twitter Toxicity Detector - Accuracy {accuracy * 100:.2f}%")
def _open_link(url):
    """Return a menu callback that opens *url* via the webbrowser module."""
    return lambda: webbrowser.open(url)


# Menu bar with two drop-downs: "File" and "About".
menubar = Menu(root)
root.config(menu=menubar)
file_menu = Menu(menubar, tearoff=False)
help_menu = Menu(menubar, tearoff=False)

# "File" menu: a single entry that closes the application.
menubar.add_cascade(label="File", menu=file_menu, underline=0)
file_menu.add_command(label='Exit', command=root.destroy)

# "About" menu: links to the author's website and the project repository.
menubar.add_cascade(label="About", menu=help_menu)
help_menu.add_command(
    label='Visit Website',
    command=_open_link('https://www.mantreshkhurana.com'),
)
help_menu.add_separator()
help_menu.add_command(
    label='Source Code',
    command=_open_link('https://github.com/mantreshkhurana/twitter-toxicity-detection-python'),
)
help_menu.add_command(
    label='View License',
    command=_open_link(
        'https://github.com/mantreshkhurana/twitter-toxicity-detection-python/blob/stable/LICENSE'),
)
# Search bar across the top of the window.
search_frame = ttk.Frame(root, padding=10)
search_frame.pack(fill="x")
search_frame.columnconfigure(0, weight=1)

# Row 0, left to right: "Username:" label, username entry, "Posts:" label,
# post-count entry (pre-filled with 10), and the Search button.
username_label = ttk.Label(search_frame, text="Username:")
username_label.grid(row=0, column=0, sticky="w")

username_entry = ttk.Entry(search_frame)
username_entry.grid(row=0, column=1, sticky="ew")
# Put the keyboard focus in the username field.
username_entry.focus()

post_label = ttk.Label(search_frame, text="Posts:")
post_label.grid(row=0, column=2, sticky="w")

count_entry = ttk.Entry(search_frame, width=5)
count_entry.insert(0, "10")
count_entry.grid(row=0, column=3, sticky="w")

search_button = ttk.Button(search_frame, command=search_tweets, text="Search")
search_button.grid(row=0, column=4, sticky="e")

# Row 2: author credit spanning all five columns.
author_label = ttk.Label(
    search_frame,
    text="This project was created by Mantresh Khurana",
    foreground="grey",
)
author_label.grid(row=2, column=0, columnspan=5, sticky="w", pady=5, padx=90)
author_label.configure(anchor="center")

# No separate accuracy label is created here; the accuracy is displayed in
# the window title.

# Area below the search bar that search_tweets() fills with results.
tweet_frame = ttk.Frame(root, padding=10)
tweet_frame.pack(fill="both", expand=True)

# Hand control to Tk's event loop.
root.mainloop()