-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
57 lines (31 loc) · 1.36 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from preprocessing_utils import preprocess_text
import pandas as pd
# --- Load dataset ---------------------------------------------------------
# NOTE(review): the L-HSAB corpus is commonly distributed tab-separated;
# delimiter=" " assumes single-space separation — confirm against the file.
path = "data/L-HSAB.txt"
df = pd.read_table(path, delimiter=" ")

# --- Cleaning -------------------------------------------------------------
# Remove duplicated tweets based on the 'Tweet' column, keeping the first
# occurrence.
df = df.drop_duplicates(subset='Tweet', keep='first')

# Remove length outliers via the IQR (Tukey fence) method.
# Character length of each tweet.
df['Tweet_Length'] = df['Tweet'].apply(len)
# First and third quartiles of the length distribution.
Q1 = df['Tweet_Length'].quantile(0.25)
Q3 = df['Tweet_Length'].quantile(0.75)
# Interquartile range and the 1.5*IQR fences.
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Tweets outside the fences (kept for inspection; not used further below).
outliers = df[(df['Tweet_Length'] < lower_bound) | (df['Tweet_Length'] > upper_bound)]
# Keep only in-range tweets. .copy() makes the result an independent frame so
# the column assignments below do not raise SettingWithCopyWarning (and are
# guaranteed to take effect under pandas copy-on-write semantics).
df = df[(df['Tweet_Length'] >= lower_bound) & (df['Tweet_Length'] <= upper_bound)].copy()

# --- Label normalisation --------------------------------------------------
# Merge the 'abusive' class into 'hate' in the 'Class' column.
df['Class'] = df['Class'].replace({'abusive': 'hate'})

# --- Text preprocessing ----------------------------------------------------
# Apply the project's preprocess_text function to each tweet.
df['Tweet'] = df['Tweet'].apply(preprocess_text)
# Optional tokenization (currently disabled):
# df['Tokenized_Tweet'] = df['Tweet'].apply(araby.tokenize)

# Encode labels as integers: 'hate' -> 1, 'normal' -> 0.
# NOTE(review): Series.map yields NaN for any other label value — confirm the
# column contains only {'hate', 'normal'} at this point.
df['Class'] = df['Class'].map({'hate': 1, 'normal': 0})
# print(df)