-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
89 lines (73 loc) · 2.66 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Note: to run this app, execute `streamlit run app.py` in a terminal.
import streamlit as st
import pickle as pkl
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import re
import string
# Load the pre-trained TF-IDF vectorizer and classifier from the working
# directory. Context managers make sure both file handles are closed as soon
# as unpickling finishes instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted artifacts.
with open("Vectorizer.pkl", "rb") as vectorizer_file:
    tfidf = pkl.load(vectorizer_file)
with open("Model.pkl", "rb") as model_file:
    model = pkl.load(model_file)

# Set up the Streamlit app header
st.header("Email/SMS Spam Classifier")

# Single lemmatizer instance shared by every clean_text() call
lemmatizer = WordNetLemmatizer()
# Function to clean and preprocess the input text
def clean_text(text: str) -> str:
    """Normalize raw message text into a space-joined string of lemmas.

    Steps, in order: lowercase, drop ``@mentions``, drop ASCII punctuation,
    trim surrounding whitespace, tokenize, remove English stop words,
    POS-tag the remaining tokens, and lemmatize each one.

    Args:
        text: The raw SMS/e-mail text entered by the user.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    text = text.lower()
    # Remove mentions (e.g., @username)
    text = re.sub(r"@[\w-]+", "", text)
    # Remove every character listed in string.punctuation in one pass
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()

    tokens = word_tokenize(text)

    # Build the stop-word set once per call; the corpus list used to be
    # re-read for every single token, and set membership is O(1).
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # NOTE(review): NLTK's default tagger presumably emits Penn Treebank tags,
    # where adjectives start with "J", so the "a" case below likely never
    # matches. Left as-is because the pickled vectorizer/model were presumably
    # fitted on text preprocessed exactly this way -- confirm before changing.
    lemmas = []
    for word, tag in pos_tag(tokens):
        pos = tag[0].lower()
        if pos in ("a", "n", "v"):
            lemmas.append(lemmatizer.lemmatize(word, pos=pos))
        else:
            # Fall back to the lemmatizer's default part of speech
            lemmas.append(lemmatizer.lemmatize(word))

    return " ".join(lemmas)
# Function to validate input text
def validate_input(text):
    """Check that the user supplied enough text to classify.

    Surrounding whitespace is ignored, so a message made only of spaces or
    newlines counts as empty and padding does not count towards the minimum
    length.

    Args:
        text: The raw text from the input box (may be ``None`` or empty).

    Returns:
        An error message string when the input is rejected, otherwise
        ``None``.
    """
    stripped = text.strip() if text else ""
    # Reject missing, empty, and whitespace-only input
    if not stripped:
        return "Message cannot be empty."
    # Require at least 10 meaningful characters
    if len(stripped) < 10:
        return "Message must be at least 10 characters long."
    return None
# Text area where the user pastes the message to classify
input_text = st.text_area("Enter your SMS/Email")

# Streamlit re-runs this script on every interaction; st.button() is True
# only on the run triggered by clicking the button.
if st.button("Predict"):
    error_message = validate_input(input_text)
    if error_message:
        # Validation failed: show the reason instead of a prediction
        st.error(error_message)
    else:
        # Apply the same preprocessing, then vectorize and classify
        cleaned_text = clean_text(input_text)
        features = tfidf.transform([cleaned_text])
        prediction = model.predict(features)[0]
        # Label 0 is reported as not spam; any other label as spam
        if prediction == 0:
            st.write("Not Spam")
        else:
            st.write("Spam")
else:
    # Idle prompt; names the button by its actual label ("Predict")
    st.write("Click the Predict button to classify the text")