-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
90 lines (58 loc) · 1.87 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
# Load the first 80,000 rows of the preprocessed dataset and take a quick look.
data = pd.read_csv('preprocessed_data.csv', nrows=80000)
print(data.shape)
print(data.head())
print(data.keys())

# Narrow the frame to the columns of interest; 'headline' and 'category'
# are the ones used as model input and target further down.
news_data = data[['headline', 'authors', 'category']]
print(news_data.head())

# Class distribution and per-column missing-value counts. Both are printed
# explicitly: a bare expression only displays its value inside a notebook,
# in a script it is computed and silently thrown away.
print(news_data['category'].value_counts())
print(news_data.isnull().sum())
# Punctuation characters (a single str) used to filter tokens during cleaning.
import string
punct = string.punctuation
print(punct)

# Data cleaning / modelling dependencies.
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Load the English pipeline by its package name. The bare "en" shortcut
# link only exists on spaCy 2.x, so it is kept purely as a fallback for
# older installs where the model was linked under that name.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

# spaCy's built-in English stop-word collection, materialized as a list.
stopwords = list(STOP_WORDS)
def text_data_cleaning(sentence):
    """Turn *sentence* into a list of normalized tokens.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Each token is reduced to its lower-cased, stripped lemma, except when
    the lemma is the ``'-PRON-'`` placeholder, in which case the token's
    own lower-cased text is used instead.

    Tokens found in ``stopwords`` are dropped. Tokens are also dropped when
    ``token in punct`` is true; because ``punct`` is a plain string this is
    a substring test, so it removes single punctuation characters and also
    empty strings left over after stripping.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    normalized = [
        tok.lower_ if tok.lemma_ == '-PRON-' else tok.lemma_.lower().strip()
        for tok in nlp(sentence)
    ]
    return [
        word
        for word in normalized
        if not (word in stopwords or word in punct)
    ]
#Classification
from sklearn.svm import LinearSVC

# TF-IDF features built with the custom spaCy tokenizer, fed to a linear SVM.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC()

X = news_data['headline']
y = news_data['category']

# No test_size / random_state is passed, so scikit-learn's default split is
# used and the partition (and therefore the scores) differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape, y_train.shape)

clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
print(y_train.head())
clf.fit(X_train, y_train)

# Smoke test on one headline. predict() returns an array with one entry per
# input, so pull out the single label rather than adding a str to an
# ndarray (which depends on the array dtype and prints an array repr).
text = 'How to boil and peel eggs the right way'
pred = clf.predict([text])
print("data data === " + str(pred[0]))

# Evaluate on the held-out split; predict once and reuse the result for
# both metrics.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Model Save
# NOTE: the pickled pipeline stores the tokenizer by reference, so code that
# loads 'news_classifier.pkl' must be able to import text_data_cleaning.
import joblib
joblib.dump(clf, 'news_classifier.pkl')