fake_news_classifier.py

# -*- coding: utf-8 -*-
"""Fake news classifier

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11_HwdJSMsCTCAZ1-q7GmhBxJk_O9Qqb7
"""

# Colab-only setup: fetch the Kaggle "fake-news" competition data.
# files.upload() is the Colab upload helper; the steps below expect a
# kaggle.json API token to be among the uploaded files.
from google.colab import files
files.upload()

# Let's make sure the kaggle.json file is present.
!ls -lha kaggle.json

!pip install -q kaggle

# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

# NOTE(review): kaggle was already installed (unpinned) above; this second
# install pins the client to 1.5.6 - confirm both installs are needed.
!pip install kaggle==1.5.6

# Download the data for the "fake-news" competition (unzipped below from
# fake-news.zip).
!kaggle competitions download -c fake-news

# Extract the archive into the FakeNews/ directory.
!unzip fake-news.zip -d FakeNews

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the training CSV extracted by the setup step and take a first look.
df = pd.read_csv('/content/FakeNews/train.csv')
df.head()

# Summary statistics, per-column non-null counts and missing-value counts.
df.describe()

df.count()

df.isna().sum()

df.shape

# Discard every row that has at least one missing value, then re-check size.
df = df.dropna(axis=0, how='any')
df.shape

# The 'id' column is not used by anything below.
df = df.drop(columns='id')
df.head()

# Positional split: first three remaining columns vs. the fourth one.
# NOTE(review): presumably features vs. label - confirm against the CSV schema.
X = df.iloc[:, :3]
Y = df.iloc[:, 3]

X.head()

Y.head()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Work on a copy so the cleaned frame `df` stays untouched.
news = df.copy()
news.head()

# dropna() left gaps in the row labels; renumber them 0..n-1 so rows can be
# addressed by consecutive integers. Because drop is not requested, the old
# index is kept as an extra leading column.
news = news.reset_index()
news.head(10)

news.shape[0]

# Spot-check one headline.
news.loc[5, 'title']

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

# Turn every headline into a normalised string of stemmed tokens:
#   1. replace everything that is not an ASCII letter with a space,
#   2. lower-case and split on whitespace,
#   3. drop English stop words and stem what is left,
#   4. join the stems back together with single spaces.
ps = PorterStemmer()

# Build the stop-word set and compile the pattern once, outside the loop.
# Calling stopwords.words('english') for every token re-reads the word list
# and scans it linearly each time, which dominates the runtime of this step.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for title in news['title']:
    tokens = non_letters.sub(' ', title).lower().split()
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

corpus[1]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Bag-of-words features: counts of the 5000 most frequent 1- to 3-grams
# found in the cleaned headlines.
countvec = CountVectorizer(ngram_range=(1, 3), max_features=5000)
X = countvec.fit_transform(corpus).toarray()
X.shape

# Target is taken by position from `news`.
# NOTE(review): presumably column 4 is the label - confirm against the schema.
y = news.iloc[:, 4]

y.shape

# Hold out 30% of the rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=3
)

countvec.get_params()

clf = MultinomialNB()

(X_train.shape, y_train.shape)

(X_test.shape, y_test.shape)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision/recall/F1, overall accuracy and the confusion matrix.
print(classification_report(y_test, y_pred))

score = metrics.accuracy_score(y_test, y_pred)
score

cm = metrics.confusion_matrix(y_test, y_pred)
cm

"""# **TF-IDF**"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Same experiment as the count-based model, but with TF-IDF weighted
# 1- to 3-gram features (again capped at 5000 features).
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

X = tfidf.fit_transform(corpus).toarray()
X.shape

# NOTE(review): presumably column 4 is the label - confirm against the schema.
y = news.iloc[:, 4]

# Same 70/30 split and seed as the count-based run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Report the same metrics as for the count-based model.
print(classification_report(y_test, pred))

score = metrics.accuracy_score(y_test, pred)
score

cm = metrics.confusion_matrix(y_test, pred)
cm

"""# **Hyperparameter tuning**"""

import numpy as np

# Sweep the MultinomialNB smoothing parameter over 0.0, 0.1, ..., 0.9 and
# keep the classifier with the highest accuracy in `clf`.
#
# NOTE(review): the grid starts at alpha=0 (no smoothing); depending on the
# scikit-learn version this is either clipped with a warning or used as-is -
# confirm that including 0 is intended.
# NOTE(review): candidates are scored on X_test/y_test, i.e. the same data
# used for the reported metrics - consider a separate validation split.

# Fallback model in case no candidate scores above zero.
clf = MultinomialNB(alpha=0.1)

previous_score = 0
for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train, y_train)
    y_pred = sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score > previous_score:
        # Record the new best score. Without this update every candidate
        # with a non-zero score replaced `clf`, so the last alpha always won
        # regardless of how well it performed.
        previous_score = score
        clf = sub_classifier
    print("Alpha: {}, Score : {}".format(alpha, score))