Submitting the project #491

Closed · wants to merge 5 commits
1 change: 1 addition & 0 deletions Hepatitis C Virus Analysis and Prediction/DATASET
@@ -0,0 +1 @@
The dataset provided for this project can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
1 change: 1 addition & 0 deletions Hepatitis C Virus Analysis and Prediction/Dataset.txt
@@ -0,0 +1 @@
The dataset that was used can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
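
If working outside Kaggle, the same dataset can typically be downloaded with the Kaggle CLI (assuming the kaggle package is installed and an API token is configured):

kaggle datasets download -d mohamedzaghloula/hepatitis-c-virus-egyptian-patients --unzip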
15 changes: 15 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/README.md
@@ -0,0 +1,15 @@
# Hepatitis C Virus Analysis and Prediction
## Goal
The goal of this project is to analyze the Hepatitis C dataset and build an ML prediction model with maximum accuracy.

## Table of Contents

- EDA
- Preprocessing
- Model implementation (using LR, KNN, SVM, RF)
- Classification metrics

## Results

The models with the highest accuracy were Logistic Regression and Random Forest, at 0.51 and 0.55 respectively.
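
As a minimal sketch of how this comparison can be reproduced (assuming the preprocessed `X_train_scaled`, `X_test_scaled`, `y_train`, `y_test` from the notebook):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assumes X_train_scaled, X_test_scaled, y_train, y_test already exist
# (see hepatitismodel.ipynb for the actual preprocessing steps).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"{name}: accuracy = {acc:.2f}")
```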

6 changes: 6 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/Requirements
@@ -0,0 +1,6 @@
Libraries used:
1. numpy
2. pandas
3. matplotlib
4. seaborn
5. sklearn
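
These can typically be installed with pip; note that sklearn is distributed under the package name scikit-learn:

pip install numpy pandas matplotlib seaborn scikit-learn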
371 changes: 371 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/hepatitismodel.ipynb
@@ -0,0 +1,371 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


df = pd.read_csv('/kaggle/input/hepatitis-c-virus-egyptian-patients/HCV-Egypt-Data.csv')
df.dtypes
## Checking for null values
print(df.isnull().sum())
df.describe()
plt.figure(figsize=(3, 4))
sns.histplot(df['BMI'], bins=20, kde=True)
plt.title('Histogram of BMI')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['BMI'])
plt.title('Box Plot of BMI')
plt.xlabel('BMI')
plt.show()
plt.figure(figsize=(3, 4))
sns.histplot(df['Fever'], bins=20, kde=True)
plt.title('Histogram of Fever')
plt.xlabel('Fever')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Fever'])
plt.title('Box Plot of Fever')
plt.xlabel('Fever')
plt.show()
plt.figure(figsize=(3, 4))
sns.histplot(df['ALT 36'], bins=20, kde=True)
plt.title('Histogram of ALT 36')
plt.xlabel('ALT 36')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['ALT 36'])
plt.title('Box Plot of ALT 36')
plt.xlabel('ALT 36')
plt.show()
#### OUTLIER DETECTED (ALT 36)
plt.figure(figsize=(3, 4))
sns.histplot(df['ALT after 24 w'], bins=20, kde=True)
plt.title('Histogram of ALT after 24 w')
plt.xlabel('ALT after 24 w')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['ALT after 24 w'])
plt.title('Box Plot of ALT after 24 w')
plt.xlabel('ALT after 24 w')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA Base'], bins=20, kde=True)
plt.title('Histogram of RNA Base')
plt.xlabel('RNA Base')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA Base'])
plt.title('Box Plot of RNA Base')
plt.xlabel('RNA Base')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA 4'], bins=20, kde=True)
plt.title('Histogram of RNA 4')
plt.xlabel('RNA 4')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA 4'])
plt.title('Box Plot of RNA 4')
plt.xlabel('RNA 4')
plt.show()

### OUTLIER DETECTED
plt.figure(figsize=(3, 4))
sns.histplot(df['RNA 12'], bins=20, kde=True)
plt.title('Histogram of RNA 12')
plt.xlabel('RNA 12')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA 12'])
plt.title('Box Plot of RNA 12')
plt.xlabel('RNA 12')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA EOT'], bins=20, kde=True)
plt.title('Histogram of RNA EOT')
plt.xlabel('RNA EOT')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA EOT'])
plt.title('Box Plot of RNA EOT')
plt.xlabel('RNA EOT')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['Baseline histological Grading'], bins=20, kde=True)
plt.title('Histogram of Baseline histological Grading')
plt.xlabel('Baseline histological Grading')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Baseline histological Grading'])
plt.title('Box Plot of Baseline histological Grading')
plt.xlabel('Baseline histological Grading')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['Baselinehistological staging'], bins=20, kde=True)
plt.title('Histogram of Baselinehistological staging')
plt.xlabel('Baselinehistological staging')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Baselinehistological staging'])
plt.title('Box Plot of Baselinehistological staging')
plt.xlabel('Baselinehistological staging')
plt.show()

###DEALING WITH OUTLIERS
from scipy.stats import zscore

z_scores_ALT_24w = zscore(df['ALT after 24 w'])
z_scores_RNA_12 = zscore(df['RNA 12'])
print(z_scores_ALT_24w)
print(z_scores_RNA_12)
# outlier removal
threshold = 3
df_no_outliers_ALT_24w = df[(abs(z_scores_ALT_24w) <= threshold)]
df_no_outliers_RNA_12 = df[(abs(z_scores_RNA_12) <= threshold)]

## New box plots after outlier removal

# Re-run box plots for 'ALT after 24 w' and 'RNA 12' after handling outliers
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.boxplot(x=df_no_outliers_ALT_24w['ALT after 24 w'])
plt.title('Box Plot for ALT after 24 w (Outliers Removed)')

plt.subplot(1, 2, 2)
sns.boxplot(x=df_no_outliers_RNA_12['RNA 12'])
plt.title('Box Plot for RNA 12 (Outliers Removed)')

plt.show()

# Re-run summary statistics for 'ALT after 24 w' and 'RNA 12' after handling outliers
summary_stats_after_outlier_removal_ALT_24w = df_no_outliers_ALT_24w['ALT after 24 w'].describe()
summary_stats_after_outlier_removal_RNA_12 = df_no_outliers_RNA_12['RNA 12'].describe()

print("Summary Statistics for ALT after 24 w (After Outlier Removal):")
print(summary_stats_after_outlier_removal_ALT_24w)

print("\nSummary Statistics for RNA 12 (After Outlier Removal):")
print(summary_stats_after_outlier_removal_RNA_12)

### CORRELATION ANALYSIS
correlation_matrix = df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title("Correlation Matrix Heatmap")
plt.show()

##### Conclusion: the white or neutral areas of the heatmap indicate weaker correlations.
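
# A small follow-up sketch (not in the original notebook): rank features by the
# absolute strength of their correlation with the target, so the heatmap
# reading above becomes concrete.
target_corr = correlation_matrix['Baseline histological Grading'].drop('Baseline histological Grading')
print(target_corr.abs().sort_values(ascending=False).head(10))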

# Assume 'Histological Grading' is the target variable
target_variable = 'Baseline histological Grading'
X = df.drop(target_variable, axis=1)
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#####RANDOM FOREST #####
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 150, 500, 1000], 'max_depth': [None, 10, 20, 30, 40, 50]}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results
print(f"Best Model: {best_model}")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)
# Assuming 'best_model' is your trained RandomForestClassifier
feature_importance = best_model.feature_importances_

# Create a DataFrame to display feature importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance scores
print(feature_importance_df)

#######LOGISTIC REGRESSION####
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


# Binarize the target for logistic regression (assumption: the original notebook
# uses y_binary without defining it; a median split is one plausible reconstruction)
y_binary = (y > y.median()).astype(int)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l2']}

# Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# GridSearchCV
grid_search = GridSearchCV(logreg_model, param_grid, cv=20, scoring='accuracy')

# Fitting the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Use the best parameters to create the final model
final_logreg_model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], random_state=42)
final_logreg_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the final model
y_pred = final_logreg_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results for the final model
print(f"Logistic Regression Model (After Hyperparameter Tuning)")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

#####KNN####
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'X' and 'y' are your features and target variable, respectively

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (important for KNN)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select top k features using ANOVA F-statistic
k_best = SelectKBest(f_classif, k=5)
X_train_selected = k_best.fit_transform(X_train_scaled, y_train)
X_test_selected = k_best.transform(X_test_scaled)

# Perform Grid Search to find the best k value
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

best_k = grid_search.best_params_['n_neighbors']

# KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train_selected, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results
print(f"KNN Classifier with k={best_k}")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)




#####SVM###
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Using the outlier-filtered DataFrame df_no_outliers_ALT_24w from above

# Define features (X) and target variable (y)
features = df_no_outliers_ALT_24w[['ALT after 24 w', 'RNA 12']]
target = df_no_outliers_ALT_24w['Baselinehistological staging']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the SVM model
svm_model = SVC(kernel='rbf', C=1.0)
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f'Accuracy: {accuracy}')
print('\nConfusion Matrix:\n', conf_matrix)
print('\nClassification Report:\n', classification_rep)