Submitting the project #491

Closed · wants to merge 5 commits
1 change: 1 addition & 0 deletions Hepatitis C Virus Analysis and Prediction/DATASET
@@ -0,0 +1 @@
The dataset provided for this project can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
1 change: 1 addition & 0 deletions Hepatitis C Virus Analysis and Prediction/Dataset.txt
@@ -0,0 +1 @@
The dataset that was used can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
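
If working outside Kaggle, the same dataset can typically be downloaded with the Kaggle CLI (assuming the kaggle package is installed and an API token is configured):

kaggle datasets download -d mohamedzaghloula/hepatitis-c-virus-egyptian-patients --unzip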
15 changes: 15 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/README.md
@@ -0,0 +1,15 @@
# Hepatitis C Virus Analysis and Prediction
## Goal
The goal of this project is to analyze the Hepatitis C dataset and build an ML prediction model with maximum accuracy.

## Table of Contents

- EDA
- Preprocessing
- Model implementation (using LR, KNN, SVM, RF)
- Classification metrics

## Results

The models with the highest accuracy were Logistic Regression and Random Forest, at 0.51 and 0.55 respectively.
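
As a minimal sketch of how this comparison can be reproduced (assuming the preprocessed `X_train_scaled`, `X_test_scaled`, `y_train`, `y_test` from the notebook):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assumes X_train_scaled, X_test_scaled, y_train, y_test already exist
# (see hepatitismodel.ipynb for the actual preprocessing steps).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"{name}: accuracy = {acc:.2f}")
```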

6 changes: 6 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/Requirements
@@ -0,0 +1,6 @@
Libraries used:
1. numpy
2. pandas
3. matplotlib
4. seaborn
5. sklearn
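
These can typically be installed with pip; note that sklearn is distributed under the package name scikit-learn:

pip install numpy pandas matplotlib seaborn scikit-learn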
371 changes: 371 additions & 0 deletions Hepatitis C Virus Analysis and Prediction/hepatitismodel.ipynb
@@ -0,0 +1,371 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


df = pd.read_csv('/kaggle/input/hepatitis-c-virus-egyptian-patients/HCV-Egypt-Data.csv')
df.dtypes
## Checking for null values
print(df.isnull().sum())
df.describe()
plt.figure(figsize=(3, 4))
sns.histplot(df['BMI'], bins=20, kde=True)
plt.title('Histogram of BMI')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['BMI'])
plt.title('Box Plot of BMI')
plt.xlabel('BMI')
plt.show()
plt.figure(figsize=(3, 4))
sns.histplot(df['Fever'], bins=20, kde=True)
plt.title('Histogram of Fever')
plt.xlabel('Fever')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Fever'])
plt.title('Box Plot of Fever')
plt.xlabel('Fever')
plt.show()
plt.figure(figsize=(3, 4))
sns.histplot(df['ALT 36'], bins=20, kde=True)
plt.title('Histogram of ALT 36')
plt.xlabel('ALT 36')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['ALT 36'])
plt.title('Box Plot of ALT 36')
plt.xlabel('ALT 36')
plt.show()
#### OUTLIER DETECTED (ALT 36)
plt.figure(figsize=(3, 4))
sns.histplot(df['ALT after 24 w'], bins=20, kde=True)
plt.title('Histogram of ALT after 24 w')
plt.xlabel('ALT after 24 w')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['ALT after 24 w'])
plt.title('Box Plot of ALT after 24 w')
plt.xlabel('ALT after 24 w')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA Base'], bins=20, kde=True)
plt.title('Histogram of RNA Base')
plt.xlabel('RNA Base')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA Base'])
plt.title('Box Plot of RNA Base')
plt.xlabel('RNA Base')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA 4'], bins=20, kde=True)
plt.title('Histogram of RNA 4')
plt.xlabel('RNA 4')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA 4'])
plt.title('Box Plot of RNA 4')
plt.xlabel('RNA 4')
plt.show()

### OUTLIER DETECTED
plt.figure(figsize=(3, 4))
sns.histplot(df['RNA 12'], bins=20, kde=True)
plt.title('Histogram of RNA 12')
plt.xlabel('RNA 12')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA 12'])
plt.title('Box Plot of RNA 12')
plt.xlabel('RNA 12')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['RNA EOT'], bins=20, kde=True)
plt.title('Histogram of RNA EOT')
plt.xlabel('RNA EOT')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['RNA EOT'])
plt.title('Box Plot of RNA EOT')
plt.xlabel('RNA EOT')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['Baseline histological Grading'], bins=20, kde=True)
plt.title('Histogram of Baseline histological Grading')
plt.xlabel('Baseline histological Grading')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Baseline histological Grading'])
plt.title('Box Plot of Baseline histological Grading')
plt.xlabel('Baseline histological Grading')
plt.show()

plt.figure(figsize=(3, 4))
sns.histplot(df['Baselinehistological staging'], bins=20, kde=True)
plt.title('Histogram of Baselinehistological staging')
plt.xlabel('Baselinehistological staging')
plt.ylabel('Frequency')
plt.show()

#BOX PLOT
plt.figure(figsize=(3, 4))
sns.boxplot(x=df['Baselinehistological staging'])
plt.title('Box Plot of Baselinehistological staging')
plt.xlabel('Baselinehistological staging')
plt.show()

###DEALING WITH OUTLIERS
from scipy.stats import zscore

z_scores_ALT_24w = zscore(df['ALT after 24 w'])
z_scores_RNA_12 = zscore(df['RNA 12'])
print(z_scores_ALT_24w)
print(z_scores_RNA_12)
# outlier removal
threshold = 3
df_no_outliers_ALT_24w = df[(abs(z_scores_ALT_24w) <= threshold)]
df_no_outliers_RNA_12 = df[(abs(z_scores_RNA_12) <= threshold)]

## New box plots after outlier removal

# Re-run box plots for 'ALT after 24 w' and 'RNA 12' after handling outliers
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.boxplot(x=df_no_outliers_ALT_24w['ALT after 24 w'])
plt.title('Box Plot for ALT after 24 w (Outliers Removed)')

plt.subplot(1, 2, 2)
sns.boxplot(x=df_no_outliers_RNA_12['RNA 12'])
plt.title('Box Plot for RNA 12 (Outliers Removed)')

plt.show()

# Re-run summary statistics for 'ALT after 24 w' and 'RNA 12' after handling outliers
summary_stats_after_outlier_removal_ALT_24w = df_no_outliers_ALT_24w['ALT after 24 w'].describe()
summary_stats_after_outlier_removal_RNA_12 = df_no_outliers_RNA_12['RNA 12'].describe()

print("Summary Statistics for ALT after 24 w (After Outlier Removal):")
print(summary_stats_after_outlier_removal_ALT_24w)

print("\nSummary Statistics for RNA 12 (After Outlier Removal):")
print(summary_stats_after_outlier_removal_RNA_12)

### CORRELATION ANALYSIS
correlation_matrix = df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title("Correlation Matrix Heatmap")
plt.show()

##### Conclusion: the white or neutral areas of the heatmap indicate weaker correlations.
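
# A small follow-up sketch (not in the original notebook): rank features by the
# absolute strength of their correlation with the target, so the heatmap
# reading above becomes concrete.
target_corr = correlation_matrix['Baseline histological Grading'].drop('Baseline histological Grading')
print(target_corr.abs().sort_values(ascending=False).head(10))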

# Assume 'Histological Grading' is the target variable
target_variable = 'Baseline histological Grading'
X = df.drop(target_variable, axis=1)
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#####RANDOM FOREST #####
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 150, 500, 1000], 'max_depth': [None, 10, 20, 30, 40, 50]}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results
print(f"Best Model: {best_model}")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)
# Assuming 'best_model' is your trained RandomForestClassifier
feature_importance = best_model.feature_importances_

# Create a DataFrame to display feature importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance scores
print(feature_importance_df)

#######LOGISTIC REGRESSION####
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


# Binarize the target for logistic regression (assumption: the original notebook
# uses y_binary without defining it; a median split is one plausible reconstruction)
y_binary = (y > y.median()).astype(int)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l2']}

# Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# GridSearchCV
grid_search = GridSearchCV(logreg_model, param_grid, cv=20, scoring='accuracy')

# Fitting the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Use the best parameters to create the final model
final_logreg_model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], random_state=42)
final_logreg_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the final model
y_pred = final_logreg_model.predict(X_test_scaled)

# Evaluate the final model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results for the final model
print(f"Logistic Regression Model (After Hyperparameter Tuning)")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

#####KNN####
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'X' and 'y' are your features and target variable, respectively

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (important for KNN)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select top k features using ANOVA F-statistic
k_best = SelectKBest(f_classif, k=5)
X_train_selected = k_best.fit_transform(X_train_scaled, y_train)
X_test_selected = k_best.transform(X_test_scaled)

# Perform Grid Search to find the best k value
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

best_k = grid_search.best_params_['n_neighbors']

# KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train_selected, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print results
print(f"KNN Classifier with k={best_k}")
print(f"Accuracy on Test Set: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)




#####SVM###
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Using the outlier-filtered DataFrame df_no_outliers_ALT_24w from above

# Define features (X) and target variable (y)
features = df_no_outliers_ALT_24w[['ALT after 24 w', 'RNA 12']]
target = df_no_outliers_ALT_24w['Baselinehistological staging']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the SVM model
svm_model = SVC(kernel='rbf', C=1.0)
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f'Accuracy: {accuracy}')
print('\nConfusion Matrix:\n', conf_matrix)
print('\nClassification Report:\n', classification_rep)