diff --git a/Hepatitis C Virus Analysis and Prediction/DATASET b/Hepatitis C Virus Analysis and Prediction/DATASET
new file mode 100644
index 000000000..045cf62e6
--- /dev/null
+++ b/Hepatitis C Virus Analysis and Prediction/DATASET
@@ -0,0 +1 @@
+The dataset provided for this project can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
diff --git a/Hepatitis C Virus Analysis and Prediction/Dataset.txt b/Hepatitis C Virus Analysis and Prediction/Dataset.txt
new file mode 100644
index 000000000..13eb2f5e4
--- /dev/null
+++ b/Hepatitis C Virus Analysis and Prediction/Dataset.txt
@@ -0,0 +1 @@
+The dataset used can be found at https://www.kaggle.com/datasets/mohamedzaghloula/hepatitis-c-virus-egyptian-patients
diff --git a/Hepatitis C Virus Analysis and Prediction/README.md b/Hepatitis C Virus Analysis and Prediction/README.md
new file mode 100644
index 000000000..dc1da0f69
--- /dev/null
+++ b/Hepatitis C Virus Analysis and Prediction/README.md
@@ -0,0 +1,15 @@
+# Hepatitis C Virus Analysis and Prediction
+## Goal
+The goal of this project is to build a machine-learning model with the highest attainable accuracy for analyzing and predicting hepatitis C outcomes in Egyptian patients.
+
+## Table of Contents
+
+- EDA
+- Preprocessing
+- Model implementation (using LR, KNN, SVM, RF)
+- Classification metrics
+
+## Results
+
+The models with the highest accuracy were Logistic Regression and Random Forest, scoring 0.51 and 0.55 respectively.
+
diff --git a/Hepatitis C Virus Analysis and Prediction/Requirements b/Hepatitis C Virus Analysis and Prediction/Requirements
new file mode 100644
index 000000000..0e710683d
--- /dev/null
+++ b/Hepatitis C Virus Analysis and Prediction/Requirements
@@ -0,0 +1,6 @@
+Libraries used:
+1. numpy
+2. pandas
+3. matplotlib
+4. seaborn
+5. scikit-learn (imported as sklearn)
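As a quick environment check, the short sketch below (an illustrative addition, not a file in the original repository) verifies that every library in the Requirements list is importable and prints the installed versions:

import numpy
import pandas
import matplotlib
import seaborn
import sklearn

# Each of these packages exposes __version__ at the top level.
for lib in (numpy, pandas, matplotlib, seaborn, sklearn):
    print(f"{lib.__name__}: {lib.__version__}")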
diff --git a/Hepatitis C Virus Analysis and Prediction/hepatitismodel.ipynb b/Hepatitis C Virus Analysis and Prediction/hepatitismodel.ipynb
new file mode 100644
index 000000000..b4d05cd8a
--- /dev/null
+++ b/Hepatitis C Virus Analysis and Prediction/hepatitismodel.ipynb
@@ -0,0 +1,371 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import accuracy_score, classification_report
+
+df = pd.read_csv('/kaggle/input/hepatitis-c-virus-egyptian-patients/HCV-Egypt-Data.csv')
+df.dtypes
+
+## Checking for null values
+print(df.isnull().sum())
+df.describe()
+
+### EDA: histogram and box plot for each column of interest.
+### OUTLIERS DETECTED in 'ALT after 24 w' and 'RNA 12';
+### these are handled in the outlier-removal step further below.
+def plot_distribution(col):
+    plt.figure(figsize=(3, 4))
+    sns.histplot(df[col], bins=20, kde=True)
+    plt.title(f'Histogram of {col}')
+    plt.xlabel(col)
+    plt.ylabel('Frequency')
+    plt.show()
+
+    # BOX PLOT
+    plt.figure(figsize=(3, 4))
+    sns.boxplot(x=df[col])
+    plt.title(f'Box Plot of {col}')
+    plt.xlabel(col)
+    plt.show()
+
+for col in ['BMI', 'Fever', 'ALT 36', 'ALT after 24 w', 'RNA Base', 'RNA 4',
+            'RNA 12', 'RNA EOT', 'Baseline histological Grading',
+            'Baselinehistological staging']:
+    plot_distribution(col)
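# --- Illustrative addition (not in the original notebook): a tabular
# cross-check of the outliers the box plots surfaced, using the 1.5*IQR rule
# on the same df loaded above.
numeric = df.select_dtypes(include='number')
q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
iqr = q3 - q1
outlier_counts = ((numeric < q1 - 1.5 * iqr) | (numeric > q3 + 1.5 * iqr)).sum()
print(outlier_counts.sort_values(ascending=False))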
+### DEALING WITH OUTLIERS
+from scipy.stats import zscore
+
+z_scores_ALT_24w = zscore(df['ALT after 24 w'])
+z_scores_RNA_12 = zscore(df['RNA 12'])
+print(z_scores_ALT_24w)
+print(z_scores_RNA_12)
+
+# Outlier removal: keep only rows within 3 standard deviations
+threshold = 3
+df_no_outliers_ALT_24w = df[abs(z_scores_ALT_24w) <= threshold]
+df_no_outliers_RNA_12 = df[abs(z_scores_RNA_12) <= threshold]
+
+# Re-run box plots for 'ALT after 24 w' and 'RNA 12' after handling outliers
+plt.figure(figsize=(12, 6))
+
+plt.subplot(1, 2, 1)
+sns.boxplot(x=df_no_outliers_ALT_24w['ALT after 24 w'])
+plt.title('Box Plot for ALT after 24 w (Outliers Removed)')
+
+plt.subplot(1, 2, 2)
+sns.boxplot(x=df_no_outliers_RNA_12['RNA 12'])
+plt.title('Box Plot for RNA 12 (Outliers Removed)')
+
+plt.show()
+
+# Re-run summary statistics for 'ALT after 24 w' and 'RNA 12' after handling outliers
+summary_stats_ALT_24w = df_no_outliers_ALT_24w['ALT after 24 w'].describe()
+summary_stats_RNA_12 = df_no_outliers_RNA_12['RNA 12'].describe()
+
+print("Summary Statistics for ALT after 24 w (After Outlier Removal):")
+print(summary_stats_ALT_24w)
+
+print("\nSummary Statistics for RNA 12 (After Outlier Removal):")
+print(summary_stats_RNA_12)
+
+### CORRELATION ANALYSIS
+correlation_matrix = df.corr()
+plt.figure(figsize=(16, 12))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
+plt.title("Correlation Matrix Heatmap")
+plt.show()
+
+# Conclusion: the heatmap is dominated by white/neutral cells, meaning most
+# feature pairs are only weakly correlated.
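# --- Illustrative addition (not in the original notebook): rank features by
# absolute correlation with the column used as the prediction target below.
target_corr = correlation_matrix['Baseline histological Grading'].drop('Baseline histological Grading')
print(target_corr.abs().sort_values(ascending=False).head(10))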
+# 'Baseline histological Grading' is the target variable
+target_variable = 'Baseline histological Grading'
+X = df.drop(target_variable, axis=1)
+y = df[target_variable]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+##### RANDOM FOREST #####
+clf = RandomForestClassifier(random_state=42)
+param_grid = {'n_estimators': [50, 100, 150, 500, 1000], 'max_depth': [None, 10, 20, 30, 40, 50]}
+grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
+grid_search.fit(X_train_scaled, y_train)
+
+# Get the best model from the grid search
+best_model = grid_search.best_estimator_
+
+# Make predictions on the test set
+y_pred = best_model.predict(X_test_scaled)
+
+# Evaluate the model
+accuracy = accuracy_score(y_test, y_pred)
+classification_report_str = classification_report(y_test, y_pred)
+
+# Print results
+print(f"Best Model: {best_model}")
+print(f"Accuracy on Test Set: {accuracy:.2f}")
+print("Classification Report:")
+print(classification_report_str)
+
+# Feature importance from the tuned RandomForestClassifier
+feature_importance = best_model.feature_importances_
+
+# Create a DataFrame to display feature importance scores
+feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
+
+# Sort features by importance in descending order
+feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
+
+# Display the feature importance scores
+print(feature_importance_df)
+
+##### LOGISTIC REGRESSION #####
+# Splitting the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Feature scaling
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# The parameter grid
+param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
+              'penalty': ['l2']}
+
+# Logistic Regression model (max_iter raised to ensure convergence)
+logreg_model = LogisticRegression(random_state=42, max_iter=1000)
+
+# GridSearchCV
+grid_search = GridSearchCV(logreg_model, param_grid, cv=20, scoring='accuracy')
+
+# Fitting the grid search to the data
+grid_search.fit(X_train_scaled, y_train)
+
+# Get and print the best parameters
+best_params = grid_search.best_params_
+print("Best Parameters:", best_params)
+
+# Use the best parameters to create the final model
+final_logreg_model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], random_state=42, max_iter=1000)
+final_logreg_model.fit(X_train_scaled, y_train)
+
+# Make predictions on the test set using the final model
+y_pred = final_logreg_model.predict(X_test_scaled)
+
+# Evaluate the final model
+accuracy = accuracy_score(y_test, y_pred)
+classification_report_str = classification_report(y_test, y_pred)
+
+# Print results for the final model
+print("Logistic Regression Model (After Hyperparameter Tuning)")
+print(f"Accuracy on Test Set: {accuracy:.2f}")
+print("Classification Report:")
+print(classification_report_str)
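# --- Illustrative addition (not in the original notebook): a confusion-matrix
# heatmap for the tuned logistic regression, reusing y_test/y_pred from above.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Logistic Regression Confusion Matrix')
plt.show()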
+##### KNN #####
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.feature_selection import SelectKBest, f_classif
+
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Feature scaling (important for KNN)
+scaler = MinMaxScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# Select top k features using the ANOVA F-statistic
+k_best = SelectKBest(f_classif, k=5)
+X_train_selected = k_best.fit_transform(X_train_scaled, y_train)
+X_test_selected = k_best.transform(X_test_scaled)
+
+# Perform a grid search to find the best number of neighbors
+param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
+grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
+grid_search.fit(X_train_selected, y_train)
+
+best_k = grid_search.best_params_['n_neighbors']
+
+# KNN classifier
+knn_model = KNeighborsClassifier(n_neighbors=best_k)
+knn_model.fit(X_train_selected, y_train)
+
+# Make predictions on the test set
+y_pred = knn_model.predict(X_test_selected)
+
+# Evaluate the model
+accuracy = accuracy_score(y_test, y_pred)
+classification_report_str = classification_report(y_test, y_pred)
+
+# Print results
+print(f"KNN Classifier with k={best_k}")
+print(f"Accuracy on Test Set: {accuracy:.2f}")
+print("Classification Report:")
+print(classification_report_str)
+
+##### SVM #####
+from sklearn.svm import SVC
+from sklearn.metrics import confusion_matrix
+
+# Define features (X) and target variable (y) on the outlier-filtered data
+features = df_no_outliers_ALT_24w[['ALT after 24 w', 'RNA 12']]
+target = df_no_outliers_ALT_24w['Baselinehistological staging']
+
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+
+# Standardize the features using StandardScaler
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# Create and train the SVM model
+svm_model = SVC(kernel='rbf', C=1.0)
+svm_model.fit(X_train_scaled, y_train)
+
+# Make predictions on the test set
+y_pred = svm_model.predict(X_test_scaled)
+
+# Evaluate the model
+accuracy = accuracy_score(y_test, y_pred)
+conf_matrix = confusion_matrix(y_test, y_pred)
+classification_rep = classification_report(y_test, y_pred)
+
+# Display results
+print(f'Accuracy: {accuracy:.2f}')
+print('\nConfusion Matrix:\n', conf_matrix)
+print('\nClassification Report:\n', classification_rep)
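# --- Illustrative wrap-up (not in the original notebook): compare the four
# model families under one 5-fold cross-validation protocol on X and y from
# above. The pipeline wrapper is an assumption added here so that scaling is
# refit inside each fold; the hyperparameters shown are defaults, not tuned.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='rbf', C=1.0),
    'Random Forest': RandomForestClassifier(random_state=42),
}
for name, model in models.items():
    scores = cross_val_score(make_pipeline(StandardScaler(), model), X, y, cv=5, scoring='accuracy')
    print(f"{name}: mean CV accuracy = {scores.mean():.2f} (+/- {scores.std():.2f})")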