Commit: Add files via upload
marcellin-d authored Nov 28, 2024
1 parent 61513d6 commit 2e8f3bd
Showing 14 changed files with 349 additions and 0 deletions.
Binary file added credit-card-fraud-detection-1-638.jpg
60 changes: 60 additions & 0 deletions evaluation_report.md
@@ -0,0 +1,60 @@
---

**Evaluation Report:**

**Accuracy:** 0.9434

- The model has an **overall accuracy** of 94.34%, meaning that 94.34% of the predictions were correct. This is a strong indicator of the model's overall performance.

---

**Classification Report:**

| | Precision | Recall | F1-Score | Support |
|--------------|-----------|---------|----------|---------|
| **Class 0** | 0.92 | 0.97 | 0.94 | 56463 |
| **Class 1** | 0.97 | 0.91 | 0.94 | 56839 |
| **Accuracy** | | | 0.94 | 113302 |
| **Macro avg**| 0.95 | 0.94 | 0.94 | 113302 |
| **Weighted avg** | 0.95 | 0.94 | 0.94 | 113302 |

---

### **Interpretation:**

#### **Precision:**
- **Class 0 (No fraud)**: The **precision** for Class 0 is 0.92, meaning that 92% of the predicted non-fraudulent transactions were correct. The model performs well in predicting normal transactions.
- **Class 1 (Fraud)**: The **precision** for Class 1 is 0.97, indicating that when the model flags a transaction as fraudulent, it is correct 97% of the time. In other words, the model raises very few false alarms.

#### **Recall:**
- **Class 0 (No fraud)**: The **recall** for Class 0 is 0.97, meaning that the model identified 97% of all real non-fraudulent transactions. This indicates that very few non-fraudulent transactions were missed.
- **Class 1 (Fraud)**: The **recall** for Class 1 is 0.91, meaning that the model correctly identified 91% of all fraudulent transactions. Although this is strong, there is room for improvement, as some fraudulent transactions were not detected (false negatives).

#### **F1-Score:**
- **Class 0 (No fraud)**: The **F1-score** for Class 0 is 0.94, which reflects a good balance between precision and recall for non-fraudulent transactions.
- **Class 1 (Fraud)**: The **F1-score** for Class 1 is 0.94, indicating a good balance between precision and recall for fraudulent transactions. An F1-score near 1.0 suggests the model is finding a good trade-off between detecting fraud and avoiding false positives.

#### **Average Metrics:**
- **Macro Average**: The **macro average** gives a global measure of performance by averaging the scores for each class without considering class size. Here, the **macro F1-score** is 0.94, indicating that the model performs well across both classes.
- **Weighted Average**: The **weighted average** weights each class's score by that class's share of the dataset. The **weighted F1-score** is also 0.94; because the two classes have nearly equal support in this test set (56,463 vs. 56,839), the weighted and macro averages coincide, as the short illustration below shows.
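
To make the two averages concrete, here is the arithmetic carried out on the per-class F1-scores and supports from the table above (pure arithmetic, no model needed):

```python
# Per-class F1-scores and supports, taken from the classification report
f1 = {0: 0.94, 1: 0.94}
support = {0: 56463, 1: 56839}

# Macro average: unweighted mean over the classes
macro_f1 = sum(f1.values()) / len(f1)

# Weighted average: each class's score weighted by its share of the samples
total = sum(support.values())
weighted_f1 = sum(f1[c] * support[c] / total for c in f1)

print(f"macro={macro_f1:.2f}, weighted={weighted_f1:.2f}")  # both 0.94
```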

---

**Confusion Matrix:**

```
[[55008  1455]
 [ 4955 51884]]
```

#### **Confusion Matrix Interpretation:**
- **True Negatives (TN)**: 55,008 correctly predicted non-fraudulent transactions.
- **False Positives (FP)**: 1,455 non-fraudulent transactions incorrectly flagged as fraudulent.
- **False Negatives (FN)**: 4,955 fraudulent transactions that were not detected by the model.
- **True Positives (TP)**: 51,884 correctly predicted fraudulent transactions.
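
For a quick sanity check, the reported metrics can be reproduced directly from these four counts:

```python
# Counts from the confusion matrix above
tn, fp, fn, tp = 55008, 1455, 4955, 51884

accuracy = (tp + tn) / (tp + tn + fp + fn)  # ≈ 0.9434
precision_fraud = tp / (tp + fp)            # ≈ 0.97
recall_fraud = tp / (tp + fn)               # ≈ 0.91
f1_fraud = 2 * precision_fraud * recall_fraud / (precision_fraud + recall_fraud)  # ≈ 0.94

print(f"accuracy={accuracy:.4f}, precision={precision_fraud:.2f}, "
      f"recall={recall_fraud:.2f}, f1={f1_fraud:.2f}")
```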

### **Conclusion:**

The model performs well, with **high precision** and **recall** for both classes and a balanced **F1-score** of 0.94. False positives are comparatively rare (1,455), which is important to avoid flagging legitimate transactions as fraudulent. The main area for improvement is the 4,955 false negatives: roughly 9% of fraudulent transactions go undetected. Overall, the model appears to be well-suited for fraud detection, with strong performance across the board.

---
49 changes: 49 additions & 0 deletions main.py
@@ -0,0 +1,49 @@
import os
from src.data_loader import load_data
from src.preprocess import preprocess_data
from src.model import train_model
from src.evaluate import evaluate_model
from src.utils import split_data

def main():
    """
    Main function to load, preprocess, train, evaluate, and print results.
    """
    try:
        # Define the path for the raw data
        data_path = './data/raw/creditcard.csv'

        # Check if the data file exists
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Data file not found at {data_path}. Please check the file path.")

        # Load data
        df = load_data(data_path)

        # Preprocess the data and save the processed copy
        df_processed = preprocess_data(df=df, output_path='./data/processed/creditcard_processed.csv')

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = split_data(df_processed, 'Class')

        # Train the model
        model = train_model(X_train, y_train, model_type='logistic', solver='liblinear', max_iter=1000)

        # Evaluate the model
        evaluation_results = evaluate_model(model, X_test, y_test, output_path='evaluation_report.txt')

        # Output the evaluation results
        print(f"Accuracy: {evaluation_results['accuracy']:.4f}")
        print(f"Classification Report:\n{evaluation_results['classification_report']}")
        print(f"Confusion Matrix:\n{evaluation_results['confusion_matrix']}")

    except FileNotFoundError as fnf_error:
        print(fnf_error)
    except KeyError as key_error:
        print(f"Error: {key_error}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
imbalanced_learn==0.12.3
numpy==2.1.3
pandas==2.2.3
scikit_learn==1.5.2
Binary file added src/__pycache__/data_loader.cpython-312.pyc
Binary file added src/__pycache__/evaluate.cpython-312.pyc
Binary file added src/__pycache__/model.cpython-312.pyc
Binary file added src/__pycache__/preprocess.cpython-312.pyc
Binary file added src/__pycache__/utils.cpython-312.pyc
31 changes: 31 additions & 0 deletions src/data_loader.py
@@ -0,0 +1,31 @@
import pandas as pd

def load_data(file_path, **kwargs):
    """
    Load a dataset from a specified file path.

    Parameters:
    ----------
    file_path : str
        Path to the CSV file.
    **kwargs : dict
        Additional keyword arguments for pandas.read_csv.

    Returns:
    -------
    pd.DataFrame
        Loaded dataset as a pandas DataFrame.
    """
    try:
        data = pd.read_csv(file_path, **kwargs)
        print(f"Data successfully loaded from {file_path}. Shape: {data.shape}")
        return data
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}.")
        raise
    except pd.errors.EmptyDataError:
        print(f"Error: No data found in file at {file_path}.")
        raise
    except Exception as e:
        print(f"An error occurred while loading the data: {e}")
        raise
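
A minimal usage sketch (the `nrows` argument is only an illustration of the `**kwargs` passthrough to `pandas.read_csv`):

```python
from src.data_loader import load_data

# Forward any read_csv keyword argument through **kwargs,
# e.g. load only the first 1,000 rows while exploring.
df_sample = load_data('./data/raw/creditcard.csv', nrows=1000)
print(df_sample.columns.tolist())
```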
59 changes: 59 additions & 0 deletions src/evaluate.py
@@ -0,0 +1,59 @@
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate_model(model, X_test, y_test, output_path=None):
    """
    Evaluate the model and return performance metrics.

    Parameters:
    ----------
    model : sklearn.base.BaseEstimator
        The trained model to evaluate.
    X_test : pd.DataFrame or np.ndarray
        The testing feature set.
    y_test : pd.Series or np.ndarray
        The true labels for the test set.
    output_path : str, optional
        Path to save the evaluation report. Default is None.

    Returns:
    -------
    dict
        A dictionary containing accuracy, classification report, and confusion matrix.

    Raises:
    ------
    NotFittedError
        If the model is not fitted before evaluation.
    """
    try:
        # Generate predictions
        predictions = model.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions, output_dict=True)
        conf_matrix = confusion_matrix(y_test, predictions)

        # Compile evaluation results
        evaluation_results = {
            "accuracy": acc,
            "classification_report": report,
            "confusion_matrix": conf_matrix.tolist()
        }

        # Optional: save the evaluation report to a file
        if output_path:
            with open(output_path, 'w') as f:
                f.write("Accuracy: {:.4f}\n".format(acc))
                f.write("\nClassification Report:\n")
                f.write(classification_report(y_test, predictions))
                f.write("\nConfusion Matrix:\n")
                f.write(str(conf_matrix))
            print(f"Evaluation report saved to {output_path}.")

        print(f"Model evaluation completed. Accuracy: {acc:.4f}")
        return evaluation_results

    except Exception as e:
        print(f"An error occurred during model evaluation: {e}")
        raise
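
As an illustration of consuming the returned dictionary (assuming `model`, `X_test`, and `y_test` come from the pipeline in `main.py`):

```python
from src.evaluate import evaluate_model

results = evaluate_model(model, X_test, y_test)

# With output_dict=True, the classification report is a nested dict
# keyed by the string form of each label, so per-class metrics can
# be read directly.
fraud_recall = results['classification_report']['1']['recall']
print(f"Fraud recall: {fraud_recall:.2f}")
```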
52 changes: 52 additions & 0 deletions src/model.py
@@ -0,0 +1,52 @@
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError

def train_model(X_train, y_train, model_type='logistic', **kwargs):
    """
    Train a model of the specified type.

    Parameters:
    ----------
    X_train : pd.DataFrame or np.ndarray
        The training feature set.
    y_train : pd.Series or np.ndarray
        The training target labels.
    model_type : str, optional
        The type of model to train ('logistic' or 'random_forest').
        Default is 'logistic'.
    **kwargs : dict
        Additional keyword arguments to pass to the model constructor.

    Returns:
    -------
    model : sklearn.base.BaseEstimator
        The trained model.

    Raises:
    ------
    ValueError
        If an unsupported model type is provided.
    """
    try:
        if model_type == 'logistic':
            model = LogisticRegression(**kwargs)
        elif model_type == 'random_forest':
            model = RandomForestClassifier(**kwargs)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

        # Train the model
        model.fit(X_train, y_train)
        print(f"Model of type '{model_type}' trained successfully.")
        return model

    except ValueError as ve:
        print(f"ValueError: {ve}")
        raise
    except NotFittedError as nfe:
        print(f"Model fitting failed: {nfe}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred during model training: {e}")
        raise
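
A short sketch of switching model types via the `model_type` flag; the hyperparameter values here are illustrative, not tuned:

```python
from src.model import train_model

# Logistic regression, as configured in main.py
lr_model = train_model(X_train, y_train, model_type='logistic',
                       solver='liblinear', max_iter=1000)

# Random forest alternative; n_estimators and max_depth are examples only
rf_model = train_model(X_train, y_train, model_type='random_forest',
                       n_estimators=100, max_depth=10)
```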
46 changes: 46 additions & 0 deletions src/preprocess.py
@@ -0,0 +1,46 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(df, output_path=None):
    """
    Preprocess the data, including deduplication and scaling.

    Parameters:
    ----------
    df : pd.DataFrame
        The input dataset to preprocess.
    output_path : str, optional
        Path to save the processed DataFrame as a CSV file. Default is None.

    Returns:
    -------
    pd.DataFrame
        The preprocessed DataFrame.
    """
    # Handle duplicates
    df = df.drop_duplicates()

    # Drop unnecessary columns
    if 'Time' in df.columns:
        df = df.drop("Time", axis=1)

    # Scale the 'Amount' column
    scaler = StandardScaler()
    if 'Amount' in df.columns:
        df["Amount"] = scaler.fit_transform(df[["Amount"]])
    else:
        raise KeyError("Column 'Amount' not found in the DataFrame.")

    # Note: oversampling is handled downstream in split_data (src/utils.py).

    # Save the processed data if an output path is provided
    if output_path:
        try:
            df.to_csv(output_path, index=False)
            print(f"Processed data successfully saved to {output_path}.")
        except Exception as e:
            print(f"Failed to save processed data: {e}")
            raise

    return df
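
A quick check of what the scaling step does to the `Amount` column (assuming the standard creditcard.csv layout):

```python
from src.data_loader import load_data
from src.preprocess import preprocess_data

df = load_data('./data/raw/creditcard.csv')
print(df['Amount'].mean(), df['Amount'].std())  # original scale

df_processed = preprocess_data(df)
# After StandardScaler, 'Amount' has mean ~0 and std ~1.
print(df_processed['Amount'].mean(), df_processed['Amount'].std())
```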
48 changes: 48 additions & 0 deletions src/utils.py
@@ -0,0 +1,48 @@
from sklearn.model_selection import train_test_split
import pandas as pd
from imblearn.over_sampling import SMOTE

def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42) -> tuple:
    """
    Balance the classes with SMOTE, then split the data into training and testing sets.

    Parameters:
    ----------
    df : pd.DataFrame
        The input DataFrame containing features and target.
    target_column : str
        The name of the target column to predict.
    test_size : float, optional
        The proportion of the dataset to include in the test split. Default is 0.2.
    random_state : int, optional
        The random seed for reproducibility. Default is 42.

    Returns:
    -------
    tuple
        Four objects: X_train, X_test, y_train, y_test.

    Raises:
    ------
    KeyError
        If the target column is not found in the DataFrame.
    ValueError
        If the DataFrame is empty.
    """
    if df.empty:
        raise ValueError("The input DataFrame is empty.")

    if target_column not in df.columns:
        raise KeyError(f"The target column '{target_column}' is not found in the DataFrame.")

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Oversample with SMOTE before splitting; note that this places
    # synthetic samples in the test set as well as the training set.
    X_res, y_res = SMOTE(random_state=random_state).fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=test_size, random_state=random_state)

    print(f"Data split completed: Train set - {X_train.shape[0]} samples, Test set - {X_test.shape[0]} samples.")
    return X_train, X_test, y_train, y_test
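
One caveat: because SMOTE runs before the split, synthetic samples end up in the test set, which tends to flatter the evaluation numbers. A common alternative, sketched below as a hypothetical variant (deliberately different from `split_data` above), is to split first and resample only the training fold:

```python
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

def split_then_resample(df, target_column, test_size=0.2, random_state=42):
    """Leakage-free variant: split first, then oversample the training fold only."""
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # The test set keeps the original (imbalanced) class distribution.
    X_train, y_train = SMOTE(random_state=random_state).fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test
```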
