From cd5e04d4da7fd61f4bdd9bd61eac937c88352918 Mon Sep 17 00:00:00 2001
From: Mannerow
Date: Mon, 28 Oct 2024 19:49:11 +0000
Subject: [PATCH] Added linting to precommit hooks

---
 .pre-commit-config.yaml |  8 ++++++++
 .pylintrc               |  2 +-
 src/data_preprocess.py  |  9 ++++++---
 src/hpo.py              |  2 +-
 src/monitor_metrics.py  | 24 ++++++++++++------------
 src/register_model.py   |  7 ++++---
 tests/utils_test.py     | 19 +++++--------------
 7 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 805499a..ba795dd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,3 +30,11 @@ repos:
     always_run: true
     args:
     - "tests/"
+- repo: https://github.com/PyCQA/pylint
+  rev: v3.2.6
+  hooks:
+  - id: pylint
+    name: pylint
+    entry: pylint
+    language: python
+    types: [python]
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
index f16518f..a33b34a 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,2 +1,2 @@
 [MESSAGES CONTROL]
-disable=line-too-long
\ No newline at end of file
+disable=line-too-long, wrong-import-position, invalid-name, no-value-for-parameter, logging-fstring-interpolation, unused-argument, too-many-locals
\ No newline at end of file
diff --git a/src/data_preprocess.py b/src/data_preprocess.py
index 3a2d06b..9947313 100644
--- a/src/data_preprocess.py
+++ b/src/data_preprocess.py
@@ -1,6 +1,6 @@
+"""Reads the data, performs feature engineering, and splits data into training/testing sets."""
+
 import os
-import pickle
-import zipfile
 
 import click
 import pandas as pd
@@ -13,6 +13,7 @@
 
 
 def read_dataframe(raw_data_path: str):
+    """Reads dataframe. Just uses half the data"""
     csv_file_path = os.path.join(raw_data_path, "historical_data.csv")
 
     # Load the CSV file into a pandas DataFrame
@@ -103,6 +104,7 @@ def vectorize_and_split(train_dicts, target):
 
 
 def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
+    """Handles categorical data and drops nulls"""
     # Define imputer for numerical columns
     imputer = SimpleImputer(strategy="mean")
     numerical_columns = df.select_dtypes(include=["number"]).columns
@@ -129,8 +131,8 @@ def preprocess(df: pd.DataFrame, target: str = "delivery_duration"):
     return X_train, X_val, X_test, y_train, y_val, y_test, dv
 
 
-# Downloads raw data from Kaggle to 'raw_data_path'
 def download_data(raw_data_path: str):
+    """Downloads raw data from Kaggle"""
     # Initialize the Kaggle API
     api = KaggleApi()
     api.authenticate()
@@ -159,6 +161,7 @@ def download_data(raw_data_path: str):
     help="Location where the resulting files will be saved",
 )
 def run_data_prep(raw_data_path: str, dest_path: str):
+    """Runs data prep pipeline and dumps the pickles locally."""
 
     download_data(raw_data_path=raw_data_path)
 
diff --git a/src/hpo.py b/src/hpo.py
index 4aa7610..8a99cea 100644
--- a/src/hpo.py
+++ b/src/hpo.py
@@ -14,7 +14,6 @@
 RANDOM_STATE = 42
 
 import os
-import pickle
 
 import click
 import mlflow
@@ -43,6 +42,7 @@
     help="Number of parameter evaluations for the optimizer to explore",
 )
 def run_optimization(data_path: str, num_trials: int):
+    """runs the optimizer"""
     X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
     X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
 
diff --git a/src/monitor_metrics.py b/src/monitor_metrics.py
index 8e21052..8c09d1a 100644
--- a/src/monitor_metrics.py
+++ b/src/monitor_metrics.py
@@ -1,17 +1,13 @@
-import datetime
+"""Uses Evidently and PostgresSQL to monitor and store metrics."""
+
 import logging
 import os
 import random
-import sys
-import uuid
 
-import boto3
 import click
-import joblib
 import mlflow.pyfunc
 import pandas as pd
 import psycopg
-from botocore.exceptions import ClientError
 
 # Load environment variables
 from dotenv import load_dotenv
@@ -26,7 +22,6 @@
 # Monitoring Imports
 from evidently.report import Report
 from mlflow.tracking import MlflowClient
-from scipy.sparse import csr_matrix
 
 import utils
 
@@ -52,6 +47,7 @@
 
 
 def init_mlflow():
+    """Sets MLFlow Tracking URI"""
     logging.info("Initializing MLflow...")
     load_dotenv()  # This will load all the env variables from the .env file
     # Set the MLflow tracking URI
@@ -60,6 +56,7 @@ def init_mlflow():
 
 
 def load_best_model(best_model_bucket, best_model_name, experiment_name):
+    """Loads best model from MLFlow"""
     logging.info("Loading best model from MLflow...")
     client = MlflowClient()
     # Get experiment ID from the experiment name
@@ -84,7 +81,7 @@ def load_best_model(best_model_bucket, best_model_name, experiment_name):
     logging.info(f"Logged Model = {logged_model}")
     model = mlflow.pyfunc.load_model(logged_model)
     logging.info("Best model loaded.")
-    return model, run_id
+    return model
 
 
 def setup_monitoring(
@@ -93,6 +90,7 @@
     best_model_name: str,
     experiment_name: str,
 ):
+    """Sets up monitoring, returns current_data and reference data."""
     logging.info("Setting up monitoring...")
     init_mlflow()
     # Reference Data
@@ -102,9 +100,8 @@
     )
     X_test, y_test = utils.load_pickle(os.path.join(test_data_path, "test.pkl"))
     logging.info(f"Loading the model from bucket={best_model_bucket}...")
-    model, run_id = load_best_model(best_model_bucket, best_model_name, experiment_name)
+    model = load_best_model(best_model_bucket, best_model_name, experiment_name)
     logging.info("Applying the model...")
-    # X_test = X_test.fillna(0)
     y_pred = model.predict(X_test)
     y_val_pred = model.predict(X_val)
 
@@ -157,6 +154,7 @@
 
 
 def prep_db():
+    """Makes connection to PostgresSQL"""
     logging.info("Preparing database...")
     try:
         # Connect to PostgreSQL and create the database and table if they do not exist
@@ -176,19 +174,20 @@
         raise
 
 
-# Function to generate daily timestamps between two dates
 def generate_daily_timestamps(start_date, end_date):
+    """generate daily timestamps between two dates"""
     return pd.date_range(start=start_date, end=end_date, freq="D")
 
 
-# Function to filter data by day
 def filter_data_by_day(data, day):
+    """filter data by day"""
     start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
     end_time = start_time + pd.Timedelta(days=1)
     return data[(data["created_at"] >= start_time) & (data["created_at"] < end_time)]
 
 
 def calculate_metrics_postgresql(current_data, reference_data, report, column_mapping):
+    """Calculates and logs metrics"""
     logging.info("Calculating metrics and storing in PostgreSQL...")
 
     # Ensure date columns exist in the data and convert to datetime
@@ -275,6 +274,7 @@ def run(
     best_model_name: str,
     experiment_name: str,
 ):
+    """Runs the flow"""
     try:
         logging.info("Starting the run process...")
         prep_db()
diff --git a/src/register_model.py b/src/register_model.py
index 697cf6d..59e0900 100644
--- a/src/register_model.py
+++ b/src/register_model.py
@@ -1,7 +1,7 @@
+"""Analyzes models based on performance metrics, then registers best model to MLFlow"""
+
 import os
-import pickle
 
-import boto3
 import click
 import mlflow
 from dotenv import load_dotenv
@@ -24,6 +24,7 @@
 
 
 def train_and_log_model(data_path, params):
+    """Trains models and logs to MLFlow"""
     X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
     X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
     X_test, y_test = utils.load_pickle(os.path.join(data_path, "test.pkl"))
@@ -70,7 +71,7 @@ def train_and_log_model(data_path, params):
     help="Number of top models that need to be evaluated to decide which one to promote",
 )
 def run_register_model(data_path: str, top_n: int):
-
+    """Loads best model and registers it."""
     client = MlflowClient()
 
     # Retrieve the top_n model runs and log the models/
diff --git a/tests/utils_test.py b/tests/utils_test.py
index bff1a24..443c4b0 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -8,10 +8,9 @@
 
 from src.utils import decode_dataframe, dump_pickle, load_pickle
 
-"""Tests load_pickle function using a mock."""
-
 
 def test_load_pickle():
+    """Tests load_pickle function using a mock."""
     # Simulates the content in the pickle file
     expected_data = {"key": "value"}
 
@@ -31,10 +30,8 @@ def test_load_pickle():
     assert result == expected_data
 
 
-"""Tests dump_pickle function"""
-
-
 def test_dump_pickle():
+    """Tests dump_pickle function"""
     # The data that we expect to be saved into the pickle file
     data_to_save = {"key": "value"}
 
@@ -55,10 +52,8 @@ def test_dump_pickle():
     assert mock_pickle_dump.called
 
 
-"""Tests decode dataframe"""
-
-
 def test_decode_dataframe_dense():
+    """Tests decode dataframe"""
     # Sample dense DataFrame
     data = {
         "market_id": [1, 2],
@@ -107,10 +102,8 @@ def test_decode_dataframe_dense():
     pd.testing.assert_frame_equal(result, expected_df)
 
 
-"""Tests sparse matrix"""
-
-
 def test_decode_dataframe_sparse():
+    """Tests sparse matrix"""
     # Sample sparse matrix and feature names
     sparse_data = csr_matrix([[1, 0, 3, 1200], [0, 1, 2, 2400]])
     feature_names = [
@@ -158,10 +151,8 @@ def test_decode_dataframe_sparse():
     pd.testing.assert_frame_equal(result, expected_df)
 
 
-"""Tests that any missing columns from DF are added to output and filled with default values"""
-
-
 def test_decode_dataframe_missing_column():
+    """Tests that any missing columns from DF are added to output and filled with default values"""
    # Sample DataFrame missing 'total_items' and other columns
     data = {
         "market_id": [1],
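
Usage sketch (assuming pre-commit itself is already installed in the local environment; these commands are standard pre-commit CLI calls, not part of the patch): the new hook can be wired in and exercised against the whole tree with

    pre-commit install
    pre-commit run pylint --all-files

The first command installs the git pre-commit hook; the second runs only the hook with id "pylint" from .pre-commit-config.yaml on every tracked file rather than just staged changes, which is a quick way to confirm the .pylintrc disables above cover the remaining warnings.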