From cd5e04d4da7fd61f4bdd9bd61eac937c88352918 Mon Sep 17 00:00:00 2001
From: Mannerow
Date: Mon, 28 Oct 2024 19:49:11 +0000
Subject: [PATCH] Added linting to precommit hooks

---
 .pre-commit-config.yaml |  8 ++++++++
 .pylintrc               |  2 +-
 src/data_preprocess.py  |  9 ++++++---
 src/hpo.py              |  2 +-
 src/monitor_metrics.py  | 24 ++++++++++++------------
 src/register_model.py   |  7 ++++---
 tests/utils_test.py     | 19 +++++--------------
 7 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 805499a..ba795dd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,3 +30,11 @@ repos:
     always_run: true
     args:
     - "tests/"
+- repo: https://github.com/PyCQA/pylint
+  rev: v3.2.6
+  hooks:
+  - id: pylint
+    name: pylint
+    entry: pylint
+    language: python
+    types: [python]
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
index f16518f..a33b34a 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,2 +1,2 @@
 [MESSAGES CONTROL]
-disable=line-too-long
\ No newline at end of file
+disable=line-too-long, wrong-import-position, invalid-name, no-value-for-parameter, logging-fstring-interpolation, unused-argument, too-many-locals
\ No newline at end of file
diff --git a/src/data_preprocess.py b/src/data_preprocess.py
index 3a2d06b..9947313 100644
--- a/src/data_preprocess.py
+++ b/src/data_preprocess.py
@@ -1,6 +1,6 @@
+"""Reads the data, performs feature engineering, and splits data into training/testing sets."""
+
 import os
-import pickle
-import zipfile
 
 import click
 import pandas as pd
@@ -13,6 +13,7 @@
 
 
 def read_dataframe(raw_data_path: str):
+    """Reads dataframe. Just uses half the data"""
     csv_file_path = os.path.join(raw_data_path, "historical_data.csv")
 
     # Load the CSV file into a pandas DataFrame
@@ -103,6 +104,7 @@ def vectorize_and_split(train_dicts, target):
 
 
 def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
+    """Handles categorical data and drops nulls"""
     # Define imputer for numerical columns
     imputer = SimpleImputer(strategy="mean")
     numerical_columns = df.select_dtypes(include=["number"]).columns
@@ -129,8 +131,8 @@ def preprocess(df: pd.DataFrame, target: str = "delivery_duration"):
     return X_train, X_val, X_test, y_train, y_val, y_test, dv
 
 
-# Downloads raw data from Kaggle to 'raw_data_path'
 def download_data(raw_data_path: str):
+    """Downloads raw data from Kaggle"""
     # Initialize the Kaggle API
     api = KaggleApi()
     api.authenticate()
@@ -159,6 +161,7 @@ def download_data(raw_data_path: str):
     help="Location where the resulting files will be saved",
 )
 def run_data_prep(raw_data_path: str, dest_path: str):
+    """Runs data prep pipeline and dumps the pickles locally."""
 
     download_data(raw_data_path=raw_data_path)
 
diff --git a/src/hpo.py b/src/hpo.py
index 4aa7610..8a99cea 100644
--- a/src/hpo.py
+++ b/src/hpo.py
@@ -14,7 +14,6 @@
 RANDOM_STATE = 42
 
 import os
-import pickle
 
 import click
 import mlflow
@@ -43,6 +42,7 @@
     help="Number of parameter evaluations for the optimizer to explore",
 )
 def run_optimization(data_path: str, num_trials: int):
+    """runs the optimizer"""
     X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
     X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
 
diff --git a/src/monitor_metrics.py b/src/monitor_metrics.py
index 8e21052..8c09d1a 100644
--- a/src/monitor_metrics.py
+++ b/src/monitor_metrics.py
@@ -1,17 +1,13 @@
-import datetime
+"""Uses Evidently and PostgresSQL to monitor and store metrics."""
+
 import logging
 import os
 import random
-import sys
-import uuid
 
-import boto3
 import click
-import joblib
 import mlflow.pyfunc
 import pandas as pd
 import psycopg
-from botocore.exceptions import ClientError
 
 # Load environment variables
 from dotenv import load_dotenv
@@ -26,7 +22,6 @@
 # Monitoring Imports
 from evidently.report import Report
 from mlflow.tracking import MlflowClient
-from scipy.sparse import csr_matrix
 
 import utils
 
@@ -52,6 +47,7 @@
 
 
 def init_mlflow():
+    """Sets MLFlow Tracking URI"""
     logging.info("Initializing MLflow...")
     load_dotenv()  # This will load all the env variables from the .env file
     # Set the MLflow tracking URI
@@ -60,6 +56,7 @@ def init_mlflow():
 
 
 def load_best_model(best_model_bucket, best_model_name, experiment_name):
+    """Loads best model from MLFlow"""
     logging.info("Loading best model from MLflow...")
     client = MlflowClient()
     # Get experiment ID from the experiment name
@@ -84,7 +81,7 @@ def load_best_model(best_model_bucket, best_model_name, experiment_name):
     logging.info(f"Logged Model = {logged_model}")
     model = mlflow.pyfunc.load_model(logged_model)
     logging.info("Best model loaded.")
-    return model, run_id
+    return model
 
 
 def setup_monitoring(
@@ -93,6 +90,7 @@
     best_model_name: str,
     experiment_name: str,
 ):
+    """Sets up monitoring, returns current_data and reference data."""
     logging.info("Setting up monitoring...")
     init_mlflow()
     # Reference Data
@@ -102,9 +100,8 @@
     )
     X_test, y_test = utils.load_pickle(os.path.join(test_data_path, "test.pkl"))
     logging.info(f"Loading the model from bucket={best_model_bucket}...")
-    model, run_id = load_best_model(best_model_bucket, best_model_name, experiment_name)
+    model = load_best_model(best_model_bucket, best_model_name, experiment_name)
     logging.info("Applying the model...")
-    # X_test = X_test.fillna(0)
     y_pred = model.predict(X_test)
     y_val_pred = model.predict(X_val)
 
@@ -157,6 +154,7 @@
 
 
 def prep_db():
+    """Makes connection to PostgresSQL"""
     logging.info("Preparing database...")
     try:
         # Connect to PostgreSQL and create the database and table if they do not exist
@@ -176,19 +174,20 @@
         raise
 
 
-# Function to generate daily timestamps between two dates
 def generate_daily_timestamps(start_date, end_date):
+    """generate daily timestamps between two dates"""
     return pd.date_range(start=start_date, end=end_date, freq="D")
 
 
-# Function to filter data by day
 def filter_data_by_day(data, day):
+    """filter data by day"""
     start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
     end_time = start_time + pd.Timedelta(days=1)
     return data[(data["created_at"] >= start_time) & (data["created_at"] < end_time)]
 
 
 def calculate_metrics_postgresql(current_data, reference_data, report, column_mapping):
+    """Calculates and logs metrics"""
     logging.info("Calculating metrics and storing in PostgreSQL...")
 
     # Ensure date columns exist in the data and convert to datetime
@@ -275,6 +274,7 @@ def run(
     best_model_name: str,
     experiment_name: str,
 ):
+    """Runs the flow"""
     try:
         logging.info("Starting the run process...")
         prep_db()
diff --git a/src/register_model.py b/src/register_model.py
index 697cf6d..59e0900 100644
--- a/src/register_model.py
+++ b/src/register_model.py
@@ -1,7 +1,7 @@
+"""Analyzes models based on performance metrics, then registers best model to MLFlow"""
+
 import os
-import pickle
 
-import boto3
 import click
 import mlflow
 from dotenv import load_dotenv
@@ -24,6 +24,7 @@
 
 
 def train_and_log_model(data_path, params):
+    """Trains models and logs to MLFlow"""
     X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
     X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
     X_test, y_test = utils.load_pickle(os.path.join(data_path, "test.pkl"))
@@ -70,7 +71,7 @@ def train_and_log_model(data_path, params):
     help="Number of top models that need to be evaluated to decide which one to promote",
 )
 def run_register_model(data_path: str, top_n: int):
-
+    """Loads best model and registers it."""
     client = MlflowClient()
 
     # Retrieve the top_n model runs and log the models/
diff --git a/tests/utils_test.py b/tests/utils_test.py
index bff1a24..443c4b0 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -8,10 +8,9 @@
 
 from src.utils import decode_dataframe, dump_pickle, load_pickle
 
-"""Tests load_pickle function using a mock."""
-
 
 def test_load_pickle():
+    """Tests load_pickle function using a mock."""
     # Simulates the content in the pickle file
     expected_data = {"key": "value"}
 
@@ -31,10 +30,8 @@ def test_load_pickle():
     assert result == expected_data
 
 
-"""Tests dump_pickle function"""
-
-
 def test_dump_pickle():
+    """Tests dump_pickle function"""
     # The data that we expect to be saved into the pickle file
     data_to_save = {"key": "value"}
 
@@ -55,10 +52,8 @@ def test_dump_pickle():
     assert mock_pickle_dump.called
 
 
-"""Tests decode dataframe"""
-
-
 def test_decode_dataframe_dense():
+    """Tests decode dataframe"""
     # Sample dense DataFrame
     data = {
         "market_id": [1, 2],
@@ -107,10 +102,8 @@ def test_decode_dataframe_dense():
     pd.testing.assert_frame_equal(result, expected_df)
 
 
-"""Tests sparse matrix"""
-
-
 def test_decode_dataframe_sparse():
+    """Tests sparse matrix"""
     # Sample sparse matrix and feature names
     sparse_data = csr_matrix([[1, 0, 3, 1200], [0, 1, 2, 2400]])
     feature_names = [
@@ -158,10 +151,8 @@ def test_decode_dataframe_sparse():
     pd.testing.assert_frame_equal(result, expected_df)
 
 
-"""Tests that any missing columns from DF are added to output and filled with default values"""
-
-
 def test_decode_dataframe_missing_column():
+    """Tests that any missing columns from DF are added to output and filled with default values"""
    # Sample DataFrame missing 'total_items' and other columns
     data = {
         "market_id": [1],
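
Usage sketch (assuming pre-commit itself is already installed in the local environment; these commands are standard pre-commit CLI calls, not part of the patch): the new hook can be wired in and exercised against the whole tree with

    pre-commit install
    pre-commit run pylint --all-files

The first command installs the git pre-commit hook; the second runs only the hook with id "pylint" from .pre-commit-config.yaml on every tracked file rather than just staged changes, which is a quick way to confirm the .pylintrc disables above cover the remaining warnings.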