
Commit

Added linting to pre-commit hooks
Mannerow committed Oct 28, 2024
1 parent 10387c9 commit cd5e04d
Showing 7 changed files with 37 additions and 34 deletions.
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -30,3 +30,11 @@ repos:
        always_run: true
        args:
          - "tests/"
  - repo: https://github.com/PyCQA/pylint
    rev: v3.2.6
    hooks:
      - id: pylint
        name: pylint
        entry: pylint
        language: python
        types: [python]
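With this hook in place, pylint runs alongside the existing hooks on every commit. Assuming the standard pre-commit workflow, the hooks would typically be activated once with "pre-commit install" and exercised on demand with "pre-commit run --all-files" (or "pre-commit run pylint --all-files" to run only the new hook).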
2 changes: 1 addition & 1 deletion .pylintrc
@@ -1,2 +1,2 @@
[MESSAGES CONTROL]
disable=line-too-long
disable=line-too-long, wrong-import-position, invalid-name, no-value-for-parameter, logging-fstring-interpolation, unused-argument, too-many-locals
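These suppressions apply project-wide. A minimal sketch of the per-line alternative (not part of this commit), assuming pylint's standard inline-disable comment and using one of the messages listed above:

import logging

# logging-fstring-interpolation (W1203) is raised for f-strings passed to logging calls;
# instead of the global disable above, it can be silenced on the offending line only:
logging.info(f"Logged Model = {'example'}")  # pylint: disable=logging-fstring-interpolation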
9 changes: 6 additions & 3 deletions src/data_preprocess.py
@@ -1,6 +1,6 @@
"""Reads the data, performs feature engineering, and splits data into training/testing sets."""

import os
import pickle
import zipfile

import click
import pandas as pd
@@ -13,6 +13,7 @@


def read_dataframe(raw_data_path: str):
    """Reads dataframe. Just uses half the data"""
    csv_file_path = os.path.join(raw_data_path, "historical_data.csv")

    # Load the CSV file into a pandas DataFrame
@@ -103,6 +104,7 @@ def vectorize_and_split(train_dicts, target):


def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Handles categorical data and drops nulls"""
    # Define imputer for numerical columns
    imputer = SimpleImputer(strategy="mean")
    numerical_columns = df.select_dtypes(include=["number"]).columns
@@ -129,8 +131,8 @@ def preprocess(df: pd.DataFrame, target: str = "delivery_duration"):
    return X_train, X_val, X_test, y_train, y_val, y_test, dv


# Downloads raw data from Kaggle to 'raw_data_path'
def download_data(raw_data_path: str):
    """Downloads raw data from Kaggle"""
    # Initialize the Kaggle API
    api = KaggleApi()
    api.authenticate()
@@ -159,6 +161,7 @@ def download_data(raw_data_path: str):
    help="Location where the resulting files will be saved",
)
def run_data_prep(raw_data_path: str, dest_path: str):
    """Runs data prep pipeline and dumps the pickles locally."""

    download_data(raw_data_path=raw_data_path)

2 changes: 1 addition & 1 deletion src/hpo.py
@@ -14,7 +14,6 @@
RANDOM_STATE = 42

import os
import pickle

import click
import mlflow
@@ -43,6 +42,7 @@
    help="Number of parameter evaluations for the optimizer to explore",
)
def run_optimization(data_path: str, num_trials: int):
    """Runs the optimizer."""

    X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
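The hpo.py hunk above shows module-level code (RANDOM_STATE = 42) sitting above the imports, which is presumably why wrong-import-position appears in the .pylintrc disable list. A minimal illustrative sketch of the message it suppresses (not taken from the commit):

RANDOM_STATE = 42

import os  # pylint would flag this line with wrong-import-position (C0413)
print(os.getcwd())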
24 changes: 12 additions & 12 deletions src/monitor_metrics.py
@@ -1,17 +1,13 @@
import datetime
"""Uses Evidently and PostgresSQL to monitor and store metrics."""

import logging
import os
import random
import sys
import uuid

import boto3
import click
import joblib
import mlflow.pyfunc
import pandas as pd
import psycopg
from botocore.exceptions import ClientError

# Load environment variables
from dotenv import load_dotenv
@@ -26,7 +22,6 @@
# Monitoring Imports
from evidently.report import Report
from mlflow.tracking import MlflowClient
from scipy.sparse import csr_matrix

import utils

@@ -52,6 +47,7 @@


def init_mlflow():
    """Sets MLflow Tracking URI"""
    logging.info("Initializing MLflow...")
    load_dotenv()  # This will load all the env variables from the .env file
    # Set the MLflow tracking URI
@@ -60,6 +56,7 @@ def init_mlflow():


def load_best_model(best_model_bucket, best_model_name, experiment_name):
    """Loads best model from MLflow"""
    logging.info("Loading best model from MLflow...")
    client = MlflowClient()
    # Get experiment ID from the experiment name
@@ -84,7 +81,7 @@ def load_best_model(best_model_bucket, best_model_name, experiment_name):
    logging.info(f"Logged Model = {logged_model}")
    model = mlflow.pyfunc.load_model(logged_model)
    logging.info("Best model loaded.")
    return model, run_id
    return model


def setup_monitoring(
@@ -93,6 +90,7 @@ def setup_monitoring(
    best_model_name: str,
    experiment_name: str,
):
    """Sets up monitoring, returns current_data and reference data."""
    logging.info("Setting up monitoring...")
    init_mlflow()
    # Reference Data
@@ -102,9 +100,8 @@
    )
    X_test, y_test = utils.load_pickle(os.path.join(test_data_path, "test.pkl"))
    logging.info(f"Loading the model from bucket={best_model_bucket}...")
    model, run_id = load_best_model(best_model_bucket, best_model_name, experiment_name)
    model = load_best_model(best_model_bucket, best_model_name, experiment_name)
    logging.info("Applying the model...")
    # X_test = X_test.fillna(0)

    y_pred = model.predict(X_test)
    y_val_pred = model.predict(X_val)
@@ -157,6 +154,7 @@


def prep_db():
    """Makes a connection to PostgreSQL"""
    logging.info("Preparing database...")
    try:
        # Connect to PostgreSQL and create the database and table if they do not exist
@@ -176,19 +174,20 @@ def prep_db():
        raise


# Function to generate daily timestamps between two dates
def generate_daily_timestamps(start_date, end_date):
    """Generates daily timestamps between two dates"""
    return pd.date_range(start=start_date, end=end_date, freq="D")


# Function to filter data by day
def filter_data_by_day(data, day):
    """Filters data by day"""
    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
    end_time = start_time + pd.Timedelta(days=1)
    return data[(data["created_at"] >= start_time) & (data["created_at"] < end_time)]


def calculate_metrics_postgresql(current_data, reference_data, report, column_mapping):
    """Calculates and logs metrics"""
    logging.info("Calculating metrics and storing in PostgreSQL...")

    # Ensure date columns exist in the data and convert to datetime
@@ -275,6 +274,7 @@ def run(
    best_model_name: str,
    experiment_name: str,
):
    """Runs the flow"""
    try:
        logging.info("Starting the run process...")
        prep_db()
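The load_best_model change above (returning model instead of model, run_id) fits pylint's unused-variable check, since the run_id value was unpacked but never used at the call site. A minimal sketch of the warning it avoids (assumed motivation, not stated in the commit; function names are illustrative):

def load_best_model():
    return "model", "run_id"


def setup_monitoring_example():
    model, run_id = load_best_model()  # pylint flags 'run_id' as unused-variable (W0612)
    print(model)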
7 changes: 4 additions & 3 deletions src/register_model.py
@@ -1,7 +1,7 @@
"""Analyzes models based on performance metrics, then registers best model to MLFlow"""

import os
import pickle

import boto3
import click
import mlflow
from dotenv import load_dotenv
@@ -24,6 +24,7 @@


def train_and_log_model(data_path, params):
    """Trains models and logs to MLflow"""
    X_train, y_train = utils.load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = utils.load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = utils.load_pickle(os.path.join(data_path, "test.pkl"))
@@ -70,7 +71,7 @@ def train_and_log_model(data_path, params):
    help="Number of top models that need to be evaluated to decide which one to promote",
)
def run_register_model(data_path: str, top_n: int):

    """Loads best model and registers it."""
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models/
19 changes: 5 additions & 14 deletions tests/utils_test.py
@@ -8,10 +8,9 @@

from src.utils import decode_dataframe, dump_pickle, load_pickle

"""Tests load_pickle function using a mock."""


def test_load_pickle():
    """Tests load_pickle function using a mock."""
    # Simulates the content in the pickle file
    expected_data = {"key": "value"}

@@ -31,10 +30,8 @@ def test_load_pickle():
    assert result == expected_data


"""Tests dump_pickle function"""


def test_dump_pickle():
"""Tests dump_pickle function"""
# The data that we expect to be saved into the pickle file
data_to_save = {"key": "value"}

@@ -55,10 +52,8 @@ def test_dump_pickle():
    assert mock_pickle_dump.called


"""Tests decode dataframe"""


def test_decode_dataframe_dense():
"""Tests decode dataframe"""
# Sample dense DataFrame
data = {
"market_id": [1, 2],
@@ -107,10 +102,8 @@ def test_decode_dataframe_dense():
    pd.testing.assert_frame_equal(result, expected_df)


"""Tests sparse matrix"""


def test_decode_dataframe_sparse():
"""Tests sparse matrix"""
# Sample sparse matrix and feature names
sparse_data = csr_matrix([[1, 0, 3, 1200], [0, 1, 2, 2400]])
feature_names = [
@@ -158,10 +151,8 @@ def test_decode_dataframe_sparse():
    pd.testing.assert_frame_equal(result, expected_df)


"""Tests that any missing columns from DF are added to output and filled with default values"""


def test_decode_dataframe_missing_column():
"""Tests that any missing columns from DF are added to output and filled with default values"""
# Sample DataFrame missing 'total_items' and other columns
data = {
"market_id": [1],
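The utils_test.py hunks above all apply the same fix: string literals that sat between functions are moved inside the functions as docstrings. A minimal illustrative sketch of why pylint cares (not copied verbatim from the diff; names and the placeholder import are assumptions), assuming its default checks:

import pickle  # any prior statement means the string below is no longer the module docstring

"""Tests load_pickle function using a mock."""  # flagged as pointless-string-statement (W0105)


def test_load_pickle():  # would also be flagged as missing-function-docstring (C0116)
    ...


# Moved inside the function, the same text becomes a real docstring and both messages go away.
def test_load_pickle_fixed():
    """Tests load_pickle function using a mock."""
    ...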
