Address PR comments
Ludecan committed May 2, 2024
1 parent 93f77d8 commit abaa307
Showing 3 changed files with 22 additions and 15 deletions.
4 changes: 0 additions & 4 deletions pipeline_lib/core/pipeline.py
@@ -3,7 +3,6 @@
 import json
 import logging
 import os
-import pprint
 import time
 from datetime import datetime
 from typing import Any, Optional
@@ -59,9 +58,6 @@ def run(self, is_train: bool, save: bool = True) -> DataContainer:
             self.logger.info("Predicting with the pipeline")
 
         self.data.is_train = is_train
-
-        pprint.pprint(self.data.data)
-
         for i, step in enumerate(steps_to_run):
             start_time = time.time()
             log_str = f"Running {step.__class__.__name__} - {i + 1} / {len(steps_to_run)}"
31 changes: 22 additions & 9 deletions pipeline_lib/core/steps/generate.py
@@ -69,20 +69,29 @@ def execute(self, data: DataContainer) -> DataContainer:
             # For example if an integer column has no NA values in the train set but has a NA on
             # the test set, we need to include it in the schema inference so that the column is
             # assigned a float dtype, instead of int, so that NA values are properly handled
-            opt_df = pd.concat([df, data.test]) if data.test is not None else df
+            # Handle the target column separately, since the prediction df won't have a target
+            opt_X = pd.concat([df, data.test]) if data.test is not None else df
+            opt_y = opt_X[[data.target]]
+            opt_X = opt_X.drop(columns=[data.target])
+
             if self.predict_path:
-                opt_df = pd.concat([df, self._load_data_from_file(self.predict_path)])
+                predict_df = self._load_data_from_file(self.predict_path)
+                if data.target in predict_df.columns:
+                    opt_y = pd.concat([opt_y, predict_df[[data.target]]])
+                    opt_X = pd.concat([opt_X, predict_df.drop(columns=[data.target])])
+                else:
+                    opt_X = pd.concat([opt_X, predict_df])
 
             if self.drop_columns is not None:
-                opt_df.drop(columns=self.drop_columns, inplace=True)
+                opt_X.drop(columns=self.drop_columns, inplace=True)
 
             if self.optimize_dtypes:
-                apply_all_dtype_conversions(
-                    df=opt_df, skip_cols=set(self.optimize_dtypes_skip_cols)
-                )
+                apply_all_dtype_conversions(df=opt_X, skip_cols=set(self.optimize_dtypes_skip_cols))
+                apply_all_dtype_conversions(df=opt_y, skip_cols=set(self.optimize_dtypes_skip_cols))
 
             # Save the schema for future use in predictions
-            data._generate_step_dtypes = opt_df.dtypes.to_dict()
+            data._generate_step_dtypes = opt_X.dtypes.to_dict()
+            data._generate_step_dtypes.update(opt_y.dtypes.to_dict())
             if self.train_path.endswith(".csv") or self.optimize_dtypes:
                 # Log the inferred schema for csvs or if we optimized dtypes
                 self.logger.info(
@@ -91,9 +100,13 @@
 
             # Re-split the optimized df into train/test, discard prediction since we're doing
             # training for now
-            df = opt_df.iloc[0 : len(df)]
+            i_max_row = len(df) + len(data.test) if data.test is not None else len(df)
+            opt_X = opt_X.iloc[:i_max_row, :]
+            opt_y = opt_y.iloc[:i_max_row, :]
+            opt_X = pd.concat([opt_X, opt_y], axis=1)
+            df = opt_X.iloc[0 : len(df)]
             if data.test is not None:
-                data.test = opt_df.iloc[len(df) :]
+                data.test = opt_X.iloc[len(df) :]
         else:
             # Apply the schema saved during training to the DataFrame
             for key, value in data._generate_step_dtypes.items():
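For context on the two pandas behaviors the generate.py hunks lean on, here is a minimal standalone sketch (not code from this repository; the train/test frames and the "a"/"target" column names are illustrative). It shows why the schema inference runs over the concatenated splits, and why the positional iloc re-split afterwards is safe.

import pandas as pd

# An integer column with no NAs in train infers as int64, but the same
# column with an NA in the test split promotes to float64. Inferring the
# schema on the concatenated frame yields a dtype that fits every split.
train = pd.DataFrame({"a": [1, 2, 3], "target": [0, 1, 0]})
test = pd.DataFrame({"a": [4, None], "target": [1, 0]})
print(train["a"].dtype)  # int64
print(test["a"].dtype)   # float64

combined = pd.concat([train, test])
print(combined["a"].dtype)  # float64

# pd.concat preserves row order, so the first len(train) rows are train and
# the rest are test; the iloc-based re-split above relies on exactly this.
train_again = combined.iloc[0 : len(train)]
test_again = combined.iloc[len(train) :]
assert len(train_again) == 3 and len(test_again) == 2
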
2 changes: 0 additions & 2 deletions pipeline_lib/implementation/tabular/xgboost/model.py
@@ -1,5 +1,3 @@
-from typing import Any
-
 import pandas as pd
 import xgboost as xgb
 
