adapt steps for prediction with data.flow

tryolabs · Apr 18, 2024 · 7d2cff1
1 parent f5f580d
commit 7d2cff1
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 11 deletions.
diff --git a/pipeline_lib/core/steps/calculate_features.py b/pipeline_lib/core/steps/calculate_features.py
@@ -86,7 +86,11 @@ def execute(self, data: DataContainer) -> DataContainer:
         """Execute the step."""
         self.logger.info("Calculating features")
 
-        data.train = self._create_datetime_features(data.train, log=True)
+        if not data.is_train:
+            data.flow = self._create_datetime_features(data.flow, log=True)
+
+        if data.train is not None:
+            data.train = self._create_datetime_features(data.train, log=True)
 
         if data.validation is not None:
             data.validation = self._create_datetime_features(data.validation)

diff --git a/pipeline_lib/core/steps/clean.py b/pipeline_lib/core/steps/clean.py
@@ -32,16 +32,18 @@ def __init__(
     def execute(self, data: DataContainer) -> DataContainer:
         self.logger.info("Cleaning tabular data...")
 
-        df_train = self._clean_df(data.train)
-        data.train = df_train
+        if not data.is_train:
+            data.flow = self._clean_df(data.flow)
+            return data
+
+        if data.train is not None:
+            data.train = self._clean_df(data.train)
 
         if data.validation is not None and self.apply_validation:
-            df_validation = self._clean_df(data.validation)
-            data.validation = df_validation
+            data.validation = self._clean_df(data.validation)
 
         if data.test is not None and self.apply_test:
-            df_test = self._clean_df(data.test)
-            data.test = df_test
+            data.test = self._clean_df(data.test)
 
         return data
 

diff --git a/pipeline_lib/core/steps/encode.py b/pipeline_lib/core/steps/encode.py
@@ -39,10 +39,20 @@ def execute(self, data: DataContainer) -> DataContainer:
         """Execute the encoding step."""
         self.logger.info("Encoding data")
 
-        if not data.target:
-            raise ValueError("Target column not found in any parameter before encoding.")
+        if not data.is_train:
+            categorical_features, numeric_features = self._get_feature_types(data.flow, data.target)
+            data.flow, _ = self._apply_encoding(
+                data.flow,
+                data.target,
+                categorical_features,
+                numeric_features,
+                saved_encoder=data._encoder,
+                log=True,
+            )
+            return data
 
         target_column_name = data.target
+
         categorical_features, numeric_features = self._get_feature_types(
             data.train, target_column_name
         )
@@ -53,6 +63,7 @@ def execute(self, data: DataContainer) -> DataContainer:
             categorical_features,
             numeric_features,
             fit_encoders=True,
+            log=True,
         )
 
         if data.validation is not None:
@@ -83,6 +94,7 @@ def _apply_encoding(
         numeric_features: List[str],
         fit_encoders: Optional[bool] = False,
         saved_encoder: Optional[ColumnTransformer] = None,
+        log: Optional[bool] = False,
     ) -> Tuple[pd.DataFrame, Optional[ColumnTransformer]]:
         """Apply the encoding to the data."""
         if not fit_encoders and not saved_encoder:
@@ -127,7 +139,7 @@ def _apply_encoding(
             encoded_data, feature_encoder_map
         )
 
-        if fit_encoders:
+        if log:
             self._log_feature_info(
                 categorical_features,
                 numeric_features,
@@ -139,9 +151,12 @@ def _apply_encoding(
         return encoded_data, encoder
 
     def _get_feature_types(
-        self, df: pd.DataFrame, target_column_name: str
+        self, df: pd.DataFrame, target_column_name: Optional[str] = None
     ) -> Tuple[List[str], List[str]]:
         """Get categorical and numeric feature lists."""
+        if target_column_name is None:
+            target_column_name = ""
+
         categorical_features = [
             col for col in df.columns if df[col].dtype == "object" and col != target_column_name
         ]
@@ -150,6 +165,7 @@ def _get_feature_types(
             for col in df.columns
             if col not in categorical_features and col != target_column_name
         ]
+
         return categorical_features, numeric_features
 
     def _split_categorical_features(

diff --git a/pipeline_lib/core/steps/generate.py b/pipeline_lib/core/steps/generate.py
@@ -58,8 +58,14 @@ def execute(self, data: DataContainer) -> DataContainer:
 
         data.raw = df
         data.flow = df
+
         data.target = self.target
 
+        # In prediction mode the input may lack the target column; clear data.target if so
+        if not data.is_train:
+            if self.target not in df.columns:
+                data.target = None
+
         self.logger.info(f"Generated DataFrame with shape: {df.shape}")
 
         return data