Skip to content

Commit

Permalink
adapt steps for prediction with data.flow
Browse files (browse the repository at this point in the history)
  • Loading branch information
diegomarvid committed Apr 18, 2024
1 parent f5f580d commit 7d2cff1
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 11 deletions.
6 changes: 5 additions & 1 deletion pipeline_lib/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
self.logger.info("Calculating features")

data.train = self._create_datetime_features(data.train, log=True)
if not data.is_train:
data.flow = self._create_datetime_features(data.flow, log=True)

if data.train is not None:
data.train = self._create_datetime_features(data.train, log=True)

if data.validation is not None:
data.validation = self._create_datetime_features(data.validation)
Expand Down
14 changes: 8 additions & 6 deletions pipeline_lib/core/steps/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ def __init__(
def execute(self, data: DataContainer) -> DataContainer:
self.logger.info("Cleaning tabular data...")

df_train = self._clean_df(data.train)
data.train = df_train
if not data.is_train:
data.flow = self._clean_df(data.flow)
return data

if data.train is not None:
data.train = self._clean_df(data.train)

if data.validation is not None and self.apply_validation:
df_validation = self._clean_df(data.validation)
data.validation = df_validation
data.validation = self._clean_df(data.validation)

if data.test is not None and self.apply_test:
df_test = self._clean_df(data.test)
data.test = df_test
data.test = self._clean_df(data.test)

return data

Expand Down
24 changes: 20 additions & 4 deletions pipeline_lib/core/steps/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,20 @@ def execute(self, data: DataContainer) -> DataContainer:
"""Execute the encoding step."""
self.logger.info("Encoding data")

if not data.target:
raise ValueError("Target column not found in any parameter before encoding.")
if not data.is_train:
categorical_features, numeric_features = self._get_feature_types(data.flow, data.target)
data.flow, _ = self._apply_encoding(
data.flow,
data.target,
categorical_features,
numeric_features,
saved_encoder=data._encoder,
log=True,
)
return data

target_column_name = data.target

categorical_features, numeric_features = self._get_feature_types(
data.train, target_column_name
)
Expand All @@ -53,6 +63,7 @@ def execute(self, data: DataContainer) -> DataContainer:
categorical_features,
numeric_features,
fit_encoders=True,
log=True,
)

if data.validation is not None:
Expand Down Expand Up @@ -83,6 +94,7 @@ def _apply_encoding(
numeric_features: List[str],
fit_encoders: Optional[bool] = False,
saved_encoder: Optional[ColumnTransformer] = None,
log: Optional[bool] = False,
) -> Tuple[pd.DataFrame, Optional[ColumnTransformer]]:
"""Apply the encoding to the data."""
if not fit_encoders and not saved_encoder:
Expand Down Expand Up @@ -127,7 +139,7 @@ def _apply_encoding(
encoded_data, feature_encoder_map
)

if fit_encoders:
if log:
self._log_feature_info(
categorical_features,
numeric_features,
Expand All @@ -139,9 +151,12 @@ def _apply_encoding(
return encoded_data, encoder

def _get_feature_types(
self, df: pd.DataFrame, target_column_name: str
self, df: pd.DataFrame, target_column_name: Optional[str] = None
) -> Tuple[List[str], List[str]]:
"""Get categorical and numeric feature lists."""
if target_column_name is None:
target_column_name = ""

categorical_features = [
col for col in df.columns if df[col].dtype == "object" and col != target_column_name
]
Expand All @@ -150,6 +165,7 @@ def _get_feature_types(
for col in df.columns
if col not in categorical_features and col != target_column_name
]

return categorical_features, numeric_features

def _split_categorical_features(
Expand Down
6 changes: 6 additions & 0 deletions pipeline_lib/core/steps/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,14 @@ def execute(self, data: DataContainer) -> DataContainer:

data.raw = df
data.flow = df

data.target = self.target

# In prediction mode the input DataFrame may not contain the target column; clear data.target when it is absent
if not data.is_train:
if self.target not in df.columns:
data.target = None

self.logger.info(f"Generated DataFrame with shape: {df.shape}")

return data
Expand Down

0 comments on commit 7d2cff1

Please sign in to comment.