Skip to content

Commit

Permalink
Update reports step to be used in training
Browse files: browse the repository at this point in the history
  • Loading branch information
diegomarvid committed May 14, 2024
1 parent 7f77bef commit 7fecab4
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions pipeline_lib/core/steps/calculate_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
class CalculateReportsStep(PipelineStep):
"""Calculate reports."""

used_for_prediction = True
used_for_training = False
used_for_prediction = False
used_for_training = True

def __init__(self, max_samples: int = 1000) -> None:
"""Initialize CalculateReportsStep."""
Expand All @@ -28,7 +28,16 @@ def execute(self, data: DataContainer) -> DataContainer:
if model is None:
raise ValueError("Model not found in data container.")

df = data.flow
df = (
data.test
if data.test is not None
else data.validation if data.validation is not None else None
)
if df is None:
raise ValueError(
"Both test and validation are None. A validation or test set is required."
)

if len(df) > self.max_samples:
# Randomly sample a subset of data points if the dataset is larger than max_samples
self.logger.info(
Expand All @@ -38,10 +47,7 @@ def execute(self, data: DataContainer) -> DataContainer:
self.logger.info(f"Sampling {self.max_samples} data points from the dataset.")
df = df.sample(n=self.max_samples, random_state=42)

drop_columns = (
data._drop_columns + ["predictions"] if data._drop_columns else ["predictions"]
)
df = df.drop(columns=drop_columns)
df = df.drop(columns=data._drop_columns)
X = df.drop(columns=[data.target])

# Calculate SHAP values with progress tracking and logging
Expand Down

0 comments on commit 7fecab4

Please sign in to comment.