Skip to content

Commit

Permalink
update calculate features
Browse files Browse the repository at this point in the history
  • Loading branch information
diegomarvid committed Apr 18, 2024
1 parent e98b389 commit eb6113e
Showing 1 changed file with 44 additions and 17 deletions.
61 changes: 44 additions & 17 deletions pipeline_lib/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,31 @@ def __init__(
f"Unsupported datetime features: {unsupported_features}"
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> None:
if self.datetime_columns and not self.features:
raise ValueError(
"No datetime features specified. Must specify at least one feature. Possible"
f" features: {list(self.feature_extractors.keys())}"
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Parse ``df[column]`` as datetimes unless it already has a datetime dtype.

    The conversion is best-effort: when parsing fails the error is logged and
    the column is left as it was, so the caller decides what to do next.

    Args:
        df: DataFrame holding the column. On success the column is replaced
            on this same object.
        column: Name of the column to convert. It must exist in ``df``.

    Returns:
        The DataFrame that was passed in.
    """
    # Nothing to do when the column already carries a datetime dtype.
    if is_datetime64_any_dtype(df[column]):
        self.logger.debug(f"Column '{column}' is already a datetime type.")
        return df

    try:
        df[column] = pd.to_datetime(
            df[column],
            errors="raise",
        )
        self.logger.info(f"Column '{column}' automatically converted to datetime.")
    except ValueError as e:
        # Unparseable values: report and keep the original column.
        self.logger.error(f"Error converting column '{column}' to datetime: {e}")
    except Exception as e:
        # Anything else is reported separately so it can be told apart in logs.
        self.logger.error(f"Unexpected error converting column '{column}' to datetime: {e}")

    return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
"""Extract a single feature from a datetime column."""
extractor = self.feature_extractors[feature]
Expand All @@ -73,33 +86,47 @@ def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
self.logger.info("Calculating features")

df = data.flow
data.train = self._create_datetime_features(data.train, log=True)

if data.validation is not None:
data.validation = self._create_datetime_features(data.validation)

if data.test is not None:
data.test = self._create_datetime_features(data.test)

return data

def _create_datetime_features(
    self, df: pd.DataFrame, log: Optional[bool] = False
) -> pd.DataFrame:
    """Derive the configured features from each datetime column, then drop the column.

    For every name in ``self.datetime_columns`` that exists in ``df``, the
    column is passed through ``self._convert_column_to_datetime`` and each
    entry of ``self.features`` is extracted with ``self._extract_feature``.
    Configured columns that are missing from ``df`` are skipped.

    Args:
        df: DataFrame to add the features to.
        log: When true, also emit the warnings and the "dropped columns"
            message. The final "Created new features" message is always
            logged.

    Returns:
        The DataFrame with the source datetime columns removed, or ``df``
        itself when no datetime columns are configured.
    """
    created_features = []

    if self.datetime_columns:
        for column in self.datetime_columns:
            if column not in df.columns:
                if log:
                    self.logger.warning(
                        f"Datetime column '{column}' not found in the DataFrame"
                    )
                continue

            df = self._convert_column_to_datetime(df, column)

            if self.features:
                for feature in self.features:
                    self._extract_feature(df, column, feature)
                    created_features.append(f"{column}_{feature}")
            elif log:
                self.logger.warning(
                    "No datetime features specified. Skipping feature extraction."
                )

        # Drop only the source columns that are really there: a configured
        # column missing from this DataFrame was skipped above and must not
        # make drop() raise KeyError.
        dropped_columns = [c for c in self.datetime_columns if c in df.columns]
        df = df.drop(columns=dropped_columns)
        if log:
            self.logger.info(f"Dropped datetime columns: {dropped_columns}")
    elif log:
        self.logger.warning("No datetime columns specified. Skipping feature extraction.")

    self.logger.info(f"Created new features: {created_features}")

    return df

0 comments on commit eb6113e

Please sign in to comment.