diff --git a/docs/changelog-news.qmd b/docs/changelog-news.qmd index c585b8e0..bf83a1a3 100644 --- a/docs/changelog-news.qmd +++ b/docs/changelog-news.qmd @@ -10,7 +10,9 @@ number-sections: false ## Improvements: -- `.augment_lags()` and `.augment_leads()`: value_column only accepts numeric dtype. Now accepts any dtype. #295 + +- Implement `sort_dataframe()`: This function is used internally to make sure Polars and Pandas engines perform grouped operations consistently and correctly. #286 #290 +- `.augment_lags()` and `.augment_leads()`: value_column now accepts any dtype. #295 # pytimetk 0.4.0 diff --git a/src/pytimetk/__init__.py b/src/pytimetk/__init__.py index 61558ef0..02ae5b3d 100644 --- a/src/pytimetk/__init__.py +++ b/src/pytimetk/__init__.py @@ -187,7 +187,7 @@ reduce_memory_usage ) from .utils.pandas_helpers import ( - flatten_multiindex_column_names, glimpse, drop_zero_variance, transform_columns + flatten_multiindex_column_names, glimpse, drop_zero_variance, transform_columns, sort_dataframe ) from .utils.parallel_helpers import ( parallel_apply, progress_apply diff --git a/src/pytimetk/core/anomalize.py b/src/pytimetk/core/anomalize.py index 4f480fad..90e910d4 100644 --- a/src/pytimetk/core/anomalize.py +++ b/src/pytimetk/core/anomalize.py @@ -6,7 +6,9 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.core.frequency import get_frequency, get_seasonal_frequency, get_trend_frequency + from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe from pytimetk.utils.parallel_helpers import parallel_apply, get_threads, progress_apply @@ -282,6 +284,8 @@ def anomalize( if reduce_memory: data = reduce_memory_usage(data) + + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) if isinstance(data, pd.DataFrame): result = _anomalize( @@ -354,6 +358,9 @@ def anomalize( if reduce_memory: result = 
reduce_memory_usage(result) + result.index = idx_unsorted + result = result.sort_index() + return result # Monkey patch the method to pandas groupby objects diff --git a/src/pytimetk/feature_engineering/diffs.py b/src/pytimetk/feature_engineering/diffs.py index e591e313..cf5b3668 100644 --- a/src/pytimetk/feature_engineering/diffs.py +++ b/src/pytimetk/feature_engineering/diffs.py @@ -6,6 +6,7 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_diffs( @@ -130,6 +131,8 @@ def augment_diffs( if reduce_memory: data = reduce_memory_usage(data) + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) + if engine == 'pandas': ret = _augment_diffs_pandas(data, date_column, value_column, periods, normalize=normalize) elif engine == 'polars': @@ -140,6 +143,9 @@ def augment_diffs( if reduce_memory: ret = reduce_memory_usage(ret) + ret.index = idx_unsorted + ret = ret.sort_index() + return ret # Monkey patch the method to pandas groupby objects diff --git a/src/pytimetk/feature_engineering/ewm.py b/src/pytimetk/feature_engineering/ewm.py index 6a7e5d67..7367969e 100644 --- a/src/pytimetk/feature_engineering/ewm.py +++ b/src/pytimetk/feature_engineering/ewm.py @@ -106,13 +106,9 @@ def augment_ewm( display(ewm_df) ``` """ - # Ensure data is a DataFrame or a GroupBy object + # Checks check_dataframe_or_groupby(data) - - # Ensure date column exists and is properly formatted check_date_column(data, date_column) - - # Ensure value column(s) exist check_value_column(data, value_column) # Convert string value column to list for consistency diff --git a/src/pytimetk/feature_engineering/expanding.py b/src/pytimetk/feature_engineering/expanding.py index 4bbf5a5b..892b9ed4 100644 --- a/src/pytimetk/feature_engineering/expanding.py +++ 
b/src/pytimetk/feature_engineering/expanding.py @@ -14,6 +14,7 @@ from pytimetk.utils.parallel_helpers import conditional_tqdm, get_threads from pytimetk.utils.polars_helpers import update_dict from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_expanding( @@ -214,6 +215,8 @@ def augment_expanding( if reduce_memory: data = reduce_memory_usage(data) + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) + # Convert string value column to list for consistency if isinstance(value_column, str): value_column = [value_column] @@ -257,6 +260,9 @@ def augment_expanding( if reduce_memory: ret = reduce_memory_usage(ret) + ret.index = idx_unsorted + ret = ret.sort_index() + return ret diff --git a/src/pytimetk/feature_engineering/fourier.py b/src/pytimetk/feature_engineering/fourier.py index 1cfada95..339a237f 100644 --- a/src/pytimetk/feature_engineering/fourier.py +++ b/src/pytimetk/feature_engineering/fourier.py @@ -9,6 +9,7 @@ from pytimetk.core.ts_summary import ts_summary from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method @@ -112,6 +113,9 @@ def augment_fourier( check_dataframe_or_groupby(data) check_date_column(data, date_column) + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) + + if isinstance(periods, int): periods = [periods] elif isinstance(periods, tuple): @@ -133,6 +137,9 @@ def augment_fourier( if reduce_memory: ret = reduce_memory_usage(ret) + + ret.index = idx_unsorted + ret = ret.sort_index() return ret diff --git a/src/pytimetk/feature_engineering/hilbert.py b/src/pytimetk/feature_engineering/hilbert.py index 512cbfb2..ed2eb3b9 100644 --- a/src/pytimetk/feature_engineering/hilbert.py +++ b/src/pytimetk/feature_engineering/hilbert.py @@ -9,6 +9,7 @@ from pytimetk.utils.checks import 
check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.utils.polars_helpers import pandas_to_polars_frequency, pandas_to_polars_aggregation_mapping from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_hilbert( @@ -157,6 +158,9 @@ def augment_hilbert( if reduce_memory: data = reduce_memory_usage(data) + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) + + if engine == 'pandas': ret = _augment_hilbert_pandas(data, date_column, value_column) elif engine == 'polars': @@ -167,6 +171,9 @@ def augment_hilbert( if reduce_memory: ret = reduce_memory_usage(ret) + ret.index = idx_unsorted + ret = ret.sort_index() + return ret diff --git a/src/pytimetk/feature_engineering/holiday_signature.py b/src/pytimetk/feature_engineering/holiday_signature.py index aea581b7..767abc85 100644 --- a/src/pytimetk/feature_engineering/holiday_signature.py +++ b/src/pytimetk/feature_engineering/holiday_signature.py @@ -14,6 +14,7 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_series_or_datetime, check_installed from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method @@ -204,6 +205,8 @@ def augment_holiday_signature( if reduce_memory: data = reduce_memory_usage(data) + + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) if engine == 'pandas': ret = _augment_holiday_signature_pandas(data, date_column, country_name) @@ -215,6 +218,9 @@ def augment_holiday_signature( if reduce_memory: ret = reduce_memory_usage(ret) + ret.index = idx_unsorted + ret = ret.sort_index() + return ret # Monkey patch the method to pandas groupby objects diff --git a/src/pytimetk/feature_engineering/lags.py b/src/pytimetk/feature_engineering/lags.py index dbc220c5..2f14ca4f 100644 --- 
a/src/pytimetk/feature_engineering/lags.py +++ b/src/pytimetk/feature_engineering/lags.py @@ -6,6 +6,7 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_lags( @@ -123,6 +124,8 @@ def augment_lags( if reduce_memory: data = reduce_memory_usage(data) + + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) if engine == 'pandas': ret = _augment_lags_pandas(data, date_column, value_column, lags) @@ -133,6 +136,9 @@ def augment_lags( if reduce_memory: ret = reduce_memory_usage(ret) + + ret.index = idx_unsorted + ret = ret.sort_index() return ret diff --git a/src/pytimetk/feature_engineering/leads.py b/src/pytimetk/feature_engineering/leads.py index 3aed443c..8c8ab465 100644 --- a/src/pytimetk/feature_engineering/leads.py +++ b/src/pytimetk/feature_engineering/leads.py @@ -6,6 +6,7 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_leads( @@ -124,6 +125,8 @@ def augment_leads( if reduce_memory: data = reduce_memory_usage(data) + + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) if engine == 'pandas': ret = _augment_leads_pandas(data, date_column, value_column, leads) @@ -134,6 +137,9 @@ def augment_leads( if reduce_memory: ret = reduce_memory_usage(ret) + + ret.index = idx_unsorted + ret = ret.sort_index() return ret diff --git a/src/pytimetk/feature_engineering/rolling.py b/src/pytimetk/feature_engineering/rolling.py index b25c21b2..dc595957 100644 --- a/src/pytimetk/feature_engineering/rolling.py +++ b/src/pytimetk/feature_engineering/rolling.py @@ -14,6 +14,7 @@ from 
pytimetk.utils.parallel_helpers import conditional_tqdm, get_threads from pytimetk.utils.polars_helpers import update_dict from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe @pf.register_dataframe_method def augment_rolling( @@ -201,6 +202,8 @@ def augment_rolling( if reduce_memory: data = reduce_memory_usage(data) + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) + # Convert string value column to list for consistency if isinstance(value_column, str): value_column = [value_column] @@ -252,6 +255,9 @@ def augment_rolling( if reduce_memory: ret = reduce_memory_usage(ret) + + ret.index = idx_unsorted + ret = ret.sort_index() return ret diff --git a/src/pytimetk/feature_engineering/wavelet.py b/src/pytimetk/feature_engineering/wavelet.py index 44aa23f2..35eb2cfc 100644 --- a/src/pytimetk/feature_engineering/wavelet.py +++ b/src/pytimetk/feature_engineering/wavelet.py @@ -9,6 +9,8 @@ from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column from pytimetk.utils.polars_helpers import pandas_to_polars_frequency, pandas_to_polars_aggregation_mapping from pytimetk.utils.memory_helpers import reduce_memory_usage +from pytimetk.utils.pandas_helpers import sort_dataframe + #@pf.register_dataframe_method @@ -197,6 +199,8 @@ def augment_wavelet( if reduce_memory: data = reduce_memory_usage(data) + + data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True) wavelet_functions = { 'morlet': morlet_wavelet, @@ -242,6 +246,9 @@ def _apply_cwt(df): if reduce_memory: ret = reduce_memory_usage(ret) + + ret.index = idx_unsorted + ret = ret.sort_index() return ret diff --git a/src/pytimetk/utils/checks.py b/src/pytimetk/utils/checks.py index 781cbf54..772c8a7d 100644 --- a/src/pytimetk/utils/checks.py +++ b/src/pytimetk/utils/checks.py @@ -125,4 +125,7 @@ def check_installed(package_name: str): # data = data.groupby(group_names) 
# return data + + + \ No newline at end of file diff --git a/src/pytimetk/utils/pandas_helpers.py b/src/pytimetk/utils/pandas_helpers.py index 617f40eb..1f452881 100644 --- a/src/pytimetk/utils/pandas_helpers.py +++ b/src/pytimetk/utils/pandas_helpers.py @@ -112,6 +112,70 @@ def make_lalign_formatter(df, cols=None): return None + +@pf.register_dataframe_method +def sort_dataframe( + data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy], + date_column: str, + keep_grouped_df: bool = True, +) -> Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]: + '''The function `sort_dataframe` sorts a DataFrame by a specified date column, handling both regular + DataFrames and grouped DataFrames. + + Parameters + ---------- + data : Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy] + The `data` parameter in the `sort_dataframe` function can accept either a pandas DataFrame or a + grouped DataFrame (DataFrameGroupBy object). + date_column + The `date_column` parameter in the `sort_dataframe` method is used to specify the column in the + DataFrame by which the sorting will be performed. This column contains dates that will be used as + the basis for sorting the DataFrame or DataFrameGroupBy object. + keep_grouped_df + If `True` and `data` is a grouped data frame, a grouped data frame will be returned. If `False`, an ungrouped data frame is returned. + + Returns + ------- + A tuple of two elements: the sorted data and the index of the rows after sorting. A regular + DataFrame is sorted by the specified date column; a grouped DataFrame (DataFrameGroupBy object) is + sorted by the group names and the date column (and is returned re-grouped when `keep_grouped_df` is + `True`). The returned index can be used to restore the original row order. 
+ + Examples + -------- + ```{python} + import pytimetk as tk + import pandas as pd + + df = tk.load_dataset('walmart_sales_weekly', parse_dates=['Date']) + + df.sort_dataframe('Date') + + df.groupby('id').sort_dataframe('Date')[0].obj + + df.groupby(['id', 'Store', 'Dept']).sort_dataframe('Date')[0].obj + ``` + + ''' + + group_names = None + if isinstance(data, pd.DataFrame): + df = data.copy() + df.sort_values(by=[date_column], inplace=True) + index_after_sort = df.index + + if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy): + group_names = data.grouper.names + df = data.obj.copy() + df.sort_values(by=[*group_names, date_column], inplace=True) + index_after_sort = df.index + if keep_grouped_df: + df = df.groupby(group_names) + + return df, index_after_sort + +pd.core.groupby.generic.DataFrameGroupBy.sort_dataframe = sort_dataframe + @pf.register_dataframe_method def drop_zero_variance(data: pd.DataFrame, ): '''The function `drop_zero_variance` takes a pandas DataFrame as input and returns a new DataFrame with