diff --git a/oups/store/writer.py b/oups/store/writer.py index 6871185..45d7ba8 100644 --- a/oups/store/writer.py +++ b/oups/store/writer.py @@ -223,6 +223,45 @@ def to_midx(idx: Index, levels: List[str] = None) -> MultiIndex: return MultiIndex.from_tuples(tuples, names=levels) +def check_cmidx(chunk): + """ + Check cmidx to have it complying with fastparquet limitations. + + Most notably, fastparquet requires names for each level in a Multiindex. + If these are not set, there are set to '', an empty string. + + Multiindex is modified in-place. + + Parameters + ---------- + chunk : DataFrame + DataFrame which multi-index is to check. + + Returns + ------- + None + + """ + cmidx = chunk.columns + # If an item of the column name is not a string, turn it into a string. + # Using 'set_levels()' instead of rconstructing a MultiIndex to preserve + # index names directly. + level_updated_idx = [] + level_updated = [] + for i, level in enumerate(cmidx.levels): + str_level = [name if isinstance(name, str) else str(name) for name in level] + if level.to_list() != str_level: + level_updated_idx.append(i) + level_updated.append(str_level) + if level_updated: + chunk.columns = chunk.columns.set_levels(level_updated, level=level_updated_idx) + # If a name is 'None', set it to '' instead. + cmidx = chunk.columns + for i, name in enumerate(cmidx.names): + if name is None: + cmidx.set_names("", level=i, inplace=True) + + def write_metadata( pf: ParquetFile, metadata: Dict[str, str] = None, @@ -563,6 +602,10 @@ def write( chunk = next(iter_data) if cmidx_expand: chunk.columns = to_midx(chunk.columns, cmidx_levels) + # In case multi-index is used, check that it complies with fastparquet + # limitations. + if isinstance(chunk.columns, MultiIndex): + check_cmidx(chunk) fp_write( dirpath, chunk,