Skip to content

Commit

Permalink
Implementing and testing 'DataOverlapInfo()' dataclass.
Browse files Browse the repository at this point in the history
  • Loading branch information
yohplala committed Nov 18, 2024
1 parent a5c9417 commit 1d2da90
Show file tree
Hide file tree
Showing 7 changed files with 758 additions and 167 deletions.
6 changes: 4 additions & 2 deletions .cursorrules
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ You are an expert in data analysis and a genius at reasoning, with a focus on Py
- Follow proper **naming conventions**.
- Use snake_case for variable and definition names or CamelCase for class names.
- Follow PEP 8 style guidelines for Python code.
- End all sentences of comments or docstrings with a period.
- Code style consistency using Ruff.
- Use Python type hints for all function signatures.
- Add return types when possible.
- Handle errors at the beginning of functions with early returns.
- Use guard clauses and avoid deeply nested if statements.
- When using a function with more than one parameter, always name the following parameters (e.g., `enumerate([1,2,3], start=1)` and not `enumerate([1,2,3], 1)`).

### Performance Optimization
- Prefer vectorized operations over explicit loops for better performance.
Expand All @@ -45,8 +45,10 @@ You are an expert in data analysis and a genius at reasoning, with a focus on Py
- Add descriptive docstrings to all python functions and classes, using PEP 257 and numpydoc convention.
- Update existing docstrings if need be.
- Comments must describe purpose, not effect.
- Comments should be more detailed for complex logic
- Comments should be more detailed for complex logic.
- Make sure you keep any comments that exist in a file.
- Add a blank line at the end of each docstring (between the last line of comment and the """).
- End all sentences of comments or docstrings with a period.

### Error Handling
- Start error message with lower case letter.
Expand Down
5 changes: 4 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,18 @@ repos:
--statistics,
--ignore=C101 RST210 S101 W503,
--per-file-ignores=
tests/*.py:D103
oups/__init__.py:F401
oups/store/__init__.py:F401
oups/store/writer.py:C901 S403 S301
oups/store/iter_data.py:C901
oups/aggstream/__init__.py:F401
oups/aggstream/jcumsegagg.py:C901
oups/aggstream/segmentby.py:C901
oups/aggstream/cumsegagg.py:C901 E203
oups/aggstream/aggstream.py:C901
tests/*.py:D103
tests/test_store/test_data_overlap.py:F401 F811
tests/test_store/test_iter_data.py:F401 F811
]

# Pydocstyle
Expand Down
125 changes: 125 additions & 0 deletions oups/store/data_overlap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""
Created on Thu Nov 14 18:00:00 2024.
@author: yoh
"""
from dataclasses import dataclass
from typing import Optional

import numpy as np
from fastparquet import ParquetFile
from pandas import DataFrame


@dataclass
class DataOverlapInfo:
    """
    Information about how DataFrame and ParquetFile overlap.

    Parameters
    ----------
    df_idx_rg_starts : ndarray
        For each row group, insertion index (side 'left') of the row group
        minimum in the 'ordered_on' column of the DataFrame.
    df_idx_rg_ends : ndarray
        For each row group, insertion index (side 'right') of the row group
        maximum in the 'ordered_on' column of the DataFrame.
    df_idx_overlap_start : Optional[int]
        Index of first overlapping row in DataFrame, ``None`` if no overlap.
    df_idx_overlap_end_excl : Optional[int]
        Index of the row after the last overlapping row in DataFrame,
        ``None`` if no overlap.
    rg_idx_overlap_start : Optional[int]
        Index of first overlapping row group, ``None`` if no overlap.
    rg_idx_overlap_end : Optional[int]
        Index of last overlapping row group (inclusive), ``None`` if no
        overlap.
    has_pf_head : bool
        True if ParquetFile has row groups before DataFrame overlap.
    has_df_head : bool
        True if DataFrame has at least ``max_row_group_size`` rows before the
        minimum of the first row group.
    has_overlap : bool
        True if DataFrame and ParquetFile have overlapping data.
    has_pf_tail : bool
        True if ParquetFile has row groups after DataFrame overlap.
    has_df_tail : bool
        True if DataFrame has rows after ParquetFile overlap.

    """

    df_idx_rg_starts: np.ndarray
    df_idx_rg_ends: np.ndarray
    df_idx_overlap_start: Optional[int]
    df_idx_overlap_end_excl: Optional[int]
    rg_idx_overlap_start: Optional[int]
    rg_idx_overlap_end: Optional[int]
    has_pf_head: bool
    has_df_head: bool
    has_overlap: bool
    has_pf_tail: bool
    has_df_tail: bool

    @classmethod
    def analyze(
        cls,
        df: DataFrame,
        pf: "ParquetFile",
        ordered_on: str,
        max_row_group_size: int,
    ) -> "DataOverlapInfo":
        """
        Analyze how DataFrame and ParquetFile data overlap.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame. Assumed to be sorted on 'ordered_on' column, as
            required by the binary searches performed on it.
        pf : ParquetFile
            Input ParquetFile. Only ``pf.statistics['min'][ordered_on]`` and
            ``pf.statistics['max'][ordered_on]`` are read.
        ordered_on : str
            Column name by which data is ordered.
        max_row_group_size : int
            Maximum number of rows per chunk. Used as threshold to decide if
            rows of DataFrame ahead of first row group form a head.

        Returns
        -------
        DataOverlapInfo
            Instance containing overlap analysis information. Scalar fields
            are Python builtins (``bool``, ``int`` or ``None``).

        """
        # Locate each row group's [min, max] span within the DataFrame.
        ordered_values = df.loc[:, ordered_on]
        rg_mins = pf.statistics["min"][ordered_on]
        rg_maxs = pf.statistics["max"][ordered_on]
        df_idx_rg_starts = np.searchsorted(ordered_values, rg_mins, side="left")
        df_idx_rg_ends = np.searchsorted(ordered_values, rg_maxs, side="right")

        # First row group whose max is not below the first DataFrame value,
        # and first row group reaching the largest end index in DataFrame.
        rg_idx_overlap_start = int(df_idx_rg_ends.astype(bool).argmax())
        rg_idx_overlap_end = int(df_idx_rg_ends.argmax())

        # No DataFrame value is lower than or equal to the max of the last
        # row group: the whole ParquetFile lies ahead of the DataFrame.
        df_is_after_pf = bool(df_idx_rg_ends[-1] == 0)

        # Analyze head and tail patterns.
        has_pf_head = rg_idx_overlap_start > 0 or df_is_after_pf
        has_df_head = bool(df_idx_rg_starts[0] >= max_row_group_size)
        has_pf_tail = rg_idx_overlap_end + 1 < len(rg_mins) and not df_is_after_pf
        has_df_tail = bool(df_idx_rg_ends[rg_idx_overlap_end] < len(df))

        # Overlap exists when the DataFrame slice spanned by the candidate
        # row groups is not empty. Comparing row group indices instead would
        # miss an overlap confined to a single row group, for which start
        # and (inclusive) end indices are equal.
        df_idx_overlap_start = int(df_idx_rg_starts[rg_idx_overlap_start])
        df_idx_overlap_end_excl = int(df_idx_rg_ends[rg_idx_overlap_end])
        has_overlap = df_idx_overlap_start < df_idx_overlap_end_excl
        if not has_overlap:
            rg_idx_overlap_start = None
            rg_idx_overlap_end = None
            df_idx_overlap_start = None
            df_idx_overlap_end_excl = None

        return cls(
            has_pf_head=has_pf_head,
            has_df_head=has_df_head,
            has_overlap=has_overlap,
            has_pf_tail=has_pf_tail,
            has_df_tail=has_df_tail,
            df_idx_overlap_start=df_idx_overlap_start,
            df_idx_overlap_end_excl=df_idx_overlap_end_excl,
            rg_idx_overlap_start=rg_idx_overlap_start,
            rg_idx_overlap_end=rg_idx_overlap_end,
            df_idx_rg_starts=df_idx_rg_starts,
            df_idx_rg_ends=df_idx_rg_ends,
        )
Loading

0 comments on commit 1d2da90

Please sign in to comment.