Implementing and testing 'DataOverlapInfo()' dataclass.

yohplala · Nov 18, 2024 · 1d2da90 · 1d2da90
1 parent a5c9417
commit 1d2da90
Show file tree

Hide file tree

Showing 7 changed files with 758 additions and 167 deletions.
diff --git a/.cursorrules b/.cursorrules
@@ -23,12 +23,12 @@ You are an expert in data analysis and a genius at reasoning, with a focus on Py
 - Follow proper **naming conventions**.
 - Use snake_case for variable and definition names or CamelCase for class names.
 - Follow PEP 8 style guidelines for Python code.
-- End all sentences of comments or docstrings with a period.
 - Code style consistency using Ruff.
 - Use Python type hints for all function signatures.
 - Add return types when possible.
 - Handle errors at the beginning of functions with early returns.
 - Use guard clauses and avoid deeply nested if statements.
+- When using a function with more than one parameter, always name the following parameters (e.g., `enumerate([1,2,3], start=1)` and not `enumerate([1,2,3], 1)`).
 
 ### Performance Optimization
 - Prefer vectorized operations over explicit loops for better performance.
@@ -45,8 +45,10 @@ You are an expert in data analysis and a genius at reasoning, with a focus on Py
 - Add descriptive docstrings to all python functions and classes, using PEP 257 and numpydoc convention.
 - Update existing docstrings if need be.
 - Comments must describe purpose, not effect.
-- Comments should be more detailed for complex logic
+- Comments should be more detailed for complex logic.
 - Make sure you keep any comments that exist in a file.
+- Add a blank line at the end of each docstring (between the last line of comment and the """).
+- End all sentences of comments or docstrings with a period.
 
 ### Error Handling
 - Start error message with lower case letter.

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -85,15 +85,18 @@ repos:
             --statistics,
             --ignore=C101 RST210 S101 W503,
             --per-file-ignores=
-                tests/*.py:D103
                 oups/__init__.py:F401
                 oups/store/__init__.py:F401
                 oups/store/writer.py:C901 S403 S301
+                oups/store/iter_data.py:C901
                 oups/aggstream/__init__.py:F401
                 oups/aggstream/jcumsegagg.py:C901
                 oups/aggstream/segmentby.py:C901
                 oups/aggstream/cumsegagg.py:C901 E203
                 oups/aggstream/aggstream.py:C901
+                tests/*.py:D103
+                tests/test_store/test_data_overlap.py:F401 F811
+                tests/test_store/test_iter_data.py:F401 F811
         ]
 
   # Pydocstyle

diff --git a/oups/store/data_overlap.py b/oups/store/data_overlap.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Created on Thu Nov 14 18:00:00 2024.
+
+@author: yoh
+
+"""
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+from fastparquet import ParquetFile
+from pandas import DataFrame
+
+
+@dataclass
+class DataOverlapInfo:
+    """
+    Information about how DataFrame and ParquetFile overlap.
+
+    Instances are normally built with ``DataOverlapInfo.analyze()``, which
+    derives every field from the ``ordered_on`` column of the DataFrame and
+    from the per-row-group min/max statistics of the ParquetFile.
+
+    Parameters
+    ----------
+    df_idx_rg_starts : ndarray
+        Indices where each row group starts in DataFrame.
+    df_idx_rg_ends : ndarray
+        Indices where each row group ends in DataFrame.
+    df_idx_overlap_start : Optional[int]
+        Index of first overlapping row in DataFrame, if any.
+    df_idx_overlap_end_excl : Optional[int]
+        Index of the row after the last overlapping row in DataFrame, if any.
+    rg_idx_overlap_start : Optional[int]
+        Index of first overlapping row group, if any.
+    rg_idx_overlap_end : Optional[int]
+        Index of last overlapping row group, if any.
+    has_pf_head : bool
+        True if ParquetFile has row groups before DataFrame overlap.
+    has_df_head : bool
+        True if DataFrame has sufficient rows before ParquetFile overlap.
+    has_overlap : bool
+        True if DataFrame and ParquetFile have overlapping data.
+    has_pf_tail : bool
+        True if ParquetFile has row groups after DataFrame overlap.
+    has_df_tail : bool
+        True if DataFrame has rows after ParquetFile overlap.
+
+    Notes
+    -----
+    The four ``*_idx_overlap_*`` fields are all set to ``None`` by
+    ``analyze()`` when ``has_overlap`` is False.
+
+    """
+
+    # Fields are listed in declaration order, which is also the positional
+    # order of the generated ``__init__``.
+    df_idx_rg_starts: np.ndarray
+    df_idx_rg_ends: np.ndarray
+    df_idx_overlap_start: Optional[int]
+    df_idx_overlap_end_excl: Optional[int]
+    rg_idx_overlap_start: Optional[int]
+    rg_idx_overlap_end: Optional[int]
+    has_pf_head: bool
+    has_df_head: bool
+    has_overlap: bool
+    has_pf_tail: bool
+    has_df_tail: bool
+
+    @classmethod
+    def analyze(
+        cls,
+        df: DataFrame,
+        pf: ParquetFile,
+        ordered_on: str,
+        max_row_group_size: int,
+    ) -> "DataOverlapInfo":
+        """
+        Analyze how DataFrame and ParquetFile data overlap.
+
+        Each row group of ``pf`` is located within ``df`` by binary search of
+        its min and max ``ordered_on`` values into ``df[ordered_on]``. The
+        resulting positions are then used to derive the overlap indices and
+        the head/tail flags.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame.
+        pf : ParquetFile
+            Input ParquetFile.
+        ordered_on : str
+            Column name by which data is ordered.
+        max_row_group_size : int
+            Maximum number of rows per chunk.
+
+        Returns
+        -------
+        DataOverlapInfo
+            Instance containing overlap analysis information.
+
+        Notes
+        -----
+        ``numpy.searchsorted`` is used on ``df[ordered_on]``, which presumes
+        this column is sorted in ascending order. This is not checked here
+        (TODO confirm that callers guarantee it).
+
+        """
+        # Locate each row group in the dataframe: for every row group, the
+        # position in 'df' of its min (left side) and just past its max
+        # (right side) value of 'ordered_on'.
+        # Presumably 'pf.statistics' holds one min/max value per row group;
+        # verify against fastparquet documentation.
+        rg_mins = pf.statistics["min"][ordered_on]
+        rg_maxs = pf.statistics["max"][ordered_on]
+        df_idx_rg_starts = np.searchsorted(df.loc[:, ordered_on], rg_mins, side="left")
+        df_idx_rg_ends = np.searchsorted(df.loc[:, ordered_on], rg_maxs, side="right")
+
+        # Determine overlap start/end indices in row groups.
+        # Start is the first row group with a non-zero end position in 'df'
+        # ('argmax' on booleans returns the first True, or 0 if none is True).
+        rg_idx_overlap_start = df_idx_rg_ends.astype(bool).argmax()
+        # End is the first row group reaching the largest end position in 'df'
+        # ('argmax' returns the first occurrence of the maximum).
+        rg_idx_overlap_end = df_idx_rg_ends.argmax()
+        # Analyze overlap patterns.
+        # The flags below have to be computed before the overlap indices are
+        # possibly reset to None further down.
+        # ParquetFile head: row groups exist before the first one with a
+        # non-zero end, or the last row group itself ends at position 0.
+        has_pf_head = rg_idx_overlap_start > 0 or df_idx_rg_ends[-1] == 0
+        # DataFrame head: at least 'max_row_group_size' rows of 'df' lie
+        # before the start position of the first row group.
+        has_df_head = df_idx_rg_starts[0] >= max_row_group_size
+        # ParquetFile tail: row groups exist after the end row group, and the
+        # last row group does not end at position 0.
+        has_pf_tail = rg_idx_overlap_end + 1 < len(rg_mins) and df_idx_rg_ends[-1] != 0
+        # DataFrame tail: rows of 'df' remain past the largest end position.
+        has_df_tail = df_idx_rg_ends[rg_idx_overlap_end] < len(df)
+        # NOTE(review): overlap is reported only when start and end row group
+        # indices differ. When a single row group is involved, both indices
+        # are equal and 'has_overlap' is False - confirm this is intended.
+        if rg_idx_overlap_start != rg_idx_overlap_end:
+            has_overlap = True
+            df_idx_overlap_start = df_idx_rg_starts[rg_idx_overlap_start]
+            df_idx_overlap_end_excl = df_idx_rg_ends[rg_idx_overlap_end]
+        else:
+            # No overlap reported: overlap indices carry no meaning.
+            has_overlap = False
+            rg_idx_overlap_start = None
+            rg_idx_overlap_end = None
+            df_idx_overlap_start = None
+            df_idx_overlap_end_excl = None
+
+        return cls(
+            has_pf_head=has_pf_head,
+            has_df_head=has_df_head,
+            has_overlap=has_overlap,
+            has_pf_tail=has_pf_tail,
+            has_df_tail=has_df_tail,
+            df_idx_overlap_start=df_idx_overlap_start,
+            df_idx_overlap_end_excl=df_idx_overlap_end_excl,
+            rg_idx_overlap_start=rg_idx_overlap_start,
+            rg_idx_overlap_end=rg_idx_overlap_end,
+            df_idx_rg_starts=df_idx_rg_starts,
+            df_idx_rg_ends=df_idx_rg_ends,
+        )