Reduce TTFB for diff generation

Lazily populate diff items. This mostly eliminates TTFB for diffs and also avoids linear-scaled memory consumption during large diffs Output seems to be ~30% faster overall
koordinates · Jan 8, 2025 · d8d78ff · d8d78ff
1 parent e111062
commit d8d78ff
Show file tree

Hide file tree

Showing 5 changed files with 117 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@ Please note that compatibility for 0.x releases (software or repositories) isn't
 
 _When adding new entries to the changelog, please include issue/PR numbers wherever possible._
 
+## Unreleased
+
+- diff: Much faster and much lower memory usage when using `--no-sort-keys` [#1032](https://github.com/koordinates/kart/pull/1032)
 
 ## 0.16.0
 

diff --git a/kart/base_diff_writer.py b/kart/base_diff_writer.py
@@ -456,7 +456,7 @@ def get_file_diff(self):
     def iter_deltadiff_items(self, deltas):
         if self.sort_keys:
             return deltas.sorted_items()
-        return deltas.items()
+        return deltas.iter_items()
 
     def filtered_dataset_deltas(self, ds_path, ds_diff):
         """

diff --git a/kart/diff_structs.py b/kart/diff_structs.py
@@ -1,6 +1,6 @@
 from collections import UserDict
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Iterator
 
 from .exceptions import InvalidOperation
 
@@ -335,8 +335,7 @@ def prune(self, recurse=True):
         Deletes any empty RichDicts that are children of self.
         If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
         """
-        items = list(self.items())
-        for key, value in items:
+        for key, value in list(self.items()):
             if key == "data_changes" and value == False and len(self) == 1:
                 del self[key]
             if not isinstance(value, RichDict):
@@ -430,6 +429,10 @@ def __json__(self):
         return {k: v for k, v in self.items()}
 
 
+class InvalidatedDeltaDiff(Exception):
+    pass
+
+
 class DeltaDiff(Diff):
     """
     A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
@@ -439,16 +442,83 @@ class DeltaDiff(Diff):
     child_type = Delta
 
     def __init__(self, initial_contents=()):
+        self._lazy_initial_contents = None
         if isinstance(initial_contents, (dict, UserDict)):
             super().__init__(initial_contents)
         else:
+            if isinstance(initial_contents, Iterator):
+                self._lazy_initial_contents = (
+                    (delta.key, delta) for delta in initial_contents
+                )
+                initial_contents = ()
             super().__init__((delta.key, delta) for delta in initial_contents)
 
+    def __getitem__(self, key):
+        if key in self.data:
+            return self.data[key]
+        self._evaluate_lazy_initial_contents()
+        return self.data[key]
+
     def __setitem__(self, key, delta):
         if key != delta.key:
             raise ValueError("Delta must be added at the appropriate key")
         super().__setitem__(key, delta)
 
+    def _evaluate_lazy_initial_contents(self):
+        if self._lazy_initial_contents is None:
+            return
+        for k, v in self._lazy_initial_contents:
+            if k not in self:
+                self[k] = v
+        self._lazy_initial_contents = None
+
+    def __bool__(self):
+        result = bool(self.data)
+        if (not result) and self._lazy_initial_contents:
+            # If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check booleanness.
+            try:
+                k, v = next(self._lazy_initial_contents)
+            except StopIteration:
+                return False
+            else:
+                # remember this result
+                self.data[k] = v
+                return True
+        return result
+
+    def __len__(self):
+        self._evaluate_lazy_initial_contents()
+        return super().__len__()
+
+    def items(self):
+        self._evaluate_lazy_initial_contents()
+        return super().items()
+
+    def iter_items(self):
+        """
+        Iterates over the items in the DeltaDiff, including any lazy initial contents.
+
+        This method consumes the iterator without storing its contents. It's not safe to call this method and then call items() on the same object.
+        """
+        yield from self.data.items()
+        if self._lazy_initial_contents:
+            for k, v in self._lazy_initial_contents:
+                if k not in self:
+                    yield (k, v)
+
+        # Invalidate this DeltaDiff; it's not safe to consume it again after this.
+        self.data = InvalidatedDeltaDiff(
+            "DeltaDiff can't be used after iter_items() has been called"
+        )
+
+    def keys(self):
+        self._evaluate_lazy_initial_contents()
+        return super().keys()
+
+    def values(self):
+        self._evaluate_lazy_initial_contents()
+        return super().values()
+
     def add_delta(self, delta):
         """Add the given delta at the appropriate key."""
         super().__setitem__(delta.key, delta)

diff --git a/kart/diff_util.py b/kart/diff_util.py
@@ -154,8 +154,9 @@ def get_dataset_diff(
     ds_diff = DatasetDiff.concatenated(
         base_target_diff, target_wc_diff, overwrite_original=True
     )
-    # Get rid of parts of the diff-structure that are "empty":
-    ds_diff.prune()
+    if include_wc_diff:
+        # Get rid of parts of the diff-structure that are "empty":
+        ds_diff.prune()
     return ds_diff
 
 

diff --git a/tests/test_diff.py b/tests/test_diff.py
@@ -11,6 +11,7 @@
 
 import kart
 from kart.tabular.v3 import TableV3
+from kart.base_dataset import BaseDataset
 from kart.diff_format import DiffFormat
 from kart.diff_structs import Delta, DeltaDiff
 from kart.html_diff_writer import HtmlDiffWriter
@@ -319,6 +320,42 @@ def slow_down_the_diff(self, ds_path, ds_diff, diff_format=DiffFormat.FULL):
         ]
 
 
+def test_diff_doesnt_evaluate_all_deltas_up_front_if_you_dont_sort_keys(
+    data_archive_readonly, monkeypatch, cli_runner
+):
+    # Test that we can start outputting features before we have instantiated all the feature deltas.
+    with data_archive_readonly("points") as repo_path:
+        orig_delta_as_json = JsonLinesDiffWriter.delta_as_json
+        features_written = 0
+
+        def delta_as_json(self, *args, **kwargs):
+            nonlocal features_written
+            features_written += 1
+            return orig_delta_as_json(self, *args, **kwargs)
+
+        with monkeypatch.setattr(JsonLinesDiffWriter, "delta_as_json", delta_as_json):
+            orig_wrap_deltas_from_raw_diff = BaseDataset.wrap_deltas_from_raw_diff
+
+            def wrap_deltas_from_raw_diff(self, *args, **kwargs):
+                yield from orig_wrap_deltas_from_raw_diff(self, *args, **kwargs)
+                if not features_written:
+                    pytest.fail(
+                        "All deltas shouldn't be evaluated until some features are written"
+                    )
+
+            with monkeypatch.setattr(
+                BaseDataset, "wrap_deltas_from_raw_diff", wrap_deltas_from_raw_diff
+            ):
+                r = cli_runner.invoke(
+                    [
+                        "diff",
+                        f"--output-format=json-lines",
+                        "--no-sort-keys",
+                        "[EMPTY]...",
+                    ]
+                )
+
+
 @pytest.mark.parametrize("output_format", DIFF_OUTPUT_FORMATS)
 def test_diff_reprojection(output_format, data_working_copy, cli_runner):
     """diff the working copy against HEAD"""