Skip to content

Commit

Permalink
Reduce TTFB for diff generation
Browse files Browse the repository at this point in the history
Lazily populate diff items.

This mostly eliminates TTFB for diffs and also avoids linear-scaled
memory consumption during large diffs

Output seems to be ~30% faster overall
  • Loading branch information
craigds committed Jan 8, 2025
1 parent e111062 commit d8d78ff
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 6 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Please note that compatibility for 0.x releases (software or repositories) isn't

_When adding new entries to the changelog, please include issue/PR numbers wherever possible._

## Unreleased

- diff: Much faster and much lower memory usage when using `--no-sort-keys` [#1032](https://github.com/koordinates/kart/pull/1032)

## 0.16.0

Expand Down
2 changes: 1 addition & 1 deletion kart/base_diff_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def get_file_diff(self):
def iter_deltadiff_items(self, deltas):
if self.sort_keys:
return deltas.sorted_items()
return deltas.items()
return deltas.iter_items()

def filtered_dataset_deltas(self, ds_path, ds_diff):
"""
Expand Down
76 changes: 73 additions & 3 deletions kart/diff_structs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import UserDict
from dataclasses import dataclass
from typing import Any
from typing import Any, Iterator

from .exceptions import InvalidOperation

Expand Down Expand Up @@ -335,8 +335,7 @@ def prune(self, recurse=True):
Deletes any empty RichDicts that are children of self.
If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
"""
items = list(self.items())
for key, value in items:
for key, value in list(self.items()):
if key == "data_changes" and value == False and len(self) == 1:
del self[key]
if not isinstance(value, RichDict):
Expand Down Expand Up @@ -430,6 +429,10 @@ def __json__(self):
return {k: v for k, v in self.items()}


class InvalidatedDeltaDiff(Exception):
pass


class DeltaDiff(Diff):
"""
A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
Expand All @@ -439,16 +442,83 @@ class DeltaDiff(Diff):
child_type = Delta

def __init__(self, initial_contents=()):
self._lazy_initial_contents = None
if isinstance(initial_contents, (dict, UserDict)):
super().__init__(initial_contents)
else:
if isinstance(initial_contents, Iterator):
self._lazy_initial_contents = (
(delta.key, delta) for delta in initial_contents
)
initial_contents = ()
super().__init__((delta.key, delta) for delta in initial_contents)

def __getitem__(self, key):
if key in self.data:
return self.data[key]
self._evaluate_lazy_initial_contents()
return self.data[key]

def __setitem__(self, key, delta):
if key != delta.key:
raise ValueError("Delta must be added at the appropriate key")
super().__setitem__(key, delta)

def _evaluate_lazy_initial_contents(self):
if self._lazy_initial_contents is None:
return
for k, v in self._lazy_initial_contents:
if k not in self:
self[k] = v
self._lazy_initial_contents = None

def __bool__(self):
result = bool(self.data)
if (not result) and self._lazy_initial_contents:
# If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check booleanness.
try:
k, v = next(self._lazy_initial_contents)
except StopIteration:
return False
else:
# remember this result
self.data[k] = v
return True
return result

def __len__(self):
self._evaluate_lazy_initial_contents()
return super().__len__()

def items(self):
self._evaluate_lazy_initial_contents()
return super().items()

def iter_items(self):
"""
Iterates over the items in the DeltaDiff, including any lazy initial contents.
This method consumes the iterator without storing its contents. It's not safe to call this method and then call items() on the same object.
"""
yield from self.data.items()
if self._lazy_initial_contents:
for k, v in self._lazy_initial_contents:
if k not in self:
yield (k, v)

# Invalidate this DeltaDiff; it's not safe to consume it again after this.
self.data = InvalidatedDeltaDiff(
"DeltaDiff can't be used after iter_items() has been called"
)

def keys(self):
self._evaluate_lazy_initial_contents()
return super().keys()

def values(self):
self._evaluate_lazy_initial_contents()
return super().values()

def add_delta(self, delta):
"""Add the given delta at the appropriate key."""
super().__setitem__(delta.key, delta)
Expand Down
5 changes: 3 additions & 2 deletions kart/diff_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,9 @@ def get_dataset_diff(
ds_diff = DatasetDiff.concatenated(
base_target_diff, target_wc_diff, overwrite_original=True
)
# Get rid of parts of the diff-structure that are "empty":
ds_diff.prune()
if include_wc_diff:
# Get rid of parts of the diff-structure that are "empty":
ds_diff.prune()
return ds_diff


Expand Down
37 changes: 37 additions & 0 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import kart
from kart.tabular.v3 import TableV3
from kart.base_dataset import BaseDataset
from kart.diff_format import DiffFormat
from kart.diff_structs import Delta, DeltaDiff
from kart.html_diff_writer import HtmlDiffWriter
Expand Down Expand Up @@ -319,6 +320,42 @@ def slow_down_the_diff(self, ds_path, ds_diff, diff_format=DiffFormat.FULL):
]


def test_diff_doesnt_evaluate_all_deltas_up_front_if_you_dont_sort_keys(
data_archive_readonly, monkeypatch, cli_runner
):
# Test that we can start outputting features before we have instantiated all the feature deltas.
with data_archive_readonly("points") as repo_path:
orig_delta_as_json = JsonLinesDiffWriter.delta_as_json
features_written = 0

def delta_as_json(self, *args, **kwargs):
nonlocal features_written
features_written += 1
return orig_delta_as_json(self, *args, **kwargs)

with monkeypatch.setattr(JsonLinesDiffWriter, "delta_as_json", delta_as_json):
orig_wrap_deltas_from_raw_diff = BaseDataset.wrap_deltas_from_raw_diff

def wrap_deltas_from_raw_diff(self, *args, **kwargs):
yield from orig_wrap_deltas_from_raw_diff(self, *args, **kwargs)
if not features_written:
pytest.fail(
"All deltas shouldn't be evaluated until some features are written"
)

with monkeypatch.setattr(
BaseDataset, "wrap_deltas_from_raw_diff", wrap_deltas_from_raw_diff
):
r = cli_runner.invoke(
[
"diff",
f"--output-format=json-lines",
"--no-sort-keys",
"[EMPTY]...",
]
)


@pytest.mark.parametrize("output_format", DIFF_OUTPUT_FORMATS)
def test_diff_reprojection(output_format, data_working_copy, cli_runner):
"""diff the working copy against HEAD"""
Expand Down

0 comments on commit d8d78ff

Please sign in to comment.