From 0f93d0eb8911ec079683ee092f6cc47522765725 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Mon, 30 Oct 2023 16:32:32 +0300 Subject: [PATCH 01/10] bump 0.0.7_dev --- src/nbmetaclean/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbmetaclean/version.py b/src/nbmetaclean/version.py index 62c73e6..6f26183 100644 --- a/src/nbmetaclean/version.py +++ b/src/nbmetaclean/version.py @@ -1 +1 @@ -__version__ = "0.0.6" # pragma: no cover +__version__ = "0.0.7_dev" # pragma: no cover From 90609a71f2789ed61cd45f3ca81d209842df43d5 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Thu, 2 Nov 2023 11:29:15 +0300 Subject: [PATCH 02/10] typing --- src/nbmetaclean/clean.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index fd25a68..8758ef6 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -5,7 +5,8 @@ from pathlib import Path from typing import Optional, Union -from .core import read_nb, write_nb, PathOrStr +from nbmetaclean.core import read_nb, write_nb + from .typing import NbNode, Metadata NB_METADATA_PRESERVE_MASKS = [ @@ -14,11 +15,11 @@ def filter_meta_mask( - nb_meta: Union[Metadata, str], + nb_meta: Union[str, int, Metadata], mask: [tuple[str, ...]] = None, -) -> Union[Metadata, str]: +) -> Union[str, int, Metadata]: """Filter metadata by mask. If no mask return empty dict.""" - if isinstance(nb_meta, str) or mask == (): + if isinstance(nb_meta, (str, int)) or mask == (): return nb_meta if mask is None: return {} @@ -37,9 +38,9 @@ def filter_metadata( """Clean notebooknode metadata.""" if masks is None: return {} - filtered_meta = {} + filtered_meta: Metadata = {} for mask in masks: - filtered_meta.update(filter_meta_mask(nb_meta, mask)) + filtered_meta.update(filter_meta_mask(nb_meta, mask)) # type: ignore return filtered_meta @@ -84,7 +85,7 @@ def clean_nb( clear_execution_count: bool = True, clear_outputs: bool = False, preserve_nb_metadata_masks: Optional[list[tuple[str, ...]],] = None, - preserve_cell_metadata_mask: Optional[str] = None, + preserve_cell_metadata_mask: Optional[tuple[str, ...]] = None, ) -> tuple[NbNode, bool]: """Clean notebook - metadata, execution_count, outputs. @@ -118,7 +119,7 @@ def clean_nb( def clean_nb_file( - path: Union[PathOrStr, list[PathOrStr]], + path: Union[Path, list[Path]], clear_nb_metadata: bool = True, clear_cell_metadata: bool = True, clear_execution_count: bool = True, From 414467d608ddfc1dbee5a849c70bcd6c1bd9b4d2 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 3 Nov 2023 12:06:09 +0300 Subject: [PATCH 03/10] try exept on clean --- src/nbmetaclean/app.py | 6 ++++- src/nbmetaclean/clean.py | 49 ++++++++++++++++++++++++---------------- tests/test_clean.py | 13 ++++++----- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/nbmetaclean/app.py b/src/nbmetaclean/app.py index cb257b0..f6dba39 100644 --- a/src/nbmetaclean/app.py +++ b/src/nbmetaclean/app.py @@ -41,13 +41,17 @@ def app() -> None: print(f"{path} not exists!") if not cfg.silent: print(f"notebooks to check: {len(nb_files)} ") - cleaned = clean_nb_file( + cleaned, errors = clean_nb_file( nb_files, silent=cfg.silent, preserve_timestamp=not cfg.not_pt, ) if not cfg.silent: print(f"cleaned nbs: {len(cleaned)}") + if errors: + print(f"with errors: {len(errors)}") + for nb, exc in errors: + print(f"{nb}: {exc}") if __name__ == "__main__": diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index 8758ef6..478c61b 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -126,7 +126,7 @@ def clean_nb_file( clear_outputs: bool = False, preserve_timestamp: bool = True, silent: bool = False, -) -> list[Path]: +) -> tuple[list[Path], list[tuple[Path, Exception]]]: """Clean metadata and execution count from notebook. Args: @@ -139,28 +139,37 @@ def clean_nb_file( silent (bool): Silent mode. Defaults to False. Returns: - List[Path]: List of cleaned notebooks + tuple[List[Path], List[TuplePath]]: List of cleaned notebooks, list of notebooks with errors. """ if not isinstance(path, list): path = [path] cleaned: list[Path] = [] + errors: list[tuple[Path, Exception]] = [] to_clean = len(path) for num, filename in enumerate(path): - nb = read_nb(filename) - nb, result = clean_nb( - nb, - clear_execution_count=clear_execution_count, - clear_outputs=clear_outputs, - clear_nb_metadata=clear_nb_metadata, - clear_cell_metadata=clear_cell_metadata, - ) - if result: - cleaned.append(filename) - if preserve_timestamp: - stat = filename.stat() - write_nb(nb, filename) - if preserve_timestamp: - os.utime(filename, (stat.st_atime, stat.st_mtime)) - if not silent: - print(f"done {num + 1} of {to_clean}: {filename}") - return cleaned + try: + nb = read_nb(filename) + except Exception as ex: + errors.append((filename, ex)) + continue + try: + nb, result = clean_nb( + nb, + clear_execution_count=clear_execution_count, + clear_outputs=clear_outputs, + clear_nb_metadata=clear_nb_metadata, + clear_cell_metadata=clear_cell_metadata, + ) + if result: + cleaned.append(filename) + if preserve_timestamp: + stat = filename.stat() + write_nb(nb, filename) + if preserve_timestamp: + os.utime(filename, (stat.st_atime, stat.st_mtime)) + if not silent: + print(f"done {num + 1} of {to_clean}: {filename}") + except Exception as ex: + errors.append((filename, ex)) + continue + return cleaned, errors diff --git a/tests/test_clean.py b/tests/test_clean.py index e2277b0..35352ad 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -164,8 +164,9 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) # clean meta, leave execution_count - cleaned = clean_nb_file(test_nb_path, clear_execution_count=False) + cleaned, errors = clean_nb_file(test_nb_path, clear_execution_count=False) assert len(cleaned) == 1 + assert len(errors) == 0 nb = read_nb(cleaned[0]) assert nb["metadata"] == nb_clean["metadata"] assert nb["cells"][1]["execution_count"] == 1 @@ -173,7 +174,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): # clean meta, execution_count # path as list - cleaned = clean_nb_file([test_nb_path]) + cleaned, errors = clean_nb_file([test_nb_path]) captured = capsys.readouterr() out = captured.out assert out.startswith("done") @@ -183,7 +184,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): assert nb == nb_clean # try clean cleaned - cleaned = clean_nb_file(test_nb_path) + cleaned, errors = clean_nb_file(test_nb_path) assert len(cleaned) == 0 captured = capsys.readouterr() out = captured.out @@ -191,7 +192,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): # silent test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) - cleaned = clean_nb_file(test_nb_path, silent=True) + cleaned, errors = clean_nb_file(test_nb_path, silent=True) assert len(cleaned) == 1 captured = capsys.readouterr() assert not captured.out.strip() @@ -210,7 +211,7 @@ def test_clean_nb_file_timestamp(tmp_path: Path): assert test_nb_stat.st_atime == nb_stat.st_atime assert test_nb_stat.st_mtime == nb_stat.st_mtime - cleaned = clean_nb_file(test_nb_path) + cleaned, errors = clean_nb_file(test_nb_path) assert len(cleaned) == 1 cleaned_stat = cleaned[0].stat() assert True @@ -219,7 +220,7 @@ def test_clean_nb_file_timestamp(tmp_path: Path): # dont preserve timestamp test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) os.utime(test_nb_path, (nb_stat.st_atime, nb_stat.st_mtime)) - cleaned = clean_nb_file(test_nb_path, preserve_timestamp=False) + cleaned, errors = clean_nb_file(test_nb_path, preserve_timestamp=False) assert len(cleaned) == 1 cleaned_stat = cleaned[0].stat() assert True From de066781f7376c9a8926d2d56a835569f12fc284 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 3 Nov 2023 13:53:23 +0300 Subject: [PATCH 04/10] add catch errors --- src/nbmetaclean/clean.py | 36 ++++++++++++++++-------------------- tests/test_clean.py | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index 478c61b..b2f5bb3 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -152,24 +152,20 @@ def clean_nb_file( except Exception as ex: errors.append((filename, ex)) continue - try: - nb, result = clean_nb( - nb, - clear_execution_count=clear_execution_count, - clear_outputs=clear_outputs, - clear_nb_metadata=clear_nb_metadata, - clear_cell_metadata=clear_cell_metadata, - ) - if result: - cleaned.append(filename) - if preserve_timestamp: - stat = filename.stat() - write_nb(nb, filename) - if preserve_timestamp: - os.utime(filename, (stat.st_atime, stat.st_mtime)) - if not silent: - print(f"done {num + 1} of {to_clean}: {filename}") - except Exception as ex: - errors.append((filename, ex)) - continue + nb, result = clean_nb( + nb, + clear_execution_count=clear_execution_count, + clear_outputs=clear_outputs, + clear_nb_metadata=clear_nb_metadata, + clear_cell_metadata=clear_cell_metadata, + ) + if result: + cleaned.append(filename) + if preserve_timestamp: + stat = filename.stat() + write_nb(nb, filename) + if preserve_timestamp: + os.utime(filename, (stat.st_atime, stat.st_mtime)) + if not silent: + print(f"done {num + 1} of {to_clean}: {filename}") return cleaned, errors diff --git a/tests/test_clean.py b/tests/test_clean.py index 35352ad..ffaaa1e 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -194,10 +194,33 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) cleaned, errors = clean_nb_file(test_nb_path, silent=True) assert len(cleaned) == 1 + assert len(errors) == 0 captured = capsys.readouterr() assert not captured.out.strip() +def test_clean_nb_file_errors(capsys: CaptureFixture[str], tmp_path: Path): + """test clean_nb_file, errors""" + path = tmp_path / "wrong_name" + cleaned, errors = clean_nb_file(path) + assert len(cleaned) == 0 + assert len(errors) == 1 + assert errors[0][0] == path + assert "No such file or directory" in str(errors[0][1]) + captured = capsys.readouterr() + assert not captured.out + assert not captured.err + with path.open("w", encoding="utf-8") as fh: + fh.write("wrong nb") + cleaned, errors = clean_nb_file(path) + assert "wrong_name" in str(errors[0]) + assert len(cleaned) == 0 + assert len(errors) == 1 + captured = capsys.readouterr() + assert not captured.out + assert not captured.err + + def test_clean_nb_file_timestamp(tmp_path: Path): """test clean_nb_file, timestamp""" path = Path("tests/test_nbs") @@ -213,6 +236,7 @@ def test_clean_nb_file_timestamp(tmp_path: Path): cleaned, errors = clean_nb_file(test_nb_path) assert len(cleaned) == 1 + assert len(errors) == 0 cleaned_stat = cleaned[0].stat() assert True assert cleaned_stat.st_mtime == test_nb_stat.st_mtime @@ -222,6 +246,7 @@ def test_clean_nb_file_timestamp(tmp_path: Path): os.utime(test_nb_path, (nb_stat.st_atime, nb_stat.st_mtime)) cleaned, errors = clean_nb_file(test_nb_path, preserve_timestamp=False) assert len(cleaned) == 1 + assert len(errors) == 0 cleaned_stat = cleaned[0].stat() assert True assert cleaned_stat.st_mtime != nb_stat.st_mtime From d0efd63676390ff455ceb35024c61d9186655899 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 13:46:09 +0300 Subject: [PATCH 05/10] change hidden argument --- src/nbmetaclean/core.py | 12 +++++++----- tests/test_read_write.py | 11 +++++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/nbmetaclean/core.py b/src/nbmetaclean/core.py index 70b3074..8b24a7d 100644 --- a/src/nbmetaclean/core.py +++ b/src/nbmetaclean/core.py @@ -51,14 +51,14 @@ def write_nb( def get_nb_names( path: Optional[PathOrStr] = None, recursive: bool = True, - filter_hidden: bool = True, + hidden: bool = False, ) -> list[Path]: """Return list of notebooks from `path`. If no `path` return notebooks from current folder. Args: path (Union[Path, str, None]): Path for nb or folder with notebooks. recursive bool: Recursive search. - filter_hidden bool: Filter hidden paths. + hidden bool: Skip or not hidden paths, defaults to False. Raises: sys.exit: If filename or dir not exists or not nb file. @@ -75,14 +75,16 @@ def get_nb_names( result = [] for item in nb_path.iterdir(): if item.is_file() and item.suffix == ".ipynb": - if filter_hidden and item.name.startswith("."): + if not hidden and item.name.startswith("."): continue result.append(item) if item.is_dir(): if recursive: - if filter_hidden and item.name.startswith("."): + if not hidden and item.name.startswith("."): continue - result.extend(get_nb_names(item, recursive, filter_hidden)) + if "checkpoint" in item.name: + continue + result.extend(get_nb_names(item, recursive, hidden)) return result diff --git a/tests/test_read_write.py b/tests/test_read_write.py index 5b07b55..55ee39c 100644 --- a/tests/test_read_write.py +++ b/tests/test_read_write.py @@ -45,16 +45,19 @@ def test_write_nb(tmp_path: Path): def test_get_nb_names(): """test get_nb_names""" path = Path("tests/test_nbs") + # filename as argument file = path / "test_nb_1.ipynb" names = get_nb_names(file) assert len(names) == 1 names.sort(key=lambda x: x.name) assert names[0] == file + # path as argument names = get_nb_names(path) assert len(names) == 2 names.sort(key=lambda x: x.name) assert names[0] == file - names = get_nb_names(path, filter_hidden=False) + # path as argument. add hidden files + names = get_nb_names(path, hidden=True) assert len(names) == 3 try: get_nb_names("wrong_name") @@ -78,7 +81,7 @@ def test_get_nb_names_recursive_hidden(tmp_path: Path): pass files = get_nb_names(tmp_path) assert len(files) == 1 - files = get_nb_names(tmp_path, filter_hidden=False) + files = get_nb_names(tmp_path, hidden=True) assert len(files) == 2 # add simple file with open((tmp_path / "simple"), "w", encoding="utf-8") as _: @@ -95,7 +98,7 @@ def test_get_nb_names_recursive_hidden(tmp_path: Path): pass files = get_nb_names(tmp_path) assert len(files) == 2 - files = get_nb_names(tmp_path, filter_hidden=False) + files = get_nb_names(tmp_path, hidden=True) assert len(files) == 4 files = get_nb_names(tmp_path, recursive=False) @@ -108,7 +111,7 @@ def test_get_nb_names_recursive_hidden(tmp_path: Path): pass with open((hid_dir / ".tst").with_suffix(suffix), "w", encoding="utf-8") as _: pass - files = get_nb_names(tmp_path, filter_hidden=False) + files = get_nb_names(tmp_path, hidden=True) assert len(files) == 6 files = get_nb_names(tmp_path) assert len(files) == 2 From 1bdce1c4a1235f186585e862a1cf331f11d6c0b8 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 17:30:07 +0300 Subject: [PATCH 06/10] cfg for clean --- src/nbmetaclean/app.py | 8 ++- src/nbmetaclean/clean.py | 146 +++++++++++++++++++++++---------------- tests/test_clean.py | 99 +++++++++++++++++++++----- 3 files changed, 173 insertions(+), 80 deletions(-) diff --git a/src/nbmetaclean/app.py b/src/nbmetaclean/app.py index f6dba39..bc2e4b8 100644 --- a/src/nbmetaclean/app.py +++ b/src/nbmetaclean/app.py @@ -1,7 +1,7 @@ import argparse from pathlib import Path -from .clean import clean_nb_file +from .clean import CleanConfig, clean_nb_file from .core import get_nb_names parser = argparse.ArgumentParser( @@ -43,8 +43,10 @@ def app() -> None: print(f"notebooks to check: {len(nb_files)} ") cleaned, errors = clean_nb_file( nb_files, - silent=cfg.silent, - preserve_timestamp=not cfg.not_pt, + CleanConfig( + silent=cfg.silent, + preserve_timestamp=not cfg.not_pt, + ), ) if not cfg.silent: print(f"cleaned nbs: {len(cleaned)}") diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index b2f5bb3..604373b 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -1,5 +1,6 @@ from __future__ import annotations import copy +from dataclasses import dataclass import os from pathlib import Path @@ -9,9 +10,34 @@ from .typing import NbNode, Metadata -NB_METADATA_PRESERVE_MASKS = [ - ("language_info", "name"), -] +NB_METADATA_PRESERVE_MASKS = (("language_info", "name"),) + + +@dataclass +class CleanConfig: + """Clean config. + + Args: + clear_nb_metadata (bool, optional): Clear notebook metadata. Defaults to True. + clear_cell_metadata (bool, optional): Clear cell metadata. Defaults to False. + clear_execution_count (bool, optional): Clear cell execution count. Defaults to True. + clear_outputs (bool, optional): Clear cell outputs. Defaults to False. + preserve_timestamp (bool, optional): Preserve timestamp. Defaults to True. + silent (bool, optional): Silent mode. Defaults to False. + nb_metadata_preserve_mask (Optional[tuple[str, ...]], optional): + Preserve mask for notebook metadata. Defaults to None. + cell_metadata_preserve_mask (Optional[tuple[str, ...]], optional): + Preserve mask for cell metadata. Defaults to None. + """ + + clear_nb_metadata: bool = True + clear_cell_metadata: bool = False + clear_execution_count: bool = True + clear_outputs: bool = False + preserve_timestamp: bool = True + silent: bool = False + nb_metadata_preserve_mask: Optional[tuple[str, ...]] = None + cell_metadata_preserve_mask: Optional[tuple[str, ...]] = None def filter_meta_mask( @@ -44,49 +70,59 @@ def filter_metadata( return filtered_meta -def clean_cell_metadata( +def clean_cell( cell: NbNode, - clear_execution_count: bool = True, - clear_outputs: bool = False, - preserve_cell_metadata_mask: Optional[list[tuple[str, ...]]] = None, + cfg: CleanConfig, ) -> bool: - """Clean cell metadata.""" + """Clean cell: optionally metadata, execution_count and outputs.""" changed = False - if metadata := cell.get("metadata", None): - old_metadata = copy.deepcopy(metadata) - cell["metadata"] = filter_metadata(metadata, preserve_cell_metadata_mask) - if cell["metadata"] != old_metadata: - changed = True - if clear_outputs and cell.get("outputs"): - cell["outputs"] = [] - changed = True - if clear_execution_count and cell.get("execution_count"): + + if cfg.clear_cell_metadata: + if metadata := cell.get("metadata", None): + old_metadata = copy.deepcopy(metadata) + cell["metadata"] = filter_metadata( + metadata, cfg.cell_metadata_preserve_mask + ) + if cell["metadata"] != old_metadata: + changed = True + + if cfg.clear_execution_count and cell.get("execution_count"): cell["execution_count"] = None changed = True - if outputs := cell.get("outputs"): - for output in outputs: - if clear_execution_count and output.get("execution_count", None): - output["execution_count"] = None + + if cell.get("outputs"): + if cfg.clear_outputs: + cell["outputs"] = [] + changed = True + elif cfg.clear_cell_metadata or cfg.clear_execution_count: + result = clean_outputs(cell["outputs"], cfg) + if result: + changed = True + + return changed + + +def clean_outputs(outputs: list[NbNode], cfg: CleanConfig) -> bool: + """Clean outputs.""" + changed = False + for output in outputs: + if cfg.clear_execution_count and output.get("execution_count", None): + output["execution_count"] = None + changed = True + if cfg.clear_cell_metadata and (metadata := output.get("metadata", None)): + old_metadata = copy.deepcopy(metadata) + output["metadata"] = filter_metadata( + metadata, cfg.cell_metadata_preserve_mask + ) + if output["metadata"] != old_metadata: changed = True - if metadata := output.get("metadata", None): - old_metadata = copy.deepcopy(metadata) - output["metadata"] = filter_metadata( - metadata, preserve_cell_metadata_mask - ) - if output["metadata"] != old_metadata: - changed = True return changed def clean_nb( nb: NbNode, - clear_nb_metadata: bool = True, - clear_cell_metadata: bool = True, - clear_execution_count: bool = True, - clear_outputs: bool = False, - preserve_nb_metadata_masks: Optional[list[tuple[str, ...]],] = None, - preserve_cell_metadata_mask: Optional[tuple[str, ...]] = None, -) -> tuple[NbNode, bool]: + cfg: CleanConfig, +) -> bool: """Clean notebook - metadata, execution_count, outputs. Args: @@ -98,41 +134,36 @@ def clean_nb( bool: True if changed. """ changed = False - if clear_nb_metadata and (metadata := nb.get("metadata")): + if cfg.clear_nb_metadata and (metadata := nb.get("metadata")): old_metadata = copy.deepcopy(metadata) - masks = preserve_nb_metadata_masks or NB_METADATA_PRESERVE_MASKS + masks = ( + cfg.nb_metadata_preserve_mask or NB_METADATA_PRESERVE_MASKS + ) # todo: merge or replace? nb["metadata"] = filter_metadata(metadata, masks=masks) if nb["metadata"] != old_metadata: changed = True - if clear_cell_metadata: + if cfg.clear_cell_metadata or cfg.clear_execution_count or cfg.clear_outputs: for cell in nb["cells"]: - result = clean_cell_metadata( + result = clean_cell( cell, - clear_execution_count=clear_execution_count, - clear_outputs=clear_outputs, - preserve_cell_metadata_mask=preserve_cell_metadata_mask, + cfg, ) if result: changed = True - return nb, changed + return changed def clean_nb_file( path: Union[Path, list[Path]], - clear_nb_metadata: bool = True, - clear_cell_metadata: bool = True, - clear_execution_count: bool = True, - clear_outputs: bool = False, - preserve_timestamp: bool = True, - silent: bool = False, + cfg: Optional[CleanConfig] = None, ) -> tuple[list[Path], list[tuple[Path, Exception]]]: """Clean metadata and execution count from notebook. Args: path (Union[str, PosixPath]): Notebook filename or list of names. clear_nb_metadata (bool): Clear notebook metadata. Defaults to True. - clear_cell_metadata (bool): Clear cell metadata. Defaults to True. + clear_cell_metadata (bool): Clear cell metadata. Defaults to False. clear_outputs (bool): Clear outputs. Defaults to False. preserve_timestamp (bool): Preserve timestamp. Defaults to True. clear_execution_count (bool, optional): Clean execution count. Defaults to True. @@ -141,6 +172,8 @@ def clean_nb_file( Returns: tuple[List[Path], List[TuplePath]]: List of cleaned notebooks, list of notebooks with errors. """ + if cfg is None: + cfg = CleanConfig() if not isinstance(path, list): path = [path] cleaned: list[Path] = [] @@ -152,20 +185,17 @@ def clean_nb_file( except Exception as ex: errors.append((filename, ex)) continue - nb, result = clean_nb( + result = clean_nb( nb, - clear_execution_count=clear_execution_count, - clear_outputs=clear_outputs, - clear_nb_metadata=clear_nb_metadata, - clear_cell_metadata=clear_cell_metadata, + cfg, ) if result: cleaned.append(filename) - if preserve_timestamp: + if cfg.preserve_timestamp: stat = filename.stat() write_nb(nb, filename) - if preserve_timestamp: + if cfg.preserve_timestamp: os.utime(filename, (stat.st_atime, stat.st_mtime)) - if not silent: + if not cfg.silent: print(f"done {num + 1} of {to_clean}: {filename}") return cleaned, errors diff --git a/tests/test_clean.py b/tests/test_clean.py index ffaaa1e..e8a7fd3 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -6,7 +6,8 @@ from nbmetaclean.clean import ( NB_METADATA_PRESERVE_MASKS, - clean_cell_metadata, + CleanConfig, + clean_cell, clean_nb, clean_nb_file, filter_meta_mask, @@ -60,7 +61,13 @@ def test_clean_cell_metadata(): assert not cell.get("metadata") assert cell.get("execution_count") == 1 cell["metadata"] = {"some key": "some value"} - changed = clean_cell_metadata(cell, clear_outputs=True) + changed = clean_cell( + cell, + cfg=CleanConfig( + clear_outputs=True, + clear_cell_metadata=True, + ), + ) assert changed assert not cell.get("outputs") assert not cell.get("metadata") @@ -73,24 +80,35 @@ def test_clean_cell_metadata(): "some key": "some value", "some other key": "some value", } - changed = clean_cell_metadata( + changed = clean_cell( cell, - clear_execution_count=False, - preserve_cell_metadata_mask=[("some key",)], + CleanConfig( + clear_execution_count=False, + clear_cell_metadata=True, + cell_metadata_preserve_mask=(("some key",),), + ), ) assert changed assert cell["outputs"][0]["metadata"] == {"some key": "some value"} assert cell["metadata"] == {"some key": "some value"} assert cell["execution_count"] == 1 - # clear outputs, same mask -> no changes meta, clear ex - changed = clean_cell_metadata(cell, preserve_cell_metadata_mask=[("some key",)]) + # clear outputs, same mask -> no changes meta, clear execution_count + changed = clean_cell( + cell, + cfg=CleanConfig(), + ) assert changed assert cell["execution_count"] is None assert cell["metadata"] == {"some key": "some value"} # clear execution_count, metadata - changed = clean_cell_metadata(cell) + changed = clean_cell( + cell, + cfg=CleanConfig( + clear_cell_metadata=True, + ), + ) assert changed assert not cell["outputs"][0]["metadata"] assert not cell["execution_count"] @@ -98,12 +116,49 @@ def test_clean_cell_metadata(): assert not cell["outputs"][0]["metadata"] +def test_clean_cell(): + """test clean_cel""" + test_nb = read_nb("tests/test_nbs/.test_nb_2_meta.ipynb") + + # nothing to clean. + cell = copy.deepcopy(test_nb.get("cells")[1]) + assert cell.get("outputs") + assert not cell.get("metadata") + assert cell.get("execution_count") == 1 + result = clean_cell(cell, CleanConfig(clear_execution_count=False)) + assert not result + + # clean cell metadata, cell without metadata + cell["metadata"] = {} + result = clean_cell(cell, CleanConfig(clear_cell_metadata=True)) + assert result + assert not cell.get("metadata") + assert cell.get("outputs") + + # clear output metadata + cell["outputs"][0]["metadata"] = {"some key": "some value"} + result = clean_cell( + cell, + CleanConfig( + clear_cell_metadata=True, + cell_metadata_preserve_mask=(("some key",),), + ), + ) + assert not result + assert cell["outputs"][0].get("metadata") == {"some key": "some value"} + + def test_clean_cell_metadata_markdown(): """test clean_cell_metadata with markdown cell""" test_nb = read_nb("tests/test_nbs/.test_nb_2_meta.ipynb") cell = copy.deepcopy(test_nb["cells"][0]) cell["metadata"] = {"some key": "some value"} - changed = clean_cell_metadata(cell) + changed = clean_cell( + cell, + cfg=CleanConfig( + clear_cell_metadata=True, + ), + ) assert changed assert not cell["metadata"] @@ -117,7 +172,7 @@ def test_clean_nb(): assert nb["cells"][1]["execution_count"] == 1 assert nb["cells"][1]["outputs"][0]["execution_count"] == 1 assert nb["metadata"] - nb, result = clean_nb(nb) + result = clean_nb(nb, cfg=CleanConfig()) assert result is True assert nb["cells"][1]["execution_count"] is None assert nb["cells"][1]["outputs"][0]["execution_count"] is None @@ -125,12 +180,15 @@ def test_clean_nb(): assert nb == nb_clean # # try clean cleaned - nb, result = clean_nb(nb_clean) + result = clean_nb(nb_clean, cfg=CleanConfig()) assert not result # # clean metadata, leave execution_count nb = read_nb(nb_path) - nb, result = clean_nb(nb, clear_execution_count=False) + result = clean_nb( + nb, + cfg=CleanConfig(clear_execution_count=False), + ) assert result assert nb["cells"][1]["execution_count"] == 1 assert nb["cells"][1]["outputs"][0]["execution_count"] == 1 @@ -139,7 +197,7 @@ def test_clean_nb(): # clean nb metadata, leave cells metadata nb = read_nb(nb_path) nb["cells"][1]["metadata"] = {"some key": "some value"} - nb, result = clean_nb(nb, clear_cell_metadata=False) + result = clean_nb(nb, CleanConfig(clear_execution_count=False)) assert result assert nb["metadata"] == nb_clean["metadata"] assert nb["cells"][1]["metadata"] == {"some key": "some value"} @@ -148,7 +206,7 @@ def test_clean_nb(): # clean cells metadata, leave nb metadata nb = read_nb(nb_path) nb_meta = copy.deepcopy(nb["metadata"]) - nb, result = clean_nb(nb, clear_nb_metadata=False) + result = clean_nb(nb, CleanConfig(clear_nb_metadata=False)) assert result assert nb["metadata"] == nb_meta assert nb["cells"][1]["execution_count"] is None @@ -164,7 +222,10 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) # clean meta, leave execution_count - cleaned, errors = clean_nb_file(test_nb_path, clear_execution_count=False) + cleaned, errors = clean_nb_file( + test_nb_path, + cfg=CleanConfig(clear_execution_count=False), + ) assert len(cleaned) == 1 assert len(errors) == 0 nb = read_nb(cleaned[0]) @@ -174,7 +235,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): # clean meta, execution_count # path as list - cleaned, errors = clean_nb_file([test_nb_path]) + cleaned, errors = clean_nb_file([test_nb_path], CleanConfig()) captured = capsys.readouterr() out = captured.out assert out.startswith("done") @@ -184,7 +245,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): assert nb == nb_clean # try clean cleaned - cleaned, errors = clean_nb_file(test_nb_path) + cleaned, errors = clean_nb_file(test_nb_path, CleanConfig()) assert len(cleaned) == 0 captured = capsys.readouterr() out = captured.out @@ -192,7 +253,7 @@ def test_clean_nb_file(tmp_path: Path, capsys: CaptureFixture[str]): # silent test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) - cleaned, errors = clean_nb_file(test_nb_path, silent=True) + cleaned, errors = clean_nb_file(test_nb_path, CleanConfig(silent=True)) assert len(cleaned) == 1 assert len(errors) == 0 captured = capsys.readouterr() @@ -244,7 +305,7 @@ def test_clean_nb_file_timestamp(tmp_path: Path): # dont preserve timestamp test_nb_path = write_nb(read_nb(path / nb_name), tmp_path / nb_name) os.utime(test_nb_path, (nb_stat.st_atime, nb_stat.st_mtime)) - cleaned, errors = clean_nb_file(test_nb_path, preserve_timestamp=False) + cleaned, errors = clean_nb_file(test_nb_path, CleanConfig(preserve_timestamp=False)) assert len(cleaned) == 1 assert len(errors) == 0 cleaned_stat = cleaned[0].stat() From cdf1fced84c9a940516221707a753a4622b24c86 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 18:03:25 +0300 Subject: [PATCH 07/10] add authors to mask --- src/nbmetaclean/clean.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index 604373b..f5e8e8d 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -10,7 +10,10 @@ from .typing import NbNode, Metadata -NB_METADATA_PRESERVE_MASKS = (("language_info", "name"),) +NB_METADATA_PRESERVE_MASKS = ( + ("language_info", "name"), + ("authors",), +) @dataclass From d97974c43472e9f08d7dd35d368a30baa8888d77 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 18:06:08 +0300 Subject: [PATCH 08/10] sign nbs --- tests/test_nbs/.test_nb_2_meta.ipynb | 6 ++++++ tests/test_nbs/test_nb_1.ipynb | 6 ++++++ tests/test_nbs/test_nb_2_clean.ipynb | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/tests/test_nbs/.test_nb_2_meta.ipynb b/tests/test_nbs/.test_nb_2_meta.ipynb index b75ce89..1a024c9 100644 --- a/tests/test_nbs/.test_nb_2_meta.ipynb +++ b/tests/test_nbs/.test_nb_2_meta.ipynb @@ -29,6 +29,12 @@ } ], "metadata": { + "authors": [ + { + "github": "https://github.com/ayasyrev", + "name": "Andrei Yasyrev" + } + ], "kernelspec": { "display_name": "nbmetaclean", "language": "python", diff --git a/tests/test_nbs/test_nb_1.ipynb b/tests/test_nbs/test_nb_1.ipynb index 5c6deca..4c229c7 100644 --- a/tests/test_nbs/test_nb_1.ipynb +++ b/tests/test_nbs/test_nb_1.ipynb @@ -14,6 +14,12 @@ } ], "metadata": { + "authors": [ + { + "github": "https://github.com/ayasyrev", + "name": "Andrei Yasyrev" + } + ], "language_info": { "name": "python" } diff --git a/tests/test_nbs/test_nb_2_clean.ipynb b/tests/test_nbs/test_nb_2_clean.ipynb index f316a23..82d52cc 100644 --- a/tests/test_nbs/test_nb_2_clean.ipynb +++ b/tests/test_nbs/test_nb_2_clean.ipynb @@ -29,6 +29,12 @@ } ], "metadata": { + "authors": [ + { + "github": "https://github.com/ayasyrev", + "name": "Andrei Yasyrev" + } + ], "language_info": { "name": "python" } From c9c24fa95dc0839e901161d4cc2a4deabd1533cb Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 19:13:52 +0300 Subject: [PATCH 09/10] mask merge --- src/nbmetaclean/clean.py | 21 ++++++++++++++------- tests/test_clean.py | 26 ++++++++++++++++++++++++++ tests/test_read_write.py | 3 ++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/nbmetaclean/clean.py b/src/nbmetaclean/clean.py index f5e8e8d..747a302 100644 --- a/src/nbmetaclean/clean.py +++ b/src/nbmetaclean/clean.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from typing import Optional, Union +from typing import Iterable, Optional, Union from nbmetaclean.core import read_nb, write_nb @@ -31,6 +31,8 @@ class CleanConfig: Preserve mask for notebook metadata. Defaults to None. cell_metadata_preserve_mask (Optional[tuple[str, ...]], optional): Preserve mask for cell metadata. Defaults to None. + mask_merge (bool, optional): Merge masks. Add new mask to default. + If False - use new mask. Defaults to True. """ clear_nb_metadata: bool = True @@ -39,13 +41,14 @@ class CleanConfig: clear_outputs: bool = False preserve_timestamp: bool = True silent: bool = False - nb_metadata_preserve_mask: Optional[tuple[str, ...]] = None - cell_metadata_preserve_mask: Optional[tuple[str, ...]] = None + nb_metadata_preserve_mask: Optional[Iterable[tuple[str, ...]]] = None + cell_metadata_preserve_mask: Optional[Iterable[tuple[str, ...]]] = None + mask_merge: bool = True def filter_meta_mask( nb_meta: Union[str, int, Metadata], - mask: [tuple[str, ...]] = None, + mask: Optional[Iterable[tuple[str, ...]]] = None, ) -> Union[str, int, Metadata]: """Filter metadata by mask. If no mask return empty dict.""" if isinstance(nb_meta, (str, int)) or mask == (): @@ -139,9 +142,13 @@ def clean_nb( changed = False if cfg.clear_nb_metadata and (metadata := nb.get("metadata")): old_metadata = copy.deepcopy(metadata) - masks = ( - cfg.nb_metadata_preserve_mask or NB_METADATA_PRESERVE_MASKS - ) # todo: merge or replace? + masks = NB_METADATA_PRESERVE_MASKS + if cfg.nb_metadata_preserve_mask: + if not cfg.mask_merge: + masks = cfg.nb_metadata_preserve_mask + else: + masks = cfg.nb_metadata_preserve_mask + masks + nb["metadata"] = filter_metadata(metadata, masks=masks) if nb["metadata"] != old_metadata: changed = True diff --git a/tests/test_clean.py b/tests/test_clean.py index e8a7fd3..f0e8f6b 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -51,6 +51,32 @@ def test_new_metadata(): assert new_meta == {"language_info": {"name": "python"}} +def test_clean_nb_metadata(): + """test clean_nb_metadata""" + test_nb = read_nb("tests/test_nbs/test_nb_2_clean.ipynb") + cfg = CleanConfig() + result = clean_nb(test_nb, cfg) + assert not result + + # add metadata, new filter, mask not merged + test_nb["metadata"]["some key"] = "some value" + cfg.nb_metadata_preserve_mask = (("some key",),) + cfg.mask_merge = False + result = clean_nb(test_nb, cfg) + assert result + assert test_nb["metadata"] == {"some key": "some value"} + + # add metadata, new filter, mask merged + test_nb = read_nb("tests/test_nbs/test_nb_2_clean.ipynb") + test_nb["metadata"]["some_key"] = {"key_1": 1, "key_2": 2} + cfg.nb_metadata_preserve_mask = (("some_key", "key_1"),) + cfg.mask_merge = True + result = clean_nb(test_nb, cfg) + assert result + assert test_nb["metadata"]["authors"][0]["name"] == "Andrei Yasyrev" + assert test_nb["metadata"]["some_key"] == {"key_1": 1} + + def test_clean_cell_metadata(): """test clean_cell_metadata""" test_nb = read_nb("tests/test_nbs/.test_nb_2_meta.ipynb") diff --git a/tests/test_read_write.py b/tests/test_read_write.py index 55ee39c..964959d 100644 --- a/tests/test_read_write.py +++ b/tests/test_read_write.py @@ -8,7 +8,8 @@ def test_read_nb(): file = Path("tests/test_nbs/test_nb_1.ipynb") nb = read_nb(file) assert isinstance(nb, dict) - assert nb["metadata"] == {"language_info": {"name": "python"}} + assert nb["metadata"]["language_info"] == {"name": "python"} + assert nb["metadata"]["authors"][0]["name"] == "Andrei Yasyrev" assert nb["nbformat"] == 4 assert nb["nbformat_minor"] == 2 cells = nb["cells"] From c294eb69ce1497a14e4fd06e0397aadf4fc3ece8 Mon Sep 17 00:00:00 2001 From: ayasyrev Date: Fri, 17 Nov 2023 19:19:32 +0300 Subject: [PATCH 10/10] bump 0.0.7 --- src/nbmetaclean/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbmetaclean/version.py b/src/nbmetaclean/version.py index 6f26183..5382d2c 100644 --- a/src/nbmetaclean/version.py +++ b/src/nbmetaclean/version.py @@ -1 +1 @@ -__version__ = "0.0.7_dev" # pragma: no cover +__version__ = "0.0.7" # pragma: no cover