Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove audbackend.checksum() and use MD5 sum #255

Merged
merged 5 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion audbackend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from audbackend.core.backend.base import Base as Backend # legacy
from audbackend.core.backend.filesystem import FileSystem # legacy
from audbackend.core.errors import BackendError
from audbackend.core.utils import checksum
from audbackend.core.repository import Repository

# Import optional backends (legacy)
Expand Down
44 changes: 30 additions & 14 deletions audbackend/core/backend/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,38 @@ def _assert_equal_checksum(
):
r"""Assert checksums are equal.

Compare the MD5 sum of a file
(``path``)
to the MD5 sum of a reference file
(``path_ref``).
If check fails,
``path`` is removed
and an error is raised.

"""
if path_is_local:
checksum = utils.checksum(path)
else:
checksum = self.checksum(path)
Both ``path`` and ``path_ref``
can be local files,
or stored on any backend.

if path_ref_is_local:
checksum_ref = utils.checksum(path_ref)
else:
checksum_ref = self.checksum(path_ref)
Args:
path: path to a file.
Its MD5 sum is compared
to a reference one,
calculated from ``path_ref``
path_is_local: if ``True``,
assumes ``path`` is stored on local machine
path_ref: path to a file.
Its MD5 sum is used as reference
path_ref_is_local: if ``True``,
assumes ``path_ref`` is stored on local machine

Raises:
InterruptedError: if the MD5 sums do not match

"""
md5 = audeer.md5(path) if path_is_local else self.checksum(path)
md5_ref = audeer.md5(path_ref) if path_ref_is_local else self.checksum(path_ref)

if checksum != checksum_ref:
if md5 != md5_ref:
if path_is_local:
os.remove(path)
location = "local file system"
Expand All @@ -90,9 +106,9 @@ def _assert_equal_checksum(
f"Execution is interrupted because "
f"{path} "
f"has checksum "
f"'{checksum}' "
f"'{md5}' "
"when the expected checksum is "
f"'{checksum_ref}'. "
f"'{md5_ref}'. "
f"The file has been removed from the "
f"{location}."
)
Expand Down Expand Up @@ -569,7 +585,7 @@ def get_file(
msg = f"Permission denied: '{dst_path}'"
raise PermissionError(msg)

if not os.path.exists(dst_path) or utils.checksum(dst_path) != self.checksum(
if not os.path.exists(dst_path) or audeer.md5(dst_path) != self.checksum(
src_path
):
# get file to a temporary directory first,
Expand Down Expand Up @@ -1042,7 +1058,7 @@ def put_file(
elif os.path.isdir(src_path):
raise utils.raise_is_a_directory(src_path)

checksum = utils.checksum(src_path)
checksum = audeer.md5(src_path)

# skip if file with same checksum already exists
if not self.exists(dst_path) or self.checksum(dst_path) != checksum:
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/backend/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _checksum(
) -> str:
r"""MD5 checksum of file on backend."""
path = self._expand(path)
return utils.checksum(path)
return audeer.md5(path)

def _collapse(
self,
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/interface/versioned.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def checksum(
Examples:
>>> file = "src.txt"
>>> import audeer
>>> audbackend.checksum(file)
>>> audeer.md5(file)
'd41d8cd98f00b204e9800998ecf8427e'
>>> interface.put_file(file, "/file.txt", "1.0.0")
>>> interface.checksum("/file.txt", "1.0.0")
Expand Down
58 changes: 0 additions & 58 deletions audbackend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import re
import time

import audeer

from audbackend.core.errors import BackendError


Expand Down Expand Up @@ -106,62 +104,6 @@ def check_version(version: str) -> str:
return version


def checksum(file: str) -> str:
    r"""Return the checksum of a local file.

    Backends call this function
    to obtain the checksum of a file on the local machine.
    By default the result is the MD5 sum
    computed by :func:`audeer.md5`.

    Parquet files are handled differently:
    if pyarrow_ is installed
    and the schema metadata of the file
    contains a ``"hash"`` entry,
    the value of that entry is returned
    instead of the MD5 sum.

    .. _pyarrow: https://arrow.apache.org/docs/python/index.html

    Args:
        file: file path with extension

    Returns:
        MD5 checksum of file,
        or the stored ``"hash"`` metadata entry
        for parquet files that provide one

    Raises:
        FileNotFoundError: if ``file`` does not exist

    Examples:
        >>> checksum("src.txt")
        'd41d8cd98f00b204e9800998ecf8427e'
        >>> import audformat
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> import pyarrow.parquet as pq
        >>> df = pd.DataFrame([0, 1], columns=["a"])
        >>> hash = audformat.utils.hash(df, strict=True)
        >>> hash
        '9021a9b6e1e696ba9de4fe29346319b2'
        >>> parquet_file = audeer.path("file.parquet")
        >>> table = pa.Table.from_pandas(df)
        >>> table = table.replace_schema_metadata({"hash": hash})
        >>> pq.write_table(table, parquet_file, compression="snappy")
        >>> checksum(parquet_file)
        '9021a9b6e1e696ba9de4fe29346319b2'

    """
    is_parquet = audeer.file_extension(file) == "parquet"
    if is_parquet:
        try:
            # pyarrow is an optional dependency,
            # import it only when a parquet file is inspected
            import pyarrow.parquet as parquet

            schema_metadata = parquet.read_schema(file).metadata
            # metadata keys and values are stored as bytes
            stored_hash = (schema_metadata or {}).get(b"hash")
            if stored_hash is not None:
                return stored_hash.decode()
        except ModuleNotFoundError:
            # without pyarrow, fall back to the MD5 sum below
            pass
    return audeer.md5(file)


def date_format(date: datetime.datetime) -> str:
    r"""Format a date as ``YYYY-MM-DD``.

    Args:
        date: date object to format

    Returns:
        date string containing year, month and day,
        separated by hyphens

    """
    # A non-empty format spec is passed on to ``date.strftime()``
    return f"{date:%Y-%m-%d}"

Expand Down
1 change: 0 additions & 1 deletion docs/api-src/audbackend.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,3 @@ and functions are available.

BackendError
Repository
checksum
1 change: 0 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
autodoc_inherit_docstrings = False # disable docstring inheritance
intersphinx_mapping = {
"audeer": ("https://audeering.github.io/audeer/", None),
"audformat": ("https://audeering.github.io/audformat/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"python": ("https://docs.python.org/3/", None),
}
Expand Down
4 changes: 3 additions & 1 deletion tests/bad_file_system.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import audeer

import audbackend


Expand All @@ -14,7 +16,7 @@ def put_file(
verbose: bool = False,
):
r"""Put file on backend."""
checksum = audbackend.checksum(src_path)
checksum = audeer.md5(src_path)
audbackend.core.utils.call_function_on_backend(
self._put_file,
src_path,
Expand Down
29 changes: 29 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import audeer
import audformat

import audbackend

Expand Down Expand Up @@ -83,6 +84,34 @@ def owner(request):
yield owner


@pytest.fixture(scope="function")
def parquet_file(tmpdir):
    r"""Yield the path to a parquet file written by ``audformat``.

    A small database with a single filewise table
    is created and its table is saved
    in parquet format inside ``tmpdir``.

    ``audformat`` can store a checksum,
    derived from the content of the table,
    in the metadata of the parquet file.
    As parquet files cannot be written deterministically,
    this checksum allows tracking
    whether the content has changed.

    """
    database = audformat.Database("mydb")
    database.schemes["age"] = audformat.Scheme("int")
    database["files"] = audformat.Table(audformat.filewise_index(["f1"]))

    table = database["files"]
    table["age"] = audformat.Column(scheme_id="age")
    table["age"].set([40])

    path = audeer.path(tmpdir, "files.parquet")
    # ``save()`` expects the path without file extension
    path_without_extension = audeer.replace_file_extension(path, "")
    table.save(path_without_extension, storage_format="parquet")

    yield path


@pytest.fixture(scope="function", autouse=False)
def interface(tmpdir_factory, request):
r"""Create a backend with interface.
Expand Down
20 changes: 20 additions & 0 deletions tests/test_backend_artifactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,23 @@ def test_open_close(host, repository):
audbackend.backend.Artifactory.create(host, repository)
backend.open()
backend.close()


@pytest.mark.parametrize(
    "interface",
    [(audbackend.backend.Artifactory, audbackend.interface.Maven)],
    indirect=True,
)
def test_parquet_file(interface, parquet_file):
    """Test uploading a parquet file that stores a hash in its metadata.

    The deploy method of Artifactory
    has to receive the MD5 sum of the file,
    not the hash stored in the parquet metadata.
    See https://github.com/audeering/audbackend/issues/254.

    """
    version = "1.0.0"
    file_name = os.path.basename(parquet_file)
    # Store the file directly under the root of the backend
    dst_file = "/" + file_name
    interface.put_file(parquet_file, dst_file, version)
    assert interface.exists(dst_file, version)
112 changes: 0 additions & 112 deletions tests/test_utils.py

This file was deleted.