Skip to content

Commit

Permalink
Merge pull request #201 from simleo/file_content_size
Browse files (browse the repository at this point in the history)
Add an option to record file content size
  • Loading branch information
simleo authored Sep 30, 2024
2 parents a2dec85 + 9356f88 commit 619ba5e
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 13 deletions.
9 changes: 8 additions & 1 deletion rocrate/model/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def write(self, base_path):
out_file_path.parent.mkdir(parents=True, exist_ok=True)
mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't')
with open(out_file_path, mode) as out_file:
out_file.write(self.source.getvalue())
content = self.source.getvalue()
out_file.write(content)
if self.record_size:
self._jsonld['contentSize'] = str(len(content))
elif is_url(str(self.source)):
if self.fetch_remote or self.validate_url:
if self.validate_url:
Expand All @@ -62,10 +65,14 @@ def write(self, base_path):
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(self.source, out_file_path)
self._jsonld['contentUrl'] = str(self.source)
if self.record_size:
self._jsonld['contentSize'] = str(out_file_path.stat().st_size)
elif self.source is None:
# Allows recording a File entity whose @id does not exist, see #73
warnings.warn(f"No source for {self.id}")
else:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
if not out_file_path.exists() or not out_file_path.samefile(self.source):
shutil.copy(self.source, out_file_path)
if self.record_size:
self._jsonld['contentSize'] = str(out_file_path.stat().st_size)
3 changes: 2 additions & 1 deletion rocrate/model/file_or_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@
class FileOrDir(DataEntity):

def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
validate_url=False, properties=None):
validate_url=False, properties=None, record_size=False):
if properties is None:
properties = {}
self.fetch_remote = fetch_remote
self.validate_url = validate_url
self.record_size = record_size
self.source = source
if dest_path:
dest_path = Path(dest_path)
Expand Down
17 changes: 10 additions & 7 deletions rocrate/rocrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,15 +342,17 @@ def add_file(
dest_path=None,
fetch_remote=False,
validate_url=False,
properties=None
properties=None,
record_size=False
):
return self.add(File(
self,
source=source,
dest_path=dest_path,
fetch_remote=fetch_remote,
validate_url=validate_url,
properties=properties
properties=properties,
record_size=record_size
))

def add_dataset(
Expand Down Expand Up @@ -478,11 +480,12 @@ def write_zip(self, out_path):

def add_workflow(
self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None,
main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow
main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow,
record_size=False
):
workflow = self.add(cls(
self, source=source, dest_path=dest_path, fetch_remote=fetch_remote,
validate_url=validate_url, properties=properties
validate_url=validate_url, properties=properties, record_size=record_size
))
if isinstance(lang, ComputerLanguage):
assert lang.crate is self
Expand All @@ -503,7 +506,7 @@ def add_workflow(
cwl_dest_path = Path(source).with_suffix(".cwl").name
cwl_workflow = self.add_workflow(
source=cwl_source, dest_path=cwl_dest_path, fetch_remote=fetch_remote, properties=properties,
main=False, lang="cwl", gen_cwl=False, cls=WorkflowDescription
main=False, lang="cwl", gen_cwl=False, cls=WorkflowDescription, record_size=record_size
)
workflow.subjectOf = cwl_workflow
return workflow
Expand Down Expand Up @@ -542,12 +545,12 @@ def add_test_instance(self, suite, url, resource="", service="jenkins", identifi

def add_test_definition(
self, suite, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None,
engine="planemo", engine_version=None
engine="planemo", engine_version=None, record_size=False
):
suite = self.__validate_suite(suite)
definition = self.add(
TestDefinition(self, source=source, dest_path=dest_path, fetch_remote=fetch_remote,
validate_url=validate_url, properties=properties)
validate_url=validate_url, properties=properties, record_size=record_size)
)
if isinstance(engine, SoftwareApplication):
assert engine.crate is self
Expand Down
18 changes: 14 additions & 4 deletions test/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_file_writing(test_data_dir, tmpdir, helpers, gen_preview, to_zip):
file_subdir_id = 'sample_file_subdir.txt'

sample_file = test_data_dir / sample_file_id
file_returned = crate.add_file(sample_file)
file_returned = crate.add_file(sample_file, record_size=True)
assert file_returned.id == sample_file_id
file_returned_subdir = crate.add_file(sample_file, sample_file2_id)
assert file_returned_subdir.id == sample_file2_id
Expand Down Expand Up @@ -99,15 +99,17 @@ def test_file_writing(test_data_dir, tmpdir, helpers, gen_preview, to_zip):
assert json_entities[formatted_creator_id]["name"] == creator_name
if gen_preview:
assert helpers.PREVIEW_FILE_NAME in json_entities
file_entity = json_entities[sample_file_id]
assert file_entity["contentSize"] == str(file1.stat().st_size)


@pytest.mark.parametrize("stream_cls", [io.BytesIO, io.StringIO])
def test_in_mem_stream(stream_cls, tmpdir, helpers):
crate = ROCrate()

test_file_id = 'a/b/test_file.txt'
file_content = b'\x00\x01' if stream_cls is io.BytesIO else 'foo\n'
file_returned = crate.add_file(stream_cls(file_content), test_file_id)
file_content = b'\x00\x01\x02' if stream_cls is io.BytesIO else 'foo'
file_returned = crate.add_file(stream_cls(file_content), test_file_id, record_size=True)
assert file_returned.id == test_file_id

out_path = tmpdir / 'ro_crate_out'
Expand All @@ -121,6 +123,9 @@ def test_in_mem_stream(stream_cls, tmpdir, helpers):
mode = 'r' + ('b' if stream_cls is io.BytesIO else 't')
with open(file1, mode) as f:
assert f.read() == file_content
json_entities = helpers.read_json_entities(out_path)
file_entity = json_entities[test_file_id]
assert file_entity['contentSize'] == '3'


@pytest.mark.parametrize(
Expand All @@ -132,7 +137,11 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip):
url = ('https://raw.githubusercontent.com/ResearchObject/ro-crate-py/'
'master/test/test-data/sample_file.txt')
relpath = "a/b/sample_file.txt"
kw = {"fetch_remote": fetch_remote, "validate_url": validate_url}
kw = {
"fetch_remote": fetch_remote,
"validate_url": validate_url,
"record_size": True,
}
if fetch_remote:
file_ = crate.add_file(url, relpath, **kw)
assert file_.id == relpath
Expand All @@ -154,6 +163,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip):
out_file = out_crate.dereference(file_.id)
assert (out_path / relpath).is_file()
assert out_file["contentUrl"] == url
assert out_file["contentSize"] == str((out_path / file_.id).stat().st_size)
else:
out_file = out_crate.dereference(url)
assert not (out_path / relpath).exists()
Expand Down

0 comments on commit 619ba5e

Please sign in to comment.