From e384a900d1eac8eb117e608fda5690515c2f1744 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 11:12:40 +0000 Subject: [PATCH 01/16] update fixture type hint --- src/python/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index 551f342fa..02314957a 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -52,7 +52,7 @@ def shared_data_dir(pytestconfig: Config) -> Path: @pytest.fixture(name="json_data") -def fixture_json_data(data_dir: Path) -> Callable: +def fixture_json_data(data_dir: Path) -> Callable[[str], Any]: """Returns a JSON test object factory. Args: From faa87bca6bb55ac98ee7bb0285a83bf56c312daf Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 11:14:18 +0000 Subject: [PATCH 02/16] update fixture name and docstring --- src/python/tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index 02314957a..b6021a119 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -68,8 +68,8 @@ def _json_data(file_name: str) -> Any: @pytest.fixture(name="assert_files") -def assert_files() -> Callable[[Path, Path], None]: - """Provide a function that asserts two files and show a diff if they differ.""" +def fixture_assert_files() -> Callable[[Path, Path], None]: + """Returns a function that asserts if two files are equal and shows a diff if they differ.""" def _assert_files(result_path: Path, expected_path: Path) -> None: with open(result_path, "r") as result_fh: From 6c1f8aa930a89718c9f715b596d88c1eb531a5a5 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 11:17:52 +0000 Subject: [PATCH 03/16] update docstring and use it in parser --- .../ensembl/io/genomio/genome_metadata/prepare.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git 
a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index d33f18159..3afb6a7c2 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Expand the genome_metadata with more details for: -the provider, assembly and gene build version, and the taxonomy. +"""Expand the genome metadata file adding information about the provider, taxonomy, and assembly and +gene build versions. """ __all__ = [ @@ -257,12 +257,7 @@ def prepare_genome_metadata( def main() -> None: """Module's entry-point.""" - parser = ArgumentParser( - description=( - "Add information about provider, taxonomy and assembly and gene build version to the genome " - "metadata file." - ) - ) + parser = ArgumentParser(description=__doc__) parser.add_argument_src_path("--input_file", required=True, help="Genome metadata JSON file") parser.add_argument_dst_path( "--output_file", required=True, help="Output path for the new genome metadata file" From 7b514c130053325c0293548281e31acf3f6ee8a8 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 11:17:30 +0000 Subject: [PATCH 04/16] update prepare.add_provider() and add unit test also update NCBI's provider URLS --- .../io/genomio/genome_metadata/prepare.py | 34 +++--- .../tests/genome_metadata/test_prepare.py | 100 ++++++++++++++++++ .../test_prepare/cncb_genome.json | 8 ++ .../test_prepare/genbank_genome.json | 13 +++ .../test_prepare/refseq_genome.json | 13 +++ .../test_prepare/updated_genome.json | 17 +++ 6 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 src/python/tests/genome_metadata/test_prepare.py create mode 100644 src/python/tests/genome_metadata/test_prepare/cncb_genome.json 
create mode 100644 src/python/tests/genome_metadata/test_prepare/genbank_genome.json create mode 100644 src/python/tests/genome_metadata/test_prepare/refseq_genome.json create mode 100644 src/python/tests/genome_metadata/test_prepare/updated_genome.json diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 3afb6a7c2..691dc905a 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -46,21 +46,21 @@ "GenBank": { "assembly": { "provider_name": "GenBank", - "provider_url": "https://www.ncbi.nlm.nih.gov/assembly", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome", }, "annotation": { "provider_name": "GenBank", - "provider_url": "https://www.ncbi.nlm.nih.gov/assembly", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome", }, }, "RefSeq": { "assembly": { "provider_name": "RefSeq", - "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome", }, "annotation": { "provider_name": "RefSeq", - "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome", }, }, } @@ -75,41 +75,41 @@ class MetadataError(Exception): """When a metadata value is not expected.""" -def add_provider(genome_data: Dict, gff3_file: Optional[PathLike] = None) -> None: - """Adds provider metadata for assembly and gene models in `genome_data`. +def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) -> None: + """Updates the genome metadata adding provider information for assembly and gene models. - Assembly provider metadata will only be added if it is missing, i.e. neither ``provider_name`` or - ``provider_url`` are present. The gene model metadata will only be added if `gff3_file` is provided. + Assembly provider metadata will only be added if it is missing, i.e. 
neither `provider_name` or + `provider_url` are present. The gene model metadata will only be added if `gff3_file` is provided. Args: genome_data: Genome information of assembly, accession and annotation. gff3_file: Path to GFF3 file to use as annotation source for this genome. + Raises: + MetadataError: If accession's format in genome metadata does not match with a known provider. """ # Get accession provider - accession = genome_data["assembly"]["accession"] + accession = genome_metadata["assembly"]["accession"] if accession.startswith("GCF"): provider = PROVIDER_DATA["RefSeq"] elif accession.startswith("GCA"): provider = PROVIDER_DATA["GenBank"] else: - raise MetadataError(f"Accession doesn't look like an INSDC or RefSeq accession: {accession}") + raise MetadataError(f"Accession does not look like an INSDC or RefSeq accession: {accession}") # Add assembly provider (if missing) - assembly = genome_data["assembly"] + assembly = genome_metadata["assembly"] if (not "provider_name" in assembly) and (not "provider_url" in assembly): assembly["provider_name"] = provider["assembly"]["provider_name"] - assembly["provider_url"] = provider["assembly"]["provider_url"] + assembly["provider_url"] = f'{provider["assembly"]["provider_url"]}/{accession}' # Add annotation provider if there are gene models if gff3_file: - annotation = {} - if "annotation" in genome_data: - annotation = genome_data["annotation"] + annotation = genome_metadata.get("annotation", {}) if ("provider_name" not in annotation) and ("provider_url" not in annotation): annotation["provider_name"] = provider["annotation"]["provider_name"] - annotation["provider_url"] = provider["annotation"]["provider_url"] - genome_data["annotation"] = annotation + annotation["provider_url"] = f'{provider["annotation"]["provider_url"]}/{accession}' + genome_metadata["annotation"] = annotation def add_assembly_version(genome_data: Dict) -> None: diff --git a/src/python/tests/genome_metadata/test_prepare.py 
b/src/python/tests/genome_metadata/test_prepare.py new file mode 100644 index 000000000..11c9d9834 --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -0,0 +1,100 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.genome_metadata.prepare` module. + +Typical usage example:: + $ pytest test_prepare.py + +""" + +from contextlib import nullcontext as does_not_raise +from pathlib import Path +from typing import Any, Callable, ContextManager, Dict, Optional + +from deepdiff import DeepDiff +import pytest + +from ensembl.io.genomio.genome_metadata import prepare + + +# @pytest.mark.dependency(name="test_get_gbff_regions") +@pytest.mark.parametrize( + "genome_file, gff3_file, output, expectation", + [ + pytest.param( + "genbank_genome.json", + None, + { + "assembly": { + "provider_name": "GenBank", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_013436015.1", + }, + }, + does_not_raise(), + id="GenBank assembly", + ), + pytest.param( + "refseq_genome.json", + "fake.gff3", + { + "assembly": { + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000004695.1", + }, + "annotation": { + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000004695.1", + }, + }, + does_not_raise(), + id="RefSeq assembly and 
annotation", + ), + pytest.param( + "updated_genome.json", + "fake.gff3", + { + "assembly": {"provider_name": "GenBank", "provider_url": None}, + "annotation": {"provider_name": "GenBank", "provider_url": None}, + }, + does_not_raise(), + id="Provider information already present", + ), + pytest.param( + "cncb_genome.json", None, {}, pytest.raises(prepare.MetadataError), id="Unexpected provider" + ), + ], +) +def test_add_provider( + json_data: Callable[[str], Any], + genome_file: str, + gff3_file: Optional[str], + output: Dict[str, Dict[str, Optional[str]]], + expectation: ContextManager, +) -> None: + """Tests the `prepare.add_provider()` method. + + Args: + json_data: JSON test file parsing fixture. + genome_file: Genome metadata JSON file. + gff3_file: GFF3 file. + output: Expected elements present in the updated genome metadata. + expectation: Context manager for the expected exception (if any). + """ + with expectation: + genome_metadata = json_data(genome_file) + prepare.add_provider(genome_metadata, gff3_file) + for section, metadata in output.items(): + for key, value in metadata.items(): + assert genome_metadata[section].get(key, None) == value diff --git a/src/python/tests/genome_metadata/test_prepare/cncb_genome.json b/src/python/tests/genome_metadata/test_prepare/cncb_genome.json new file mode 100644 index 000000000..673848576 --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/cncb_genome.json @@ -0,0 +1,8 @@ +{ + "assembly": { + "accession": "GWHAMMH00000000" + }, + "species": { + "scientific_name": "Ixodes persulcatus" + } +} \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/genbank_genome.json b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json new file mode 100644 index 000000000..fd3f6cd7e --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json @@ -0,0 +1,13 @@ +{ + "BRC4": { + "component": "VectorBase", + "organism_abbrev": "rannulKG" + }, + 
"assembly": { + "accession": "GCA_013436015.1" + }, + "species": { + "scientific_name": "Rhipicephalus annulatus", + "strain": "Klein Grass" + } +} \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/refseq_genome.json b/src/python/tests/genome_metadata/test_prepare/refseq_genome.json new file mode 100644 index 000000000..ce490dae4 --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/refseq_genome.json @@ -0,0 +1,13 @@ +{ + "BRC4": { + "component": "AmoebaDB", + "organism_abbrev": "ddisAX4" + }, + "assembly": { + "accession": "GCF_000004695.1" + }, + "species": { + "scientific_name": "Dictyostelium discoideum AX4", + "strain": "AX4" + } +} \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/updated_genome.json b/src/python/tests/genome_metadata/test_prepare/updated_genome.json new file mode 100644 index 000000000..f4ae7720b --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/updated_genome.json @@ -0,0 +1,17 @@ +{ + "BRC4": { + "component": "VectorBase", + "organism_abbrev": "rannulKG" + }, + "annotation": { + "provider_name": "GenBank" + }, + "assembly": { + "accession": "GCA_013436015.1", + "provider_name": "GenBank" + }, + "species": { + "scientific_name": "Rhipicephalus annulatus", + "strain": "Klein Grass" + } +} \ No newline at end of file From 0c86202ac6f4c6733d76bf72884c87c34c679e3f Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 11:46:23 +0000 Subject: [PATCH 05/16] update prepare.add_assembly_version() and add unit test --- .../io/genomio/genome_metadata/prepare.py | 8 +++---- .../tests/genome_metadata/test_prepare.py | 22 ++++++++++++++++++- .../test_prepare/genbank_genome.json | 2 +- .../test_prepare/updated_genome.json | 3 ++- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 691dc905a..118c692cc 
100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -113,7 +113,7 @@ def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) -> def add_assembly_version(genome_data: Dict) -> None: - """Adds version number to the genome's assembly if one is not present already. + """Adds version number to the genome's assembly information if one is not present already. Args: genome_data: Genome information of assembly, accession and annotation. @@ -122,9 +122,9 @@ def add_assembly_version(genome_data: Dict) -> None: assembly = genome_data["assembly"] if not "version" in assembly: accession = assembly["accession"] - values = accession.split(".") - if (len(values) == 2) and values[1]: - assembly["version"] = int(values[1]) + version = accession.partition(".")[2] + if version: + assembly["version"] = int(version) def add_genebuild_metadata(genome_data: Dict) -> None: diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index 11c9d9834..0e9fd9442 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -39,7 +39,7 @@ { "assembly": { "provider_name": "GenBank", - "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_013436015.1", + "provider_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_013436015.2", }, }, does_not_raise(), @@ -98,3 +98,23 @@ def test_add_provider( for section, metadata in output.items(): for key, value in metadata.items(): assert genome_metadata[section].get(key, None) == value + + +@pytest.mark.parametrize( + "genome_file, output", + [ + ("genbank_genome.json", 2), + ("updated_genome.json", 1), + ], +) +def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, output: int) -> None: + """Tests the `prepare.add_assembly_version()` method. + + Args: + json_data: JSON test file parsing fixture. 
+ genome_file: Genome metadata JSON file. + output: Assembly version expected in the updated genome metadata. + """ + genome_metadata = json_data(genome_file) + prepare.add_assembly_version(genome_metadata) + assert genome_metadata["assembly"]["version"] == output diff --git a/src/python/tests/genome_metadata/test_prepare/genbank_genome.json b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json index fd3f6cd7e..7665c52a5 100644 --- a/src/python/tests/genome_metadata/test_prepare/genbank_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json @@ -4,7 +4,7 @@ "organism_abbrev": "rannulKG" }, "assembly": { - "accession": "GCA_013436015.1" + "accession": "GCA_013436015.2" }, "species": { "scientific_name": "Rhipicephalus annulatus", diff --git a/src/python/tests/genome_metadata/test_prepare/updated_genome.json b/src/python/tests/genome_metadata/test_prepare/updated_genome.json index f4ae7720b..18a3981c2 100644 --- a/src/python/tests/genome_metadata/test_prepare/updated_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/updated_genome.json @@ -8,7 +8,8 @@ }, "assembly": { "accession": "GCA_013436015.1", - "provider_name": "GenBank" + "provider_name": "GenBank", + "version": 1 }, "species": { "scientific_name": "Rhipicephalus annulatus", From f0e92b59639451667399c795362fe6dc2beb3582 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 6 Mar 2024 12:03:22 +0000 Subject: [PATCH 06/16] update prepare.add_genebuild_metadata() and add unit test --- .../io/genomio/genome_metadata/prepare.py | 9 +++---- .../tests/genome_metadata/test_prepare.py | 27 +++++++++++++++++++ .../test_prepare/updated_genome.json | 4 +++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 118c692cc..7e4befaf4 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ 
b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -117,7 +117,6 @@ def add_assembly_version(genome_data: Dict) -> None: Args: genome_data: Genome information of assembly, accession and annotation. - """ assembly = genome_data["assembly"] if not "version" in assembly: @@ -128,20 +127,20 @@ def add_assembly_version(genome_data: Dict) -> None: def add_genebuild_metadata(genome_data: Dict) -> None: - """Adds missing genebuild metadata. + """Adds genebuild metadata to genome information if not present already. - The default convention is to use the current date as ``version`` and ``start_date``. + The default convention is to use the current date as `version` and `start_date`. Args: genome_data: Genome information of assembly, accession and annotation. - """ - genebuild = genome_data["genebuild"] + genebuild = genome_data.get("genebuild", {}) current_date = datetime.date.today().isoformat() if not "version" in genebuild: genebuild["version"] = current_date if not "start_date" in genebuild: genebuild["start_date"] = current_date + genome_data["genebuild"] = genebuild def add_species_metadata(genome_data: Dict, base_api_url: str = DEFAULT_API_URL) -> None: diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index 0e9fd9442..a596e9755 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -22,6 +22,7 @@ from contextlib import nullcontext as does_not_raise from pathlib import Path from typing import Any, Callable, ContextManager, Dict, Optional +from unittest.mock import Mock, patch from deepdiff import DeepDiff import pytest @@ -118,3 +119,29 @@ def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, genome_metadata = json_data(genome_file) prepare.add_assembly_version(genome_metadata) assert genome_metadata["assembly"]["version"] == output + + +@patch("datetime.date") +@pytest.mark.parametrize( + "genome_file, 
output", + [ + ("genbank_genome.json", "03-2024"), + ("updated_genome.json", "01-2021"), + ], +) +def test_add_genebuild_metadata( + mock_date: Mock, json_data: Callable[[str], Any], genome_file: str, output: str +) -> None: + """Tests the `prepare.add_genebuild_metadata()` method. + + Args: + json_data: JSON test file parsing fixture. + genome_file: Genome metadata JSON file. + output: Expected date for genebuild's `start_date` and `version` in the updated genome metadata. + """ + mock_date.today.return_value = mock_date + mock_date.isoformat.return_value = output + genome_metadata = json_data(genome_file) + prepare.add_genebuild_metadata(genome_metadata) + assert genome_metadata["genebuild"]["start_date"] == output + assert genome_metadata["genebuild"]["version"] == output diff --git a/src/python/tests/genome_metadata/test_prepare/updated_genome.json b/src/python/tests/genome_metadata/test_prepare/updated_genome.json index 18a3981c2..42d64dff6 100644 --- a/src/python/tests/genome_metadata/test_prepare/updated_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/updated_genome.json @@ -11,6 +11,10 @@ "provider_name": "GenBank", "version": 1 }, + "genebuild": { + "start_date": "01-2021", + "version": "01-2021" + }, "species": { "scientific_name": "Rhipicephalus annulatus", "strain": "Klein Grass" From 091ac03d2837596d2f277a3bd897c6acc878a88d Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 12:03:24 +0000 Subject: [PATCH 07/16] update preprare.get_taxonomy_from_accession() (and private function) and add unit tests --- .../io/genomio/genome_metadata/prepare.py | 47 ++++---- .../tests/genome_metadata/test_prepare.py | 109 ++++++++++++++++++ .../test_prepare/default_taxonomy.xml | 8 ++ .../test_prepare/no_taxonomy.xml | 4 + .../test_prepare/strain_taxonomy.xml | 9 ++ 5 files changed, 151 insertions(+), 26 deletions(-) create mode 100644 src/python/tests/genome_metadata/test_prepare/default_taxonomy.xml create mode 100644 
src/python/tests/genome_metadata/test_prepare/no_taxonomy.xml create mode 100644 src/python/tests/genome_metadata/test_prepare/strain_taxonomy.xml diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 7e4befaf4..623e1501b 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -31,7 +31,7 @@ import datetime from os import PathLike -from typing import Dict, Optional +from typing import Any, Dict, Optional from xml.etree import ElementTree from xml.etree.ElementTree import Element @@ -165,7 +165,7 @@ def add_species_metadata(genome_data: Dict, base_api_url: str = DEFAULT_API_URL) species["scientific_name"] = taxonomy["scientific_name"] -def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_URL) -> Dict: +def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_URL) -> Dict[str, Any]: """Returns the taxonomy metadata associated to the given accession. Args: @@ -173,57 +173,52 @@ def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_ base_api_url: Base API URL to fetch the taxonomy data from. Returns: - Dictionary with key-value pairs for ``taxon_id`` and ``scientific_name``. ``strain`` will be added - only if present in the fetched taxonomy data. + Dictionary with key-value pairs for `taxon_id` and `scientific_name`. `strain` will also be + included if it is present in the fetched taxonomy data. Raises: - MissinDataException: If ``TAXON_ID`` or ``SCIENTIFIC_NAME`` are missing in the taxonomy data fetched. + MissingNodeError: If `TAXON` node is missing in the taxonomy data fetched. 
""" # Use the GenBank accession without version gb_accession = accession.replace("GCF", "GCA").split(".")[0] response = requests.get(f"{base_api_url}/{gb_accession}", timeout=60) entry = ElementTree.fromstring(response.text) - taxon_node = entry.find(".//TAXON") if taxon_node is None: - raise MissingNodeError("Can't find the TAXON node") - + raise MissingNodeError("Cannot find the TAXON node") # Fetch taxon ID, scientific_name and strain taxon_id = _get_node_text(taxon_node, "TAXON_ID") scientific_name = _get_node_text(taxon_node, "SCIENTIFIC_NAME") strain = _get_node_text(taxon_node, "STRAIN", optional=True) - - if taxon_id and scientific_name: - taxonomy = { - "taxon_id": int(taxon_id), - "scientific_name": scientific_name, - } + taxonomy = { + "taxon_id": int(taxon_id), + "scientific_name": scientific_name, + } if strain: taxonomy["strain"] = strain return taxonomy -def _get_node_text(node: Element, tag: str, optional: bool = False) -> Optional[str]: +def _get_node_text(node: Optional[Element], tag: str, optional: bool = False) -> Optional[str]: """Returns the value of the field matching the provided tag inside `node`. - By default raise a MissingNodeException if the tag is not found. - If optional is True and no tag is found, return None. + + If the tag is not present and `optional` is True, returns `None` instead. Args: node: Node of an XML tree. tag: Tag to fetch within the node. - optional: Don't raise an exception if the tag doesn't exist. + optional: Do not raise an exception if the tag does not exist. + Raises: + MissingNodeError: If no node is provided or if the tag is missing (if `optional == False`). 
""" if node is None: - raise MissingNodeError(f"No node provided to look for {tag}") - tag_node = node.find(tag) - - if tag_node is not None: - return tag_node.text - if optional: - return None - raise MissingNodeError(f"No node found for tag {tag}") + raise MissingNodeError(f"No node provided to look for '{tag}'") + tag_text = node.findtext(tag) + if not optional and (tag_text is None): + raise MissingNodeError(f"No node found for tag '{tag}'") + return tag_text def prepare_genome_metadata( diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index a596e9755..6ec8a890c 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -23,9 +23,12 @@ from pathlib import Path from typing import Any, Callable, ContextManager, Dict, Optional from unittest.mock import Mock, patch +from xml.etree import ElementTree +from xml.etree.ElementTree import Element from deepdiff import DeepDiff import pytest +import requests from ensembl.io.genomio.genome_metadata import prepare @@ -145,3 +148,109 @@ def test_add_genebuild_metadata( prepare.add_genebuild_metadata(genome_metadata) assert genome_metadata["genebuild"]["start_date"] == output assert genome_metadata["genebuild"]["version"] == output + + +@pytest.mark.dependency(name="test_get_node_text") +@pytest.mark.parametrize( + "xml_file, tag, optional, output, expectation", + [ + pytest.param( + "", + "tag", + False, + "", + pytest.raises(prepare.MissingNodeError, match="No node provided to look for 'tag'"), + id="No node provided", + ), + pytest.param("default_taxonomy.xml", "TAXON_ID", False, "34611", does_not_raise(), id="Tag present"), + pytest.param("default_taxonomy.xml", "tag", True, None, does_not_raise(), id="Missing optional tag"), + pytest.param( + "default_taxonomy.xml", + "tag", + False, + "", + pytest.raises(prepare.MissingNodeError, match="No node found for tag 'tag'"), + id="Missing mandatory tag", 
+ ), + ], +) +def test_get_node_text( + data_dir: Path, + xml_file: str, + tag: str, + optional: bool, + output: Optional[str], + expectation: ContextManager, +) -> None: + """Tests the `prepare._get_node_text()` method. + + Args: + data_dir: Module's test data directory fixture. + xml_file: XML file with assembly's taxonomy data. + tag: Tag to fetch within the node. + optional: Do not raise an exception if the tag does not exist. + output: Expected field value returned. + expectation: Context manager for the expected exception (if any). + """ + if xml_file: + tree = ElementTree.parse(data_dir / xml_file) + node = tree.find(".//TAXON") + else: + node = None + with expectation: + result = prepare._get_node_text(node, tag, optional) + assert result == output + + +@pytest.mark.dependency(name="test_get_taxonomy_from_accession", depends=["test_get_node_text"]) +@patch("requests.Response") +@patch("requests.get") +@pytest.mark.parametrize( + "xml_file, output, expectation", + [ + pytest.param( + "default_taxonomy.xml", + {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus"}, + does_not_raise(), + id="Basic taxonomy data", + ), + pytest.param( + "strain_taxonomy.xml", + {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus", "strain": "Klein Grass"}, + does_not_raise(), + id="Taxonomy with strain data", + ), + pytest.param( + "no_taxonomy.xml", + {}, + pytest.raises(prepare.MissingNodeError, match="Cannot find the TAXON node"), + id="Missing TAXON node", + ), + ], +) +def test_get_taxonomy_from_accession( + mock_requests_get: Mock, + mock_response: Mock, + data_dir: Path, + xml_file: str, + output: Dict[str, Any], + expectation: ContextManager, +): + """Tests the `prepare.get_taxonomy_from_accession()` method. + + Args: + mock_requests_get: A mock of `requests.get()` function. + mock_response: A mock of `requests.Response` class. + data_dir: Module's test data directory fixture. + xml_file: XML file with assembly's taxonomy data. 
+ output: Expected taxonomy data returned. + expectation: Context manager for the expected exception (if any). + """ + xml_path = data_dir / xml_file + with xml_path.open() as xml: + text = "".join(xml.readlines()) + mock_response.text = text + mock_requests_get.return_value = mock_response + with expectation: + result = prepare.get_taxonomy_from_accession("GCA_013436015.2") + assert not DeepDiff(result, output) diff --git a/src/python/tests/genome_metadata/test_prepare/default_taxonomy.xml b/src/python/tests/genome_metadata/test_prepare/default_taxonomy.xml new file mode 100644 index 000000000..2e42c43f6 --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/default_taxonomy.xml @@ -0,0 +1,8 @@ + + + + 34611 + Rhipicephalus annulatus + + + \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/no_taxonomy.xml b/src/python/tests/genome_metadata/test_prepare/no_taxonomy.xml new file mode 100644 index 000000000..9f79bc472 --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/no_taxonomy.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/strain_taxonomy.xml b/src/python/tests/genome_metadata/test_prepare/strain_taxonomy.xml new file mode 100644 index 000000000..44b48cc0b --- /dev/null +++ b/src/python/tests/genome_metadata/test_prepare/strain_taxonomy.xml @@ -0,0 +1,9 @@ + + + + 34611 + Rhipicephalus annulatus + Klein Grass + + + \ No newline at end of file From 138df36a06337ecd05f6e4350e0f35286509c430 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 12:04:08 +0000 Subject: [PATCH 08/16] minor updates and extend coverage --- src/python/tests/genome_metadata/test_prepare.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index 6ec8a890c..f5decdfdc 100644 --- 
a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -96,8 +96,8 @@ def test_add_provider( output: Expected elements present in the updated genome metadata. expectation: Context manager for the expected exception (if any). """ + genome_metadata = json_data(genome_file) with expectation: - genome_metadata = json_data(genome_file) prepare.add_provider(genome_metadata, gff3_file) for section, metadata in output.items(): for key, value in metadata.items(): @@ -109,6 +109,7 @@ def test_add_provider( [ ("genbank_genome.json", 2), ("updated_genome.json", 1), + ("cncb_genome.json", 0), ], ) def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, output: int) -> None: @@ -121,7 +122,7 @@ def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, """ genome_metadata = json_data(genome_file) prepare.add_assembly_version(genome_metadata) - assert genome_metadata["assembly"]["version"] == output + assert genome_metadata["assembly"].get("version", 0) == output @patch("datetime.date") @@ -138,6 +139,7 @@ def test_add_genebuild_metadata( """Tests the `prepare.add_genebuild_metadata()` method. Args: + mock_date: A mock of `datetime.date` class. json_data: JSON test file parsing fixture. genome_file: Genome metadata JSON file. output: Expected date for genebuild's `start_date` and `version` in the updated genome metadata. 
From ea53ffe87d5e18082ecb191e598dd759c279fa6d Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 14:13:26 +0000 Subject: [PATCH 09/16] build correct URLs and update unit test --- .../ensembl/io/genomio/genome_metadata/prepare.py | 4 +++- src/python/tests/genome_metadata/test_prepare.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 623e1501b..31352a736 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -182,7 +182,9 @@ def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_ """ # Use the GenBank accession without version gb_accession = accession.replace("GCF", "GCA").split(".")[0] - response = requests.get(f"{base_api_url}/{gb_accession}", timeout=60) + if not base_api_url.endswith("/"): + base_api_url += "/" + response = requests.get(f"{base_api_url}{gb_accession}", timeout=60) entry = ElementTree.fromstring(response.text) taxon_node = entry.find(".//TAXON") if taxon_node is None: diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index f5decdfdc..b64d12806 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -208,21 +208,27 @@ def test_get_node_text( @patch("requests.Response") @patch("requests.get") @pytest.mark.parametrize( - "xml_file, output, expectation", + "accession, base_api_url, xml_file, output, expectation", [ pytest.param( + "GCF_013436015.2", + prepare.DEFAULT_API_URL, "default_taxonomy.xml", {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus"}, does_not_raise(), id="Basic taxonomy data", ), pytest.param( + "GCA_013436015.2", + "/", "strain_taxonomy.xml", {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus", 
"strain": "Klein Grass"}, does_not_raise(), id="Taxonomy with strain data", ), pytest.param( + "GCA_013436015.2", + "", "no_taxonomy.xml", {}, pytest.raises(prepare.MissingNodeError, match="Cannot find the TAXON node"), @@ -234,6 +240,8 @@ def test_get_taxonomy_from_accession( mock_requests_get: Mock, mock_response: Mock, data_dir: Path, + accession: str, + base_api_url: str, xml_file: str, output: Dict[str, Any], expectation: ContextManager, @@ -244,6 +252,8 @@ def test_get_taxonomy_from_accession( mock_requests_get: A mock of `requests.get()` function. mock_response: A mock of `requests.Response` class. data_dir: Module's test data directory fixture. + accession: INSDC accession ID. + base_api_url: Base API URL to fetch the taxonomy data from. xml_file: XML file with assembly's taxonomy data. output: Expected taxonomy data returned. expectation: Context manager for the expected exception (if any). @@ -254,5 +264,5 @@ def test_get_taxonomy_from_accession( mock_response.text = text mock_requests_get.return_value = mock_response with expectation: - result = prepare.get_taxonomy_from_accession("GCA_013436015.2") + result = prepare.get_taxonomy_from_accession(accession, base_api_url) assert not DeepDiff(result, output) From 9b09b6ed64e3118f9cfd010a20a362ce381973f3 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 15:08:09 +0000 Subject: [PATCH 10/16] update docstrings and code cleanup --- .../io/genomio/genome_metadata/prepare.py | 25 ++++++++++--------- .../tests/genome_metadata/test_prepare.py | 6 ++--- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 31352a736..468bb4b77 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -78,8 +78,8 @@ class MetadataError(Exception): def add_provider(genome_metadata: Dict, gff3_file: 
Optional[PathLike] = None) -> None: """Updates the genome metadata adding provider information for assembly and gene models. - Assembly provider metadata will only be added if it is missing, i.e. neither `provider_name` or - `provider_url` are present. The gene model metadata will only be added if `gff3_file` is provided. + Assembly provider metadata will only be added if it is missing, i.e. neither `"provider_name"` nor + `"provider_url"` is present. The gene model metadata will only be added if `gff3_file` is provided. Args: genome_data: Genome information of assembly, accession and annotation. @@ -129,7 +129,7 @@ def add_assembly_version(genome_data: Dict) -> None: def add_genebuild_metadata(genome_data: Dict) -> None: """Adds genebuild metadata to genome information if not present already. - The default convention is to use the current date as `version` and `start_date`. + The default convention is to use the current date as `"version"` and `"start_date"`. Args: genome_data: Genome information of assembly, accession and annotation. @@ -143,20 +143,21 @@ def add_genebuild_metadata(genome_data: Dict) -> None: genome_data["genebuild"] = genebuild -def add_species_metadata(genome_data: Dict, base_api_url: str = DEFAULT_API_URL) -> None: - """Adds missing species metadata based on the genome's accession. +def add_species_metadata(genome_metadata: Dict, base_api_url: str = DEFAULT_API_URL) -> None: + """Adds missing species metadata from its taxonomy based on the genome's accession. - The ``taxonomy_id``, ``strain`` and ``scientific_name`` will be fetched from the taxonomy information - linked to the given accession. + If `"taxonomy_id"` is already present in the species metadata, nothing is added. The `"taxonomy_id"`, + `"scientific_name"` and `"strain"` will be fetched from the taxonomy information linked to the given + accession. Args: - genome_data: Genome information of assembly, accession and annotation.
+ genome_metadata: Genome information of assembly, accession and annotation. base_api_url: Base API URL to fetch the taxonomy data from. """ - species = genome_data["species"] + species = genome_metadata.setdefault("species", {}) if not "taxonomy_id" in species: - accession = genome_data["assembly"]["accession"] + accession = genome_metadata["assembly"]["accession"] taxonomy = get_taxonomy_from_accession(accession, base_api_url) species["taxonomy_id"] = taxonomy["taxon_id"] if (not "strain" in species) and ("strain" in taxonomy): @@ -173,11 +174,11 @@ def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_ base_api_url: Base API URL to fetch the taxonomy data from. Returns: - Dictionary with key-value pairs for `taxon_id` and `scientific_name`. `strain` will also be + Dictionary with key-value pairs for `"taxon_id"` and `"scientific_name"`. `"strain"` will also be included if it is present in the fetched taxonomy data. Raises: - MissingNodeError: If `TAXON` node is missing in the taxonomy data fetched. + MissingNodeError: If `"TAXON"` node is missing in the taxonomy data fetched. """ # Use the GenBank accession without version diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index b64d12806..d5ca68a93 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -24,11 +24,9 @@ from typing import Any, Callable, ContextManager, Dict, Optional from unittest.mock import Mock, patch from xml.etree import ElementTree -from xml.etree.ElementTree import Element from deepdiff import DeepDiff import pytest -import requests from ensembl.io.genomio.genome_metadata import prepare @@ -142,7 +140,7 @@ def test_add_genebuild_metadata( mock_date: A mock of `datetime.date` class. json_data: JSON test file parsing fixture. genome_file: Genome metadata JSON file. 
- output: Expected date for genebuild's `start_date` and `version` in the updated genome metadata. + output: Expected date for genebuild's `"start_date"` and `"version"` in the updated genome metadata. """ mock_date.today.return_value = mock_date mock_date.isoformat.return_value = output @@ -204,7 +202,7 @@ def test_get_node_text( assert result == output -@pytest.mark.dependency(name="test_get_taxonomy_from_accession", depends=["test_get_node_text"]) +@pytest.mark.dependency(depends=["test_get_node_text"]) @patch("requests.Response") @patch("requests.get") @pytest.mark.parametrize( From 479c994c939c756dab001055893522e390541a17 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 15:08:43 +0000 Subject: [PATCH 11/16] use dict.setdefault() instead of dict.get() --- src/python/ensembl/io/genomio/genome_metadata/prepare.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 468bb4b77..54c6e0377 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -105,11 +105,10 @@ def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) -> # Add annotation provider if there are gene models if gff3_file: - annotation = genome_metadata.get("annotation", {}) + annotation = genome_metadata.setdefault("annotation", {}) if ("provider_name" not in annotation) and ("provider_url" not in annotation): annotation["provider_name"] = provider["annotation"]["provider_name"] annotation["provider_url"] = f'{provider["annotation"]["provider_url"]}/{accession}' - genome_metadata["annotation"] = annotation def add_assembly_version(genome_data: Dict) -> None: @@ -134,13 +133,12 @@ def add_genebuild_metadata(genome_data: Dict) -> None: Args: genome_data: Genome information of assembly, accession and annotation. 
""" - genebuild = genome_data.get("genebuild", {}) + genebuild = genome_data.setdefault("genebuild", {}) current_date = datetime.date.today().isoformat() if not "version" in genebuild: genebuild["version"] = current_date if not "start_date" in genebuild: genebuild["start_date"] = current_date - genome_data["genebuild"] = genebuild def add_species_metadata(genome_metadata: Dict, base_api_url: str = DEFAULT_API_URL) -> None: From d1844fa292b36df3855f26fe0028efbeb18552da Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 15:09:27 +0000 Subject: [PATCH 12/16] update prepare.add_species_metadata() and add unit test --- .../tests/genome_metadata/test_prepare.py | 47 +++++++++++++++++++ .../test_prepare/cncb_genome.json | 3 -- .../test_prepare/genbank_genome.json | 8 ---- .../test_prepare/refseq_genome.json | 7 +-- .../test_prepare/updated_genome.json | 9 ++-- 5 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index d5ca68a93..ec1fff8d9 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -264,3 +264,50 @@ def test_get_taxonomy_from_accession( with expectation: result = prepare.get_taxonomy_from_accession(accession, base_api_url) assert not DeepDiff(result, output) + + +@patch("ensembl.io.genomio.genome_metadata.prepare.get_taxonomy_from_accession") +@pytest.mark.parametrize( + "genome_file, taxonomy, output", + [ + pytest.param( + "genbank_genome.json", + {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus"}, + {"taxonomy_id": 34611, "scientific_name": "Rhipicephalus annulatus"}, + id="Add taxonomy information", + ), + pytest.param( + "refseq_genome.json", + {"taxon_id": 34611, "scientific_name": "Rhipicephalus annulatus", "strain": "Klein Grass"}, + {"taxonomy_id": 34611, "scientific_name": "Rhipicephalus annulatus", "strain": "Klein Grass"}, + id="Add 
strain taxonomy information", + ), + pytest.param( + "updated_genome.json", + {"taxon_id": 34611}, + {"taxonomy_id": 10092, "scientific_name": "Mus musculus", "strain": "domesticus"}, + id="Nothing to add", + ), + ], +) +def test_add_species_metadata( + mock_get_taxonomy_data: Mock, + json_data: Path, + genome_file: str, + taxonomy: Dict[str, Any], + output: Dict[str, Any], +): + """Tests the `prepare.add_species_metadata()` method. + + Args: + mock_get_taxonomy_data: A mock of + `ensembl.io.genomio.genome_metadata.prepare.get_taxonomy_from_accession()` function. + json_data: JSON test file parsing fixture. + genome_file: Genome metadata JSON file. + taxonomy: Taxonomy metadata to add. + output: Expected `"species"` genome metadata content. + """ + mock_get_taxonomy_data.return_value = taxonomy + genome_metadata = json_data(genome_file) + prepare.add_species_metadata(genome_metadata) + assert not DeepDiff(genome_metadata["species"], output) diff --git a/src/python/tests/genome_metadata/test_prepare/cncb_genome.json b/src/python/tests/genome_metadata/test_prepare/cncb_genome.json index 673848576..07bf83108 100644 --- a/src/python/tests/genome_metadata/test_prepare/cncb_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/cncb_genome.json @@ -1,8 +1,5 @@ { "assembly": { "accession": "GWHAMMH00000000" - }, - "species": { - "scientific_name": "Ixodes persulcatus" } } \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/genbank_genome.json b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json index 7665c52a5..d7ec38df6 100644 --- a/src/python/tests/genome_metadata/test_prepare/genbank_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/genbank_genome.json @@ -1,13 +1,5 @@ { - "BRC4": { - "component": "VectorBase", - "organism_abbrev": "rannulKG" - }, "assembly": { "accession": "GCA_013436015.2" - }, - "species": { - "scientific_name": "Rhipicephalus annulatus", - "strain": "Klein Grass" } } \ No 
newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/refseq_genome.json b/src/python/tests/genome_metadata/test_prepare/refseq_genome.json index ce490dae4..7470e8b67 100644 --- a/src/python/tests/genome_metadata/test_prepare/refseq_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/refseq_genome.json @@ -1,13 +1,8 @@ { - "BRC4": { - "component": "AmoebaDB", - "organism_abbrev": "ddisAX4" - }, "assembly": { "accession": "GCF_000004695.1" }, "species": { - "scientific_name": "Dictyostelium discoideum AX4", - "strain": "AX4" + "scientific_name": "Rhipicephalus annulatus" } } \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_prepare/updated_genome.json b/src/python/tests/genome_metadata/test_prepare/updated_genome.json index 42d64dff6..b709a0c48 100644 --- a/src/python/tests/genome_metadata/test_prepare/updated_genome.json +++ b/src/python/tests/genome_metadata/test_prepare/updated_genome.json @@ -1,8 +1,4 @@ { - "BRC4": { - "component": "VectorBase", - "organism_abbrev": "rannulKG" - }, "annotation": { "provider_name": "GenBank" }, @@ -16,7 +12,8 @@ "version": "01-2021" }, "species": { - "scientific_name": "Rhipicephalus annulatus", - "strain": "Klein Grass" + "scientific_name": "Mus musculus", + "strain": "domesticus", + "taxonomy_id": 10092 } } \ No newline at end of file From 6d0642d7a09f9f9172e0918cf35887151d7e5f81 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 15:47:37 +0000 Subject: [PATCH 13/16] add unit test for prepare.prepare_genome_metadata() --- .../tests/genome_metadata/test_prepare.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index ec1fff8d9..2249e0bd7 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -311,3 +311,34 @@ def test_add_species_metadata( 
genome_metadata = json_data(genome_file) prepare.add_species_metadata(genome_metadata) assert not DeepDiff(genome_metadata["species"], output) + + +@patch("ensembl.io.genomio.genome_metadata.prepare.add_species_metadata") +@patch("ensembl.io.genomio.genome_metadata.prepare.add_genebuild_metadata") +@patch("ensembl.io.genomio.genome_metadata.prepare.add_assembly_version") +@patch("ensembl.io.genomio.genome_metadata.prepare.add_provider") +def test_prepare_genome_metadata( + mock_add_provider: Mock, + mock_add_assembly_version: Mock, + mock_add_genebuild_metadata: Mock, + mock_add_species_metadata: Mock, + tmp_path: Path, + data_dir: Path, + assert_files: Callable[[Path, Path], None], +): + """Tests the `prepare.prepare_genome_metadata()` method. + + Args: + mock_*: A mock of `ensembl.io.genomio.genome_metadata.prepare.*` functions. + tmp_path: Test's unique temporary directory fixture. + data_dir: Module's test data directory fixture. + assert_files: File diff assertion fixture. + """ + input_file = data_dir / "updated_genome.json" + output_file = tmp_path / "output.json" + prepare.prepare_genome_metadata(input_file, output_file) + mock_add_provider.assert_called_once() + mock_add_assembly_version.assert_called_once() + mock_add_genebuild_metadata.assert_called_once() + mock_add_species_metadata.assert_called_once() + assert_files(input_file, output_file) From 32f9b48bdd99acbe31da1f7b7dda41dcd0280938 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 7 Mar 2024 15:54:04 +0000 Subject: [PATCH 14/16] make pylint and mypy happy --- .../ensembl/io/genomio/genome_metadata/prepare.py | 3 ++- src/python/tests/genome_metadata/test_prepare.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index 54c6e0377..4b6aa2b5d 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ 
b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -193,7 +193,8 @@ def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_ scientific_name = _get_node_text(taxon_node, "SCIENTIFIC_NAME") strain = _get_node_text(taxon_node, "STRAIN", optional=True) taxonomy = { - "taxon_id": int(taxon_id), + # Ignore arg-type check in the following line since taxon_id cannot be None + "taxon_id": int(taxon_id), # type: ignore[arg-type] "scientific_name": scientific_name, } if strain: diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index 2249e0bd7..ba94c1e1a 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -49,7 +49,7 @@ ), pytest.param( "refseq_genome.json", - "fake.gff3", + Path("fake.gff3"), { "assembly": { "provider_name": "RefSeq", @@ -65,7 +65,7 @@ ), pytest.param( "updated_genome.json", - "fake.gff3", + Path("fake.gff3"), { "assembly": {"provider_name": "GenBank", "provider_url": None}, "annotation": {"provider_name": "GenBank", "provider_url": None}, @@ -81,7 +81,7 @@ def test_add_provider( json_data: Callable[[str], Any], genome_file: str, - gff3_file: Optional[str], + gff3_file: Optional[Path], output: Dict[str, Dict[str, Optional[str]]], expectation: ContextManager, ) -> None: @@ -198,7 +198,7 @@ def test_get_node_text( else: node = None with expectation: - result = prepare._get_node_text(node, tag, optional) + result = prepare._get_node_text(node, tag, optional) # pylint: disable=protected-access assert result == output @@ -292,7 +292,7 @@ def test_get_taxonomy_from_accession( ) def test_add_species_metadata( mock_get_taxonomy_data: Mock, - json_data: Path, + json_data: Callable[[str], Any], genome_file: str, taxonomy: Dict[str, Any], output: Dict[str, Any], From f1c463aa9a49b8ef57c9458ac60d98d3088f4ccd Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 8 Mar 2024 11:03:02 +0000 Subject: 
[PATCH 15/16] typo: the test is for filter_genome_meta() --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index f6f0c8082..75c4a4494 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -126,7 +126,7 @@ def test_check_genebuild_version( ], ) def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: - """Tests the `dump.check_genebuild_version()` method. + """Tests the `dump.filter_genome_meta()` method. Args: genome_metadata: Nested genome metadata key values. From 8a9f621f1e0cb122e3276d3560c8223c650a37d6 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 8 Mar 2024 16:11:12 +0000 Subject: [PATCH 16/16] add better explanation of test parametrisations --- src/python/tests/genome_metadata/test_prepare.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index ba94c1e1a..5879306e6 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -105,9 +105,9 @@ def test_add_provider( @pytest.mark.parametrize( "genome_file, output", [ - ("genbank_genome.json", 2), - ("updated_genome.json", 1), - ("cncb_genome.json", 0), + pytest.param("genbank_genome.json", 2, id="Added assembly version"), + pytest.param("updated_genome.json", 1, id="Version found, nothing to add"), + pytest.param("cncb_genome.json", 0, id="No version available, nothing to add"), ], ) def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, output: int) -> None: @@ -127,8 +127,8 @@ def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, @pytest.mark.parametrize( "genome_file, output", [ - 
("genbank_genome.json", "03-2024"), - ("updated_genome.json", "01-2021"), + pytest.param("genbank_genome.json", "03-2024", id="Added '03-2024' as genebuild metadata"), + pytest.param("updated_genome.json", "01-2021", id="Found '01-2021', nothing to add"), ], ) def test_add_genebuild_metadata(