Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move set_id_prefix to the stable id allocator #299

Merged
merged 1 commit into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/python/ensembl/io/genomio/gff3/id_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from dataclasses import dataclass, field
import logging
import re
from typing import List, Set
from typing import Dict, List, Set

from Bio.SeqFeature import SeqFeature

Expand All @@ -40,6 +40,16 @@ class StableIDAllocator:
prefix: str = "TMP_"
_loaded_ids: Set = field(default_factory=set)

def set_prefix(self, genome: Dict) -> None:
    """Derive the stable ID prefix from genome metadata and store it in `self.prefix`.

    Args:
        genome: Genome metadata. The organism abbreviation is looked up under
            `genome["BRC4"]["organism_abbrev"]`.

    The prefix becomes `TMP_<organism_abbrev>_` when the abbreviation is found,
    and `TMP_PREFIX_` when either key is missing.
    """
    try:
        abbrev = genome["BRC4"]["organism_abbrev"]
    except KeyError:
        # Either "BRC4" or "organism_abbrev" is absent: use the generic prefix.
        self.prefix = "TMP_PREFIX_"
        return
    self.prefix = "TMP_" + abbrev + "_"

def generate_gene_id(self) -> str:
"""Returns a new unique gene stable_id with a prefix.

Expand Down
27 changes: 12 additions & 15 deletions src/python/ensembl/io/genomio/gff3/simplifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,30 +72,27 @@ class GFFSimplifier:
skip_unrecognized = True
gene_cds_skip_others = False
allow_pseudogene_with_CDS = False
exclude_seq_regions: List = []
fail_types: Dict[str, int] = {}
stable_ids = StableIDAllocator()

def __init__(self, genome_path: Optional[PathLike] = None):
# Load biotypes
biotypes_json = files(ensembl.io.genomio.data.gff3) / "biotypes.json"
self._biotypes = get_json(biotypes_json)
self.records = Records()
self.annotations = FunctionalAnnotations()

# Load genome metadata
self.genome = {}
if genome_path:
with Path(genome_path).open("r") as genome_fh:
self.genome = json.load(genome_fh)
self._set_id_prefix()

def _set_id_prefix(self) -> None:
    """Set the prefix of `self.stable_ids` from the loaded genome metadata.

    Reads `self.genome["BRC4"]["organism_abbrev"]`; when found, the prefix is
    `TMP_<organism_abbrev>_`. When either key is missing, it is `TMP_PREFIX_`.
    """
    try:
        abbrev = self.genome["BRC4"]["organism_abbrev"]
    except KeyError:
        # No organism abbreviation in the metadata: use the generic prefix.
        self.stable_ids.prefix = "TMP_PREFIX_"
        return
    self.stable_ids.prefix = "TMP_" + abbrev + "_"
# Other preparations
self.stable_ids = StableIDAllocator()
self.stable_ids.set_prefix(self.genome)
self.exclude_seq_regions: List = []
self.fail_types: Dict[str, int] = {}

# Init the actual data we will store
self.records = Records()
self.annotations = FunctionalAnnotations()

def simpler_gff3(self, in_gff_path: PathLike) -> None:
"""Loads a GFF3 from INSDC and rewrites it in a simpler version, whilst also writing a
Expand Down
16 changes: 15 additions & 1 deletion src/python/tests/gff3/test_id_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from difflib import unified_diff
import filecmp
from pathlib import Path
from typing import ContextManager, List, Optional
from typing import ContextManager, Dict, List, Optional

import pytest

Expand Down Expand Up @@ -55,6 +55,20 @@ def _show_diff(result_path: Path, expected_path: Path) -> str:
return "".join(diff)


_DEFAULT_PREFIX_CASE = pytest.param({}, "TMP_PREFIX_", id="Default prefix")
_GENOME_PREFIX_CASE = pytest.param(
    {"BRC4": {"organism_abbrev": "LOREM"}}, "TMP_LOREM_", id="Prefix from genome meta"
)


@pytest.mark.parametrize(
    ("genome", "expected_prefix"),
    [_DEFAULT_PREFIX_CASE, _GENOME_PREFIX_CASE],
)
def test_set_prefix(genome: Dict, expected_prefix: str) -> None:
    """Check that `StableIDAllocator.set_prefix()` sets the expected prefix.

    Args:
        genome: Genome metadata passed to `set_prefix()`.
        expected_prefix: Value `prefix` should hold after the call.
    """
    allocator = StableIDAllocator()
    allocator.set_prefix(genome)
    assert allocator.prefix == expected_prefix


@pytest.mark.parametrize(
"prefix, expected_ids",
[
Expand Down
Loading