Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disha/update func anno #305

Merged
merged 27 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
50ffbc1
adding repeated xrefs as an array
Dishalodha Feb 29, 2024
cbb0410
adding xrefs to the json file
Dishalodha Feb 29, 2024
0c47f0a
add provider name for xrefs
Dishalodha Feb 29, 2024
d92c61d
only adding xrefs to transcripts and translation
Dishalodha Mar 4, 2024
a39cf6f
ran black and mypy
Dishalodha Mar 4, 2024
3b14311
remove the print statements
Dishalodha Mar 4, 2024
9954b93
correcting typo
Dishalodha Mar 4, 2024
90d474e
Merge branch 'main' into disha/update_func_anno
Dishalodha Mar 4, 2024
1112c70
remove make_missing_stable_ids parameter from extract_annotation
Dishalodha Mar 5, 2024
d4a2a7c
making sure the pipeline doesnt fail if there is no provider_name
Dishalodha Mar 5, 2024
97ade73
adding dbxref and synonyms to functional anno file'
Dishalodha Mar 5, 2024
2b28612
change GenBank dbname to RefSeq
Dishalodha Mar 5, 2024
def0b5b
adding a list of ignored xrefs
Dishalodha Mar 5, 2024
98045a3
Simplify, use lowercase, no allowed needed
MatBarba Mar 6, 2024
eed153d
Optional -> default None
MatBarba Mar 6, 2024
913e321
catch keyerror
MatBarba Mar 6, 2024
21e5e44
Do not keep synonyms if in xrefs already
MatBarba Mar 6, 2024
b7be8de
Use lowercase to compare
MatBarba Mar 6, 2024
939daa6
Merge pull request #309 from Ensembl/mbarba/disha_func_tweaks
Dishalodha Mar 6, 2024
797e64d
removed spaces, suggestion from code review
Dishalodha Mar 5, 2024
342c667
Update spaces scripts/load_fann.pl
Dishalodha Mar 6, 2024
ae6600f
Update spaces scripts/load_fann.pl
Dishalodha Mar 6, 2024
13020cf
Update space
Dishalodha Mar 6, 2024
74203a6
remove all the trailing white spaces
Dishalodha Mar 6, 2024
c5ccfa9
adding type hints
Dishalodha Mar 7, 2024
17fae7c
format code, unwanted space
Dishalodha Mar 7, 2024
bf4c5de
Update from code review
Dishalodha Mar 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 36 additions & 44 deletions scripts/load_fann.pl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ =head1 EXAMPLE
"Uniprot/SPTREMBL"
);


&GetOptions(
'host=s' => \$host,
'port=s' => \$port,
Expand Down Expand Up @@ -129,7 +128,6 @@ =head1 EXAMPLE
-dbname => $dbname,
);


my $aa = $dba->get_adaptor('Analysis');
my $analysis = $aa->fetch_by_logic_name($analysis_name);
if (! defined $analysis) {
Expand Down Expand Up @@ -172,7 +170,6 @@ sub get_adaptor {
$type = 'gene';
}
my $lc_type = lc($type);

my $adaptor = get_adaptor($dba, $type);
if (not defined $adaptor) {
warn qq/can't get adaptor for "$type" (id: "$id"). skipping...\n/;
Expand Down Expand Up @@ -204,13 +201,11 @@ sub get_adaptor {
warn qq/can't get object for "$id" (type: "$type"). skipping...\n/;
next;
}

# gene and transcript description
if ($lc_type eq "gene" or $lc_type eq 'transcript') {
$obj->description($it->{description}) if (exists $it->{description} && $it->{description} !~ m/^\s*$/);
$do_update = 1;
}

# gene and transcript versions
if ($lc_type eq "gene" or $lc_type eq "transcript") {
my $version = exists $it->{version}
Expand Down Expand Up @@ -250,13 +245,10 @@ sub get_adaptor {

# Remove duplications
@xrefs = unique_xrefs(@xrefs);

# prepend missing GO and SO prefixes
prepend_GO_SO(\@xrefs);

# Do not load xrefs generated by Ensembl
@xrefs = remove_ensembl_xrefs(\@xrefs, $extdb_map, $lc_type) if $skip_ensembl_xrefs;

my $already_used = 0;
my $stored_xref = undef;
for my $xref (@xrefs) {
Expand Down Expand Up @@ -291,7 +283,6 @@ sub get_adaptor {
$xref->{info_type},
$xref->{info_text}
);

# update 'display_xref' only for the first time or for the $set_display_xref_4
if ( defined $display_xref && ( $display_xref eq $xref->{id} || $xref->{display_id} && $xref->{display_id} eq $display_xref ) ) {
if ($lc_type eq "gene" || $lc_type eq "transcript") {
Expand All @@ -317,9 +308,7 @@ sub get_adaptor {
$dba->dbc && $dba->dbc->disconnect_if_idle();
close($fh);


# utils

sub unique_xrefs {
my (@xrefs) = @_;

Expand All @@ -343,7 +332,6 @@ sub prepend_GO_SO {
}
}


sub remove_ensembl_xrefs {
# Do not load xrefs generated by Ensembl
my ($xrefs, $extdb_map, $lc_type) = @_;
Expand All @@ -354,12 +342,12 @@ sub remove_ensembl_xrefs {
if ($dbname && !exists $ensembl_xrefs{ $dbname }) {
push @new_xrefs, $xref;
}

}

return @new_xrefs;
}


sub norm_str {
my ($raw) = @_;
return $raw if (!defined $raw);
Expand All @@ -371,6 +359,18 @@ sub norm_str {
return $out;
}

# Register a mapping rule for an external db name, skipping exact duplicates.
#
# Arguments:
#   $db_map  - hash ref keyed by feature name, then by source db name; each leaf
#              is an array ref of rules of the form { val => ..., pat => ... }
#   $feature - first-level key
#   $from    - second-level key (the source db name)
#   $val     - value stored in the rule
#   $pat     - optional pattern stored in the rule; may be undef
#
# The same ($feature, $from) pair can hold several rules as long as they differ
# by value or by pattern. A rule is only appended when no existing rule has the
# same value and the same pattern (two undefined patterns count as the same).
#
# NOTE(review): $db_map is modified in place, so the caller must pass an
# existing hash ref; an undef argument would only populate a local copy.
sub add_to_db_map {
    my ($db_map, $feature, $from, $val, $pat) = @_;

    # Autovivifies $db_map->{$feature} and creates the rule list on first use
    my $rules = $db_map->{$feature}->{$from} //= [];

    # Compare the fields directly rather than through a concatenated string key,
    # so that an undefined pattern can never be confused with a literal one
    for my $rule (@$rules) {
        next if $rule->{val} ne $val;

        my $known_pat = $rule->{pat};
        return if !defined $known_pat && !defined $pat;
        return if defined $known_pat && defined $pat && $known_pat eq $pat;
    }

    push @$rules, { val => $val, pat => $pat };
    return;
}

sub load_external_db_map {
my($path) = @_;

Expand All @@ -392,31 +392,27 @@ sub load_external_db_map {
$from_name = uc($from_name);

if ($to_name eq '_IGNORE_') {
$db_map->{IGNORE}->{$feature}->{$from_name} = { val => 1, pat => $pat };
$db_map->{IGNORE}->{_OTHER_}->{$from_name} = { val => 1, pat => undef } if ($feature ne "_ANY_");
add_to_db_map($db_map->{IGNORE}, $feature, $from_name, 1, $pat);
add_to_db_map($db_map->{IGNORE}, '_OTHER_', $from_name, 1, undef) if ($feature ne "_ANY_");
} else {
$db_map->{VALID}->{$feature}->{$from_name} = { val => $to_name, pat => $pat };
$db_map->{VALID}->{_OTHER_}->{$from_name} = { val => 1, pat => undef } if ($feature ne "_ANY_");
add_to_db_map($db_map->{VALID}, $feature, $from_name, $to_name, $pat);
add_to_db_map($db_map->{VALID}, '_OTHER_', $from_name, 1, undef) if ($feature ne "_ANY_");
}
}

}
return $db_map;
}

sub db_name_for_feature {
my ($db_map, $lc_type, $xref, $skip_unknown_xref_source) = @_;

return if (!$xref || !$xref->{dbname});

my $raw_dbname = $xref->{dbname};
my $xref_id = $xref->{id};

my $res = $skip_unknown_xref_source ? undef : $raw_dbname;

return $res if !$db_map;

my $feature = uc($lc_type);
my $from_name = uc($raw_dbname);

my $ignore_map = $db_map->{IGNORE};
my $ignore_feature = exists $ignore_map->{$feature} && $ignore_map->{$feature}->{$from_name} || undef;
my $ignore_any = exists $ignore_map->{_ANY_} && $ignore_map->{_ANY_}->{$from_name} || undef;
Expand All @@ -428,27 +424,29 @@ sub db_name_for_feature {
my $valid_other = exists $valid_map->{_OTHER_} && $valid_map->{_OTHER_}->{$from_name} || undef;

# check if there's a specific ignore rule
if ($ignore_feature) {
my $pat = $ignore_feature->{pat};
return if (!defined $pat);
return if ($xref_id =~ m/$pat/);
if ($ignore_feature && @$ignore_feature) {
for my $case (@$ignore_feature) {
my $pat = $case->{pat};
return if (!defined $pat);
return if ($xref_id =~ m/$pat/);
}
}

# check if there's a specific valid rule
if ($valid_feature) {
my $pat = $valid_feature->{pat};
return $valid_feature->{val} if (!defined $pat);
return $valid_feature->{val} if ($xref_id =~ m/$pat/);
if ($valid_feature && @$valid_feature) {
for my $case (@$valid_feature) {
my $pat = $case->{pat};
return $case->{val} if (!defined $pat);
return $case->{val} if ($xref_id =~ m/$pat/);
}
}

# check if mentioned anywhere else and no global validness; no pattern checked
return if ($valid_other and !$valid_any);

return if (($valid_other && @$valid_other) && !($valid_any && @$valid_any));
# check global ignore
return if ($ignore_any);
return if ($ignore_any && @$ignore_any);
# then check global validness
return $valid_any->{val} if ($valid_any);

if ($valid_any && @$valid_any) {
return $valid_any->[0]->{val};
}
# return raw name or undef based on $skip_unknown_xref_source flag
return $res;
}
Expand All @@ -463,7 +461,6 @@ sub get_syns {
return ($default_ones[0], \@syns)
}


sub array_ref {
my ($o) = @_;

Expand All @@ -474,7 +471,6 @@ sub array_ref {

sub store_xref {
my ($dbea, $object_type, $id, $external_db_name, $external_id, $external_display, $synonyms, $description, $info_type, $info_text) = @_;

# make an xref
my $entry = new Bio::EnsEMBL::DBEntry(
-adaptor => $dbea,
Expand All @@ -486,18 +482,14 @@ sub store_xref {
-info_text => $info_text,
-analysis => $analysis,
);

# add synonyms
$entry->{synonyms} = $synonyms if ($synonyms and ref $synonyms eq "ARRAY");

# store xref
my $ignore_release = 1;
return $entry if $dbea->store( $entry, $id, $object_type, $ignore_release);

return;
}


sub update_version {
my ($dba, $type, $id, $obj, $version) = @_;

Expand Down
46 changes: 41 additions & 5 deletions src/python/ensembl/io/genomio/gff3/extract_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"FunctionalAnnotations",
]

import logging
Dishalodha marked this conversation as resolved.
Show resolved Hide resolved
from os import PathLike
from pathlib import Path
import re
Expand Down Expand Up @@ -55,9 +56,11 @@ class AnnotationError(Exception):
class FunctionalAnnotations:
"""List of annotations extracted from a GFF3 file."""

def __init__(self) -> None:
self.annotations: List[Annotation] = []
ignored_xrefs = {"go", "interpro", "uniprot"}

def __init__(self, genome: Optional[Dict[str, Dict[str, Any]]] = None) -> None:
self.annotations: List[Annotation] = []
self.genome = genome
# Annotated features
# Under each feature, each dict's key is a feature ID
self.features: Dict[str, Dict[str, Annotation]] = {
Expand All @@ -72,6 +75,31 @@ def __init__(self) -> None:
"transcript": {},
}

def get_xrefs(self, feature: SeqFeature) -> List[Dict[str, Any]]:
    """Get the xrefs from the Dbxref field.

    Each Dbxref value is split on its first colon into a database name and an
    identifier, so identifiers that themselves contain a colon
    (e.g. `HGNC:HGNC:1234`) are kept whole.

    Args:
        feature: Feature whose `qualifiers["Dbxref"]` list is read.

    Returns:
        A list of dicts with the keys `dbname` and `id`, in the order of the
        Dbxref values. Values whose database name (compared in lowercase) is in
        `ignored_xrefs` are left out, as are values without any colon.

    Raises:
        KeyError: If the feature has no `Dbxref` qualifier.

    """
    # Using provider name to modify the xref
    provider_name = None
    if self.genome:
        try:
            provider_name = self.genome["assembly"]["provider_name"]
        except KeyError:
            logging.warning("No provider name is provided in the genome file")

    # Extract the Dbxrefs
    all_xref: List[Dict[str, Any]] = []
    for xref in feature.qualifiers["Dbxref"]:
        # Only the first colon separates the database name from the identifier
        dbname, separator, name = xref.partition(":")
        if not separator:
            logging.warning(f"Skipping Dbxref without a database name: {xref}")
            continue

        if dbname == "GenBank" and provider_name == "RefSeq":
            dbname = "RefSeq"

        if dbname.lower() in self.ignored_xrefs:
            continue

        all_xref.append({"dbname": dbname, "id": name})
    return all_xref

def get_features(self, feat_type: str) -> Dict[str, Annotation]:
"""Get all feature annotations for the requested type."""
try:
Expand Down Expand Up @@ -127,7 +155,6 @@ def _generic_feature(self, feature: SeqFeature, feat_type: str) -> Dict[str, Any
feat_type: Feature type of the feature to store (e.g. gene, transcript, translation).

"""

feature_object: Annotation = {"object_type": feat_type, "id": feature.id}

# Description?
Expand All @@ -145,11 +172,20 @@ def _generic_feature(self, feature: SeqFeature, feat_type: str) -> Dict[str, Any
):
del feature_object["description"]

feature_object["xrefs"] = []
if "Dbxref" in feature.qualifiers:
all_xref = self.get_xrefs(feature)
feature_object["xrefs"] = all_xref

xref_values = {xref["id"].lower() for xref in feature_object["xrefs"]}

# Synonyms?
# We add synonyms to the external_synonym table
# which is associated with the first xref of that feature type
if "Name" in feature.qualifiers:
feat_name = feature.qualifiers["Name"][0]
if feat_name != feature.id:
feature_object["synonyms"] = {"synonym": feat_name, "default": True}
if feat_name.lower() != feature.id.lower() and feat_name.lower() not in xref_values:
feature_object["synonyms"] = {"synonym": feat_name}

# is_pseudogene?
if feature.type.startswith("pseudogen"):
Expand Down
6 changes: 3 additions & 3 deletions src/python/ensembl/io/genomio/gff3/simplifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from os import PathLike
from pathlib import Path
import re
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional

from BCBio import GFF
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -80,11 +80,11 @@ def __init__(self, genome_path: Optional[PathLike] = None):
biotypes_json = files(ensembl.io.genomio.data.gff3) / "biotypes.json"
self._biotypes = get_json(biotypes_json)
self.records = Records()
self.annotations = FunctionalAnnotations()
self.genome = {}
self.genome: Dict[str, Dict[str, Any]] = {}
if genome_path:
with Path(genome_path).open("r") as genome_fh:
self.genome = json.load(genome_fh)
self.annotations = FunctionalAnnotations(self.genome)
self._set_id_prefix()

def _set_id_prefix(self) -> None:
Expand Down
Loading