Ensembl · Dishalodha · Mar 8, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/scripts/load_fann.pl b/scripts/load_fann.pl
@@ -90,7 +90,6 @@ =head1 EXAMPLE
   "Uniprot/SPTREMBL"
 );
 
-
 &GetOptions(
   'host=s'                     => \$host,
   'port=s'                     => \$port,
@@ -129,7 +128,6 @@ =head1 EXAMPLE
   -dbname => $dbname,
 );
 
-
 my $aa       = $dba->get_adaptor('Analysis');
 my $analysis = $aa->fetch_by_logic_name($analysis_name);
 if (! defined $analysis) {
@@ -172,7 +170,6 @@ sub get_adaptor {
     $type = 'gene';
   }
   my $lc_type = lc($type);
-
   my $adaptor = get_adaptor($dba, $type);
   if (not defined $adaptor) {
     warn qq/can't get adaptor for "$type" (id: "$id"). skipping...\n/;
@@ -204,13 +201,11 @@ sub get_adaptor {
     warn qq/can't get object for "$id" (type: "$type"). skipping...\n/;
     next;
   }
-
   # gene and transcript description
   if ($lc_type eq "gene" or $lc_type eq 'transcript') {
     $obj->description($it->{description}) if (exists $it->{description} && $it->{description} !~ m/^\s*$/);
     $do_update = 1;
   }
-
   # gene and transript versions
   if ($lc_type eq "gene" or $lc_type eq "transcript") {
     my $version = exists $it->{version}
@@ -250,13 +245,10 @@ sub get_adaptor {
 
   # Remove duplications
   @xrefs = unique_xrefs(@xrefs);
-
   # prepend missing GO and SO prefixes
   prepend_GO_SO(\@xrefs);
-
   # Do not load xrefs generated by Ensembl
   @xrefs = remove_ensembl_xrefs(\@xrefs, $extdb_map, $lc_type) if $skip_ensembl_xrefs;
-
   my $already_used = 0;
   my $stored_xref = undef;
   for my $xref (@xrefs) {
@@ -291,7 +283,6 @@ sub get_adaptor {
       $xref->{info_type},
       $xref->{info_text}
     );
-
     # update 'display_xref' only for the first time or for the $set_display_xref_4
     if ( defined $display_xref && ( $display_xref eq $xref->{id} || $xref->{display_id} && $xref->{display_id} eq $display_xref ) ) {
       if ($lc_type eq "gene" || $lc_type eq "transcript") {
@@ -317,9 +308,7 @@ sub get_adaptor {
 $dba->dbc && $dba->dbc->disconnect_if_idle();
 close($fh);
 
-
 # utils
-
 sub unique_xrefs {
   my (@xrefs) = @_;
 
@@ -343,7 +332,6 @@ sub prepend_GO_SO {
   }
 }
 
-
 sub remove_ensembl_xrefs {
   # Do not load xrefs generated by Ensembl
   my ($xrefs, $extdb_map, $lc_type) = @_;
@@ -354,12 +342,12 @@ sub remove_ensembl_xrefs {
     if ($dbname && !exists $ensembl_xrefs{ $dbname }) {
       push @new_xrefs, $xref;
     }
+
   }
 
   return @new_xrefs;
 }
 
-
 sub norm_str {
   my ($raw) = @_;
   return $raw if (!defined $raw);
@@ -371,6 +359,18 @@ sub norm_str {
   return $out;
 }
 
+sub add_to_db_map {
+  my ($db_map, $feature, $from, $val, $pat) = @_;
+  # Rules are kept as a list per feature and source name, so that the same
+  # dbname can be registered several times with different patterns.
+  my $rules = ($db_map->{$feature}->{$from} //= []);
+  my $rule_key = sub { "val:" . $_[0] . "with:" . ($_[1] // "_undef_") };
+  my $new_key = $rule_key->($val, $pat);
+  # Append the rule only if no stored rule has the same value and pattern.
+  my $known = grep { $rule_key->($_->{val}, $_->{pat}) eq $new_key } @$rules;
+  push @$rules, { val => $val, pat => $pat } if !$known;
+}
+
 sub load_external_db_map {
   my($path) = @_;
 
@@ -392,31 +392,27 @@ sub load_external_db_map {
     $from_name = uc($from_name);
 
     if ($to_name eq '_IGNORE_') {
-      $db_map->{IGNORE}->{$feature}->{$from_name} = { val => 1, pat => $pat };
-      $db_map->{IGNORE}->{_OTHER_}->{$from_name} = { val => 1, pat => undef } if ($feature ne "_ANY_");
+      add_to_db_map($db_map->{IGNORE}, $feature, $from_name, 1, $pat);
+      add_to_db_map($db_map->{IGNORE}, '_OTHER_', $from_name, 1, undef) if ($feature ne "_ANY_");
     } else {
-      $db_map->{VALID}->{$feature}->{$from_name} = { val => $to_name, pat => $pat };
-      $db_map->{VALID}->{_OTHER_}->{$from_name} = { val => 1, pat => undef } if ($feature ne "_ANY_");
+      add_to_db_map($db_map->{VALID}, $feature, $from_name, $to_name, $pat);
+      add_to_db_map($db_map->{VALID}, '_OTHER_', $from_name, 1, undef) if ($feature ne "_ANY_");
     }
-  }
-
+}
   return $db_map;
 }
 
 sub db_name_for_feature {
   my ($db_map, $lc_type, $xref, $skip_unknown_xref_source) = @_;
-
   return if (!$xref || !$xref->{dbname});
+
   my $raw_dbname = $xref->{dbname};
   my $xref_id = $xref->{id};
-
   my $res = $skip_unknown_xref_source ?  undef : $raw_dbname;
 
   return $res if !$db_map;
-
   my $feature = uc($lc_type);
   my $from_name = uc($raw_dbname);
-
   my $ignore_map = $db_map->{IGNORE};
   my $ignore_feature = exists $ignore_map->{$feature} && $ignore_map->{$feature}->{$from_name} || undef;
   my $ignore_any = exists $ignore_map->{_ANY_} && $ignore_map->{_ANY_}->{$from_name} || undef;
@@ -428,27 +424,29 @@ sub db_name_for_feature {
   my $valid_other = exists $valid_map->{_OTHER_} && $valid_map->{_OTHER_}->{$from_name} || undef;
 
   # check if there's a specific ignore rule
-  if ($ignore_feature) {
-    my $pat = $ignore_feature->{pat};
-    return if (!defined $pat);
-    return if ($xref_id =~ m/$pat/);
+  if ($ignore_feature && @$ignore_feature) {
+    for my $case (@$ignore_feature) {
+      my $pat = $case->{pat};
+      return if (!defined $pat);
+      return if ($xref_id =~ m/$pat/);
+    }
   }
-
   # check if there's a specific valid rule
-  if ($valid_feature) {
-    my $pat = $valid_feature->{pat};
-    return $valid_feature->{val} if (!defined $pat);
-    return $valid_feature->{val} if ($xref_id =~ m/$pat/);
+  if ($valid_feature && @$valid_feature) {
+    for my $case (@$valid_feature) {
+      my $pat = $case->{pat};
+      return $case->{val} if (!defined $pat);
+      return $case->{val} if ($xref_id =~ m/$pat/);
+    } 
   }
-
   # check if mentioned anywhere else and no global validness; no pattern checked
-  return if ($valid_other and !$valid_any);
-
+  return if (($valid_other && @$valid_other) && !($valid_any && @$valid_any));
   # check global ignore
-  return if ($ignore_any);
+  return if ($ignore_any && @$ignore_any);
   # then check global validness
-  return $valid_any->{val} if ($valid_any);
-
+  if ($valid_any && @$valid_any) {
+    return $valid_any->[0]->{val};
+  }
   # return raw name or undef based on $skip_unknown_xref_source flag
   return $res;
 }
@@ -463,7 +461,6 @@ sub get_syns {
   return ($default_ones[0], \@syns)
 }
 
-
 sub array_ref {
   my ($o) = @_;
 
@@ -474,7 +471,6 @@ sub array_ref {
 
 sub store_xref {
   my ($dbea, $object_type, $id, $external_db_name, $external_id, $external_display, $synonyms, $description, $info_type, $info_text) = @_;
-
   # make an xref
   my $entry = new Bio::EnsEMBL::DBEntry(
     -adaptor     => $dbea,
@@ -486,18 +482,14 @@ sub store_xref {
     -info_text   => $info_text,
     -analysis    => $analysis,
   );
-
   # add synonyms
   $entry->{synonyms} = $synonyms if ($synonyms and ref $synonyms eq "ARRAY");
-
   # store xref
   my $ignore_release = 1;
   return $entry if $dbea->store( $entry, $id, $object_type, $ignore_release);
-
   return;
 }
 
-
 sub update_version {
   my ($dba, $type, $id, $obj, $version) = @_;
 

diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py
@@ -22,6 +22,7 @@
     "FunctionalAnnotations",
 ]
 
+import logging
 from os import PathLike
 from pathlib import Path
 import re
@@ -55,9 +56,11 @@ class AnnotationError(Exception):
 class FunctionalAnnotations:
     """List of annotations extracted from a GFF3 file."""
 
-    def __init__(self) -> None:
-        self.annotations: List[Annotation] = []
+    ignored_xrefs = {"go", "interpro", "uniprot"}
 
+    def __init__(self, genome: Optional[Dict[str, Dict[str, Any]]] = None) -> None:
+        self.annotations: List[Annotation] = []
+        self.genome = genome
         # Annotated features
         # Under each feature, each dict's key is a feature ID
         self.features: Dict[str, Dict[str, Annotation]] = {
@@ -72,6 +75,31 @@ def __init__(self) -> None:
             "transcript": {},
         }
 
+    def get_xrefs(self, feature: SeqFeature) -> List[Dict[str, Any]]:
+        """Get the xrefs of a feature as a list of `{"dbname": ..., "id": ...}` dicts.
+
+        Each "Dbxref" value is split on its first colon only, so an ID that itself contains
+        colons (e.g. "HGNC:HGNC:5") is kept whole; a value without any colon raises a ValueError.
+        Xrefs whose lower-cased dbname is in `ignored_xrefs` are skipped.
+        """
+        all_xref: List[Dict[str, Any]] = []
+
+        # "GenBank" xrefs are renamed "RefSeq" when the genome metadata names RefSeq as provider
+        provider_name = None
+        if self.genome:
+            try:
+                provider_name = self.genome["assembly"]["provider_name"]
+            except KeyError:
+                logging.warning("No provider name is provided in the genome file")
+
+        for xref in feature.qualifiers["Dbxref"]:
+            dbname, name = xref.split(":", maxsplit=1)
+            if dbname == "GenBank" and provider_name == "RefSeq":
+                dbname = "RefSeq"
+            if dbname.lower() not in self.ignored_xrefs:
+                all_xref.append({"dbname": dbname, "id": name})
+        return all_xref
+
     def get_features(self, feat_type: str) -> Dict[str, Annotation]:
         """Get all feature annotations for the requested type."""
         try:
@@ -127,7 +155,6 @@ def _generic_feature(self, feature: SeqFeature, feat_type: str) -> Dict[str, Any
             feat_type: Feature type of the feature to store (e.g. gene, transcript, translation).
 
         """
-
         feature_object: Annotation = {"object_type": feat_type, "id": feature.id}
 
         # Description?
@@ -145,11 +172,20 @@ def _generic_feature(self, feature: SeqFeature, feat_type: str) -> Dict[str, Any
         ):
             del feature_object["description"]
 
+        feature_object["xrefs"] = []
+        if "Dbxref" in feature.qualifiers:
+            all_xref = self.get_xrefs(feature)
+            feature_object["xrefs"] = all_xref
+
+        xref_values = {xref["id"].lower() for xref in feature_object["xrefs"]}
+
         # Synonyms?
+        # We add synonyms to the external_synonym table
+        # which is associated with the first xref of that feature type
         if "Name" in feature.qualifiers:
             feat_name = feature.qualifiers["Name"][0]
-            if feat_name != feature.id:
-                feature_object["synonyms"] = {"synonym": feat_name, "default": True}
+            if feat_name.lower() != feature.id.lower() and feat_name.lower() not in xref_values:
+                feature_object["synonyms"] = {"synonym": feat_name}
 
         # is_pseudogene?
         if feature.type.startswith("pseudogen"):

diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py
@@ -28,7 +28,7 @@
 from os import PathLike
 from pathlib import Path
 import re
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from BCBio import GFF
 from Bio.SeqRecord import SeqRecord
@@ -80,11 +80,11 @@ def __init__(self, genome_path: Optional[PathLike] = None):
         biotypes_json = files(ensembl.io.genomio.data.gff3) / "biotypes.json"
         self._biotypes = get_json(biotypes_json)
         self.records = Records()
-        self.annotations = FunctionalAnnotations()
-        self.genome = {}
+        self.genome: Dict[str, Dict[str, Any]] = {}
         if genome_path:
             with Path(genome_path).open("r") as genome_fh:
                 self.genome = json.load(genome_fh)
+        self.annotations = FunctionalAnnotations(self.genome)
         self._set_id_prefix()
 
     def _set_id_prefix(self) -> None: