Skip to content

Commit

Permalink
Updated to prefer using the short prefix (e.g. "mod") instead of long…
Browse files Browse the repository at this point in the history
… db names in the xref rules, etc.
  • Loading branch information
IgorRodchenkov committed Feb 12, 2024
1 parent 3564cbb commit b64b574
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,19 @@ public interface XrefUtils {
* It returns NULL for "unknown" database name.
*
* @param name case-insensitive name (of a bio ID type/resource) name
* @return preferred name
* @return preferred name (upper case)
*/
String getPrimaryDbName(String name);

/**
* Gets the "prefix" (curated short name) for the DB (collection of IDs).
* It returns NULL for "unknown" database name.
*
* @param name case-insensitive name, or a known synonym, of a bio ID type/resource
* @return prefix (lower case), or null if the name is not recognized
*/
String getPrefix(String name);

/**
* Checks whether the ID format is valid for the database.
* Always use {@link #canCheckIdFormatIn(String)} before this method,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public void check(final Validation validation, UnificationXref x) {
initInternalMaps();

if (x.getDb() == null
|| helper.getPrimaryDbName(x.getDb()) == null) {
|| helper.getPrefix(x.getDb()) == null) {
// ignore for unknown databases (another rule checks)
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,42 +30,40 @@ public void check(final Validation validation, Xref x) {
String db = x.getDb();
if (db != null) {
// check db
String preferedDbName = xrefUtils.getPrimaryDbName(db);
if (preferedDbName == null) {
String prefix = xrefUtils.getPrefix(db);
if (prefix == null) {
error(validation, x, "unknown.db", false, db);
return;
}

// check id
String id = x.getId();
if (id != null) {
if (!xrefUtils.canCheckIdFormatIn(preferedDbName)) {
logger.info("Can't check IDs (no regexp) for "
+ db + " (" + preferedDbName + ")");
} else if (!xrefUtils.checkIdFormat(preferedDbName, id)) {

String regxp = xrefUtils.getRegexpString(preferedDbName);
if (!xrefUtils.canCheckIdFormatIn(prefix)) {
logger.info("Can't check IDs (no regexp) for " + db + " (" + prefix + ")");
} else if (!xrefUtils.checkIdFormat(prefix, id)) {
String regxp = xrefUtils.getRegexpString(prefix);
// report error with fixed=false
error(validation, x, "invalid.id.format", false, db, preferedDbName, id, regxp);
error(validation, x, "invalid.id.format", false, db, prefix, id, regxp);

// try to fix (in some cases) using a hack
while(validation.isFix()) { //- no worries - will use 'break' to escape the infinite loop
// guess it's a Uniprot Isoform (next try splitting it into id and idVersion parts)
if (StringUtils.startsWithIgnoreCase(preferedDbName, "UNIPROT")) {
// guess it's Uniprot Isoform (next try splitting it into id and idVersion parts)
if (StringUtils.startsWithIgnoreCase(prefix, "uniprot")) {
if (id.contains("-")
&& xrefUtils.checkIdFormat("uniprot isoform",id.toUpperCase())) {
x.setDb("uniprot isoform");
&& xrefUtils.checkIdFormat("uniprot.isoform",id.toUpperCase())) {
x.setDb("uniprot.isoform");
x.setId(id.toUpperCase());
// update the error case, set fixed=true
error(validation, x, "invalid.id.format", true);
break;
}
} // guess it's in fact a PSI-MOD despite PSI-MI is used (todo: likely useless/obsolete code)
else if (preferedDbName.equalsIgnoreCase("MOLECULAR INTERACTIONS ONTOLOGY")) {
else if (prefix.equalsIgnoreCase("mi")) {
if (id.toUpperCase().startsWith("MOD")
&& xrefUtils.checkIdFormat("MOD", id.toUpperCase()))
{
x.setDb("MOD");
x.setDb("mod");
x.setId(id.toUpperCase());
// update the error case, set fixed=true
error(validation, x, "invalid.id.format", true);
Expand All @@ -81,7 +79,7 @@ else if (preferedDbName.equalsIgnoreCase("MOLECULAR INTERACTIONS ONTOLOGY")) {
i = id.lastIndexOf('-');
if(i > 0 && i < id.length()) {
String newId = id.substring(0, i);
if (xrefUtils.checkIdFormat(preferedDbName, newId)) {
if (xrefUtils.checkIdFormat(prefix, newId)) {
x.setId(newId);
x.setIdVersion(id.substring(i + 1));
// update the error case, set fixed=true there
Expand All @@ -91,46 +89,40 @@ else if (preferedDbName.equalsIgnoreCase("MOLECULAR INTERACTIONS ONTOLOGY")) {
}

/*
* Fix if MI:, GO:, MOD:, etc., prefixes were simply missing/forgotten -
* Add 'MI:','GO:','MOD:' etc. "banana" to the ID (though it's correct to use w/o that banana/prefix too)
*/
i = regxp.lastIndexOf(':');
if(i>0) {
// guess, regexp looks like "^GO:%d{7}", and we want to get "GO"
String prefix = regxp.substring(1, i).toUpperCase();
if (logger.isDebugEnabled())
logger.debug("Trying to fix id with missing prefix: " + prefix);
if(preferedDbName.equalsIgnoreCase(xrefUtils.getPrimaryDbName(prefix))
&& !id.toUpperCase().startsWith(prefix))
{
String newId = prefix + ':' + id;
if (xrefUtils.checkIdFormat(preferedDbName, newId)) {
String p = regxp.substring(1, i).toUpperCase();
if(prefix.equalsIgnoreCase(xrefUtils.getPrefix(p)) && !id.toUpperCase().startsWith(p)) {
String newId = p + ':' + id;
if (xrefUtils.checkIdFormat(prefix, newId)) {
x.setId(newId);
error(validation, x, "invalid.id.format", true);
if (logger.isDebugEnabled())
logger.debug(x.getModelInterface()
.getSimpleName() + " " + x
+ " 'id' auto-fixed! (was: " + id + ")");
if (logger.isDebugEnabled()) {
logger.debug(x.getModelInterface().getSimpleName() + " " + x
+ " 'id' auto-fixed! (was: " + id + ")");
}
break;
}
}
}


/*
* Turning ID to upper case can sometimes help (e.g., KEGG, - c00022 to C00022 helps!) -
* because most identifier patterns corresp. to MIRIAM data collections are case-sensitive and
* use upper-case symbols (e.g., Uniport's begin with P, Q, O; also - GO:, MOD:, and NP_ - same idea)
*/
String newId = id.toUpperCase();
if (xrefUtils.checkIdFormat(preferedDbName, newId)) {
if (xrefUtils.checkIdFormat(prefix, newId)) {
x.setId(newId);
error(validation, x, "invalid.id.format", true);
if (logger.isDebugEnabled())
logger.debug(x.getModelInterface()
.getSimpleName() + " " + x
+ " 'id' auto-fixed! (was: " + id + ")");
if (logger.isDebugEnabled()) {
logger.debug(x.getModelInterface().getSimpleName() + " " + x + " 'id' auto-fixed! (was: " + id + ")");
}
break;
}
}

break; //breaks this loop anyway
} //end while
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public void check(final Validation validation, Xref x) {
return; // another (cardinality) rule reports
}

String primary = xrefUtils.getPrimaryDbName(db);
String primary = xrefUtils.getPrefix(db);
// if primary is null, do nothing, - another rule (XrefRule) reports this
if (primary != null && !primary.equalsIgnoreCase(db)) {
// report only if it is definitely not official db synonym
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ public synchronized void init() {
String name = dbName(ns.getName()); //trim,uppercase
String prefix = dbName(ns.getPrefix());
final List<String> synonyms = new ArrayList<>();
synonyms.add(prefix); //add as first - will be primary db name for this id collection
synonyms.add(name);
synonyms.add(prefix);
//add synonyms from the registry
ns.getSynonyms().forEach((s) -> synonyms.add(dbName(s)));
//add custom synonyms of given prefix from the Resolver's map
Expand Down Expand Up @@ -287,12 +287,12 @@ public boolean match(Collection<String> a, Collection<String> b) {
merged.add(name);
}
}
//if possible, move the preferred name on top
//if possible, move the prefix and preferred name on top (the order is important)
String topName = merged.get(0);
Namespace ns = Resolver.getNamespace(topName);
if(ns != null) {
String preferName = dbName(ns.getName());
merged.add(0, preferName);
merged.add(0, dbName(ns.getPrefix()));
merged.add(1, dbName(ns.getName()));
}
this.allSynonyms.addComposited(merged);
} else {
Expand Down Expand Up @@ -322,7 +322,23 @@ public List<String> getSynonymsForDbName(String name) {
/**
 * Gets the preferred (full) database name for the given name or synonym.
 * The synonyms list is built with the short prefix first and the preferred
 * name second, so the second entry is returned whenever it exists.
 *
 * @param name database name or synonym, as accepted by getSynonymsForDbName
 * @return the preferred name; the only entry if the list has a single item;
 *         null when no synonyms are known for the name
 */
@Override
public String getPrimaryDbName(String name) {
    List<String> names = getSynonymsForDbName(name);
    if (names.isEmpty()) {
        return null;
    }
    // index 0 holds the prefix; the preferred name follows it when present
    return names.get(names.size() > 1 ? 1 : 0);
}

/**
 * Gets the short prefix of the database that the given name or synonym refers to.
 * The prefix is the first entry of the synonyms list, returned in lower case.
 *
 * @param name database name or synonym, as accepted by getSynonymsForDbName
 * @return the lower-case prefix, or null when no synonyms are known for the name
 */
@Override
public String getPrefix(String name) {
    List<String> names = getSynonymsForDbName(name);
    if (names.isEmpty()) {
        return null;
    }
    // Locale.ROOT keeps the result independent of the JVM default locale
    // (e.g. a Turkish default would turn "I" into a dotless "ı").
    return names.get(0).toLowerCase(java.util.Locale.ROOT);
}

@Override
Expand Down
17 changes: 14 additions & 3 deletions biopax-validator/src/test/java/ValidatorIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,17 +119,28 @@ public void xRefHelperContainsSynonyms() {

/**
 * Checks that getPrimaryDbName resolves names and synonyms to the full preferred name,
 * and that the synonyms list is ordered as: prefix first, preferred name second.
 */
@Test
public void primarySynonym() {
    assertAll(
        () -> assertEquals("UNIPROT PROTEIN", xrefUtils.getPrimaryDbName("pir")), //not in registry: PIR (using extra synonyms)
        () -> assertEquals("GENE ONTOLOGY", xrefUtils.getPrimaryDbName("go")),
        () -> assertEquals("KEGG.COMPOUND", xrefUtils.getSynonymsForDbName("kegg compound").get(0)),
        () -> assertEquals("KEGG COMPOUND", xrefUtils.getSynonymsForDbName("kegg compound").get(1)),
        () -> assertEquals("KEGG COMPOUND", xrefUtils.getPrimaryDbName("ligand")), //ligand (deprecated) is inside kegg compound!
        () -> assertEquals("KEGG GENOME", xrefUtils.getPrimaryDbName("kegg organism")),
        () -> assertEquals("KYOTO ENCYCLOPEDIA OF GENES AND GENOMES", xrefUtils.getPrimaryDbName("KEGG"))
    );
}

// Checks that getPrefix maps database names and synonyms (in any letter case)
// to the lower-case short prefix of the corresponding ID collection.
@Test
public void prefix() {
assertAll(
() -> assertEquals("uniprot", xrefUtils.getPrefix("pir")), // "pir" is resolved via the extra synonyms
() -> assertEquals("go", xrefUtils.getPrefix("go")), // a prefix maps to itself
() -> assertEquals("kegg.compound", xrefUtils.getPrefix("ligand")),
() -> assertEquals("kegg.genome", xrefUtils.getPrefix("kegg organism")),
() -> assertEquals("kegg", xrefUtils.getPrefix("KEGG")) // upper-case input still yields a lower-case prefix
);
}

@ParameterizedTest
@MethodSource
public void hasRegexp(String db) {
Expand Down

0 comments on commit b64b574

Please sign in to comment.