Skip to content

Commit

Permalink
fix(data): get all allele definitions straight from PharmGKB
Browse files Browse the repository at this point in the history
  • Loading branch information
whaleyr committed Jul 17, 2024
1 parent 37411fc commit b13f199
Show file tree
Hide file tree
Showing 62 changed files with 2,629 additions and 3,073 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ public void removeIgnoredNamedAlleles(DefinitionExemption exemption) {
.collect(Collectors.toCollection(TreeSet::new));
}

/**
 * Drops every named allele that is flagged as a structural variant.
 * <p>
 * The surviving alleles are gathered into a brand-new {@link TreeSet}, which then replaces the current set of
 * named alleles.
 */
public void removeStructuralVariants() {
  TreeSet<NamedAllele> keptAlleles = new TreeSet<>();
  for (NamedAllele candidate : m_namedAlleles) {
    if (candidate.isStructuralVariant()) {
      // flagged as a structural variant, leave it out of the replacement set
      continue;
    }
    keptAlleles.add(candidate);
  }
  m_namedAlleles = keptAlleles;
}


/**
* Removes any unused positions and ignored positions specified in {@link DefinitionExemption}.
Expand Down Expand Up @@ -324,7 +333,7 @@ public void removeIgnoredPositions(SortedSet<VariantLocus> ignoredPositions, boo
" because it has no alleles after removing unused/ignored positions");
} else {
updatedNamedAlleles.add(new NamedAllele(namedAllele.getId(), namedAllele.getName(), null, cpicAlleles,
namedAllele.isReference()));
namedAllele.isReference(), namedAllele.isStructuralVariant()));
}
}
m_namedAlleles = updatedNamedAlleles;
Expand Down Expand Up @@ -380,7 +389,7 @@ public void doVcfTranslation(VcfHelper vcfHelper) throws IOException {
if (mustResort) {
updated = reorderHaplotypeAlleles(na, m_variants, sortedVariants, fixedAlleles);
} else {
updated = new NamedAllele(na.getId(), na.getName(), fixedAlleles, na.getCpicAlleles(), na.isReference());
updated = new NamedAllele(na.getId(), na.getName(), fixedAlleles, na.getCpicAlleles(), na.isReference(), na.isStructuralVariant());
}
updatedNamedAlleles.add(updated);
}
Expand Down Expand Up @@ -641,6 +650,6 @@ private NamedAllele reorderHaplotypeAlleles(NamedAllele hap, VariantLocus[] oldP
}

return new NamedAllele(hap.getId(), hap.getName(), alleles, cpicAlleles, hap.getMissingPositions(),
hap.isReference(), hap.getNumCombinations(), hap.getNumPartials());
hap.isReference(), hap.getNumCombinations(), hap.getNumPartials(), hap.isStructuralVariant());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ public class NamedAllele implements Comparable<NamedAllele> {
@Expose
@SerializedName(value = "reference", alternate = {"matchesreferencesequence"})
private final boolean m_isReference;
@Expose
@SerializedName("structuralVariant")
private final boolean m_isStructuralVariant;
//-- variables after this point are used by NamedAlleleMatcher --//
/** The set of positions that are missing from this copy of the NamedAllele **/
private final SortedSet<VariantLocus> m_missingPositions;
Expand All @@ -66,20 +69,20 @@ public class NamedAllele implements Comparable<NamedAllele> {
* Primary constructor.
* Use this when reading in allele definitions.
*/
public NamedAllele(String id, String name, String[] alleles, String[] cpicAlleles, boolean isReference) {
this(id, name, alleles, cpicAlleles, Collections.emptySortedSet(), isReference, 0, 0);
public NamedAllele(String id, String name, String[] alleles, String[] cpicAlleles, boolean isReference, boolean isStructuralVariant) {
this(id, name, alleles, cpicAlleles, Collections.emptySortedSet(), isReference, 0, 0, isStructuralVariant);
}

public NamedAllele(String id, String name, String[] alleles, String[] cpicAlleles,
SortedSet<VariantLocus> missingPositions, boolean isReference) {
this(id, name, alleles, cpicAlleles, missingPositions, isReference, 0, 0);
SortedSet<VariantLocus> missingPositions, boolean isReference, boolean isStructuralVariant) {
this(id, name, alleles, cpicAlleles, missingPositions, isReference, 0, 0, isStructuralVariant);
}

/**
* Constructor for duplicating/modifying a {@link NamedAllele}.
*/
public NamedAllele(String id, String name, String[] alleles, String[] cpicAlleles,
SortedSet<VariantLocus> missingPositions, boolean isReference, int numCombinations, int numPartials) {
SortedSet<VariantLocus> missingPositions, boolean isReference, int numCombinations, int numPartials, boolean isStructuralVariant) {
Preconditions.checkNotNull(id);
Preconditions.checkNotNull(name);
Preconditions.checkNotNull(cpicAlleles);
Expand All @@ -90,6 +93,7 @@ public NamedAllele(String id, String name, String[] alleles, String[] cpicAllele
m_cpicAlleles = cpicAlleles;
m_missingPositions = missingPositions;
m_isReference = isReference;
m_isStructuralVariant = isStructuralVariant;
m_numCombinations = numCombinations;
m_numPartials = numPartials;

Expand Down Expand Up @@ -219,6 +223,10 @@ public boolean isReference() {
return m_isReference;
}

/**
 * Gets whether this named allele is flagged as a structural variant.
 *
 * @return the structural-variant flag that was supplied when this allele was constructed
 */
public boolean isStructuralVariant() {
return m_isStructuralVariant;
}


public boolean isCombination() {
return m_numCombinations > 1;
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/pharmgkb/pharmcat/haplotype/MatchData.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void marshallHaplotypes(String gene, SortedSet<NamedAllele> allHaplotypes, boole
.collect(Collectors.toCollection(TreeSet::new));

NamedAllele newHap = new NamedAllele(hap.getId(), hap.getName(), availableAlleles, cpicAlleles,
missingPositions, hap.isReference());
missingPositions, hap.isReference(), hap.isStructuralVariant());
newHap.initialize(m_positions);
if (newHap.getScore() > 0) {
m_haplotypes.add(newHap);
Expand Down Expand Up @@ -216,7 +216,7 @@ void defaultMissingAllelesToReference() {
}

NamedAllele fixedHap = new NamedAllele(hap.getId(), hap.getName(), newAlleles, cpicAlleles,
hap.getMissingPositions(), hap.isReference());
hap.getMissingPositions(), hap.isReference(), hap.isStructuralVariant());
fixedHap.initialize(m_positions, hap.getScore());
updatedHaplotypes.add(fixedHap);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ public void finalizeCombinationHaplotype(MatchData matchData, boolean findPartia
setName("[" + builder + "]");

NamedAllele partialHap = new NamedAllele(hap.getId(), builder.toString(), hap.getAlleles(),
hap.getCpicAlleles(), hap.getMissingPositions(), false, hap.getNumCombinations(), partials.size());
hap.getCpicAlleles(), hap.getMissingPositions(), false, hap.getNumCombinations(), partials.size(), hap.isStructuralVariant());
partialHap.initialize(refVariants);
m_haplotype = partialHap;

} else {
NamedAllele newHap = new NamedAllele(hap.getId(), hap.getName(), hap.getAlleles(),
hap.getCpicAlleles(), hap.getMissingPositions(), hap.isReference(), hap.getNumCombinations(), 0);
hap.getCpicAlleles(), hap.getMissingPositions(), hap.isReference(), hap.getNumCombinations(), 0, hap.isStructuralVariant());
newHap.initialize(refVariants);
m_haplotype = newHap;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ private NamedAllele buildHaplotype(int numPartials, boolean isOffReferencePartia
}
}
NamedAllele na = new NamedAllele(idBuilder.toString(), getName(), alleles, cpicAlleles, missingPositions,
false, m_componentHaplotypes.size(), numPartials);
false, m_componentHaplotypes.size(), numPartials, false);
if (isOffReferencePartial) {
na.initialize(m_refVariants, 0);
} else {
Expand Down
30 changes: 7 additions & 23 deletions src/main/java/org/pharmgkb/pharmcat/util/DataManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ public class DataManager {
public static final Path DEFAULT_EXEMPTIONS_FILE = DEFAULT_DEFINITION_DIR.resolve(EXEMPTIONS_JSON_FILE_NAME);
public static final String POSITIONS_VCF = "pharmcat_positions.vcf";
private static final String ALLELES_FILE_NAME = "allele_translations.json";
private static final String CPIC_ALLELES_FILE_NAME = "allele_definitions.json";
private static final String PRESCRIBING_GUIDANCE_FILE_NAME = "prescribing_guidance.json";
private static final String sf_zipFileName = "pharmcat.zip";
private static final String sf_googleDocUrlFmt = "https://docs.google.com/spreadsheets/d/%s/export?format=tsv";
Expand Down Expand Up @@ -176,13 +175,6 @@ public static void main(String[] args) {
Path exemptionsJson = allelesDir.resolve(EXEMPTIONS_JSON_FILE_NAME);
Map<String, DefinitionExemption> exemptionsMap = manager.transformExemptions(exemptionsTsv, exemptionsJson);

if (!skipDownload) {
// download allele definitions
// use S3 link to avoid caching problems
FileUtils.copyURLToFile(
new URL("http://files.cpicpgx.org.s3-us-west-2.amazonaws.com/data/report/current/allele_definitions.json"),
downloadDir.resolve(CPIC_ALLELES_FILE_NAME).toFile());
}
// transform allele definitions
definitionReader = manager.transformAlleleDefinitions(downloadDir, allelesDir, exemptionsMap);

Expand Down Expand Up @@ -218,8 +210,8 @@ public static void main(String[] args) {
}


private DefinitionFile[] parseDefinitionFiles(Path downloadDir, String fileName) throws IOException {
Path definitionsFile = downloadDir.resolve(fileName);
private DefinitionFile[] parseDefinitionFiles(Path downloadDir) throws IOException {
Path definitionsFile = downloadDir.resolve(ALLELES_FILE_NAME);
if (!Files.exists(definitionsFile)) {
throw new IOException("Cannot find alleles definitions (" + definitionsFile + ")");
}
Expand All @@ -236,12 +228,7 @@ private DefinitionReader transformAlleleDefinitions(Path downloadDir, Path defin

System.out.println("Generating allele definitions...");
List<DefinitionFile> definitionFiles = new ArrayList<>();
for (DefinitionFile df : parseDefinitionFiles(downloadDir, CPIC_ALLELES_FILE_NAME)) {
df.setSource(DataSource.CPIC);
df.validateAlleleNames();
definitionFiles.add(df);
}
for (DefinitionFile df : parseDefinitionFiles(downloadDir, ALLELES_FILE_NAME)) {
for (DefinitionFile df : parseDefinitionFiles(downloadDir)) {
df.setSource(DataSource.PHARMGKB);
df.validateAlleleNames();
definitionFiles.add(df);
Expand All @@ -251,13 +238,10 @@ private DefinitionReader transformAlleleDefinitions(Path downloadDir, Path defin
try (VcfHelper vcfHelper = new VcfHelper()) {
for (DefinitionFile df : definitionFiles) {
String gene = df.getGeneSymbol();
if (definitionFileMap.containsKey(gene)) {
// this will prefer CPIC allele definitions over PharmGKB ones
continue;
}
if (gene.equals("MT-RNR1")) {
continue;
}

// always strip structural variants since they are unmatchable
df.removeStructuralVariants();

DefinitionExemption exemption = exemptionsMap.get(gene);
if (exemption != null) {
if (!exemption.getIgnoredAlleles().isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"formatVersion": "2",
"source": "CPIC",
"version": "v1.40.1-1-gf95ca46",
"modificationDate": "2024-07-11T23:06:42.472Z",
"source": "PHARMGKB",
"version": "2024-07-15-21-58",
"modificationDate": "2023-02-03T16:20:52.044Z",
"gene": "ABCG2",
"orientation": null,
"chromosome": "chr4",
Expand Down Expand Up @@ -32,27 +32,29 @@
"namedAlleles": [
{
"name": "rs2231142 reference (G)",
"id": "1508724",
"id": "PA166287823",
"alleles": [
"G"
],
"cpicAlleles": [
"G"
],
"reference": true,
"structuralVariant": false,
"numCombinations": 0,
"numPartials": 0
},
{
"name": "rs2231142 variant (T)",
"id": "1508726",
"id": "PA166287824",
"alleles": [
"T"
],
"cpicAlleles": [
"T"
],
"reference": false,
"structuralVariant": false,
"numCombinations": 0,
"numPartials": 0
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"formatVersion": "2",
"source": "CPIC",
"version": "v1.40.1-1-gf95ca46",
"modificationDate": "2024-07-11T23:06:42.624Z",
"source": "PHARMGKB",
"version": "2024-07-15-21-58",
"modificationDate": "2021-10-22T20:00:04.974Z",
"gene": "CACNA1S",
"orientation": null,
"chromosome": "chr1",
Expand Down Expand Up @@ -51,7 +51,7 @@
"namedAlleles": [
{
"name": "Reference",
"id": "777262",
"id": "PA166180429",
"alleles": [
"C",
"G"
Expand All @@ -61,12 +61,13 @@
"G"
],
"reference": true,
"structuralVariant": false,
"numCombinations": 0,
"numPartials": 0
},
{
"name": "c.520C>T",
"id": "777264",
"id": "PA166180430",
"alleles": [
null,
"A"
Expand All @@ -76,12 +77,13 @@
"A"
],
"reference": false,
"structuralVariant": false,
"numCombinations": 0,
"numPartials": 0
},
{
"name": "c.3257G>A",
"id": "777266",
"id": "PA166180431",
"alleles": [
"T",
null
Expand All @@ -91,6 +93,7 @@
null
],
"reference": false,
"structuralVariant": false,
"numCombinations": 0,
"numPartials": 0
}
Expand Down
Loading

0 comments on commit b13f199

Please sign in to comment.