Skip to content

Commit

Permalink
feat(reporter): add support for passing sample metadata to reporter
Browse files Browse the repository at this point in the history
  • Loading branch information
markwoon committed Feb 18, 2025
1 parent 485aa06 commit dae7a76
Show file tree
Hide file tree
Showing 23 changed files with 298 additions and 169 deletions.
4 changes: 4 additions & 0 deletions src/main/java/org/pharmgkb/pharmcat/BaseConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public class BaseConfig {
boolean deleteIntermediateFiles;
boolean verbose;
SortedSet<String> samples = new TreeSet<>();
Path sampleMetadataFile;


BaseConfig(CliHelper cliHelper) throws IOException, ReportableException {
Expand Down Expand Up @@ -94,6 +95,9 @@ public class BaseConfig {
}
}
}
if (cliHelper.hasOption("sm")) {
sampleMetadataFile = cliHelper.getValidFile("sm", true);
}

boolean researchMode = false;
if (runMatcher) {
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/pharmgkb/pharmcat/BatchPharmCAT.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ public static void main(String[] args) {
.addOption("i", "input-dir", "Directory containing source data files", false, "dir")
.addOption("s", "samples", "Comma-separated list of samples", false, "samples")
.addOption("S", "sample-file", "File containing a list of sample, one per line", false, "file")
.addOption("sm", "sample-metadata", "TSV containing sample metadata", false, "file")

// named allele matcher args
.addOption("matcher", "matcher", "Run named allele matcher independently")
Expand Down Expand Up @@ -401,7 +402,7 @@ public Pipeline build(Env env, int index, int totalTasks) throws ReportableExcep
m_config.reporterSources, m_config.reporterCompact,
m_config.reporterJson, m_config.reporterHtml, m_config.reporterCallsOnlyTsv,
m_config.outputDir, m_config.baseFilename, m_config.deleteIntermediateFiles,
mode, (index + "/" + totalTasks), m_verbose);
mode, (index + "/" + totalTasks), m_verbose, null);
}


Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/pharmgkb/pharmcat/Constants.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.pharmgkb.pharmcat;

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
Expand Down Expand Up @@ -97,7 +98,7 @@ public static boolean isActivityScoreGene(String gene, DataSource dataSource) {
Set<String> genes = switch (dataSource) {
case CPIC -> ACTIVITY_SCORE_GENES_CPIC;
case DPWG -> ACTIVITY_SCORE_GENES_DPWG;
default -> throw new RuntimeException("No genes specified for " + dataSource);
default -> Collections.emptySet();
};
return gene != null && genes.contains(gene.toUpperCase());
}
Expand Down
41 changes: 41 additions & 0 deletions src/main/java/org/pharmgkb/pharmcat/Env.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package org.pharmgkb.pharmcat;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.checkerframework.checker.nullness.qual.Nullable;
Expand Down Expand Up @@ -33,6 +36,7 @@ public class Env {
private MessageHelper m_messageHelper;
private final Map<DataSource, Map<String, Map<String, Haplotype>>> m_haplotypeCache = new HashMap<>();
private final Multimap<String, String> m_validHaplotypes = HashMultimap.create();
private final Map<Path, Map<String, Map<String, String>>> m_sampleDataMap = new HashMap<>();


public Env() throws IOException, ReportableException {
Expand Down Expand Up @@ -189,4 +193,41 @@ public synchronized Haplotype makeHaplotype(String gene, String name, DataSource
return haplotype;
});
}


/**
 * Looks up the metadata properties recorded for a sample in a tab-separated sample metadata file.
 * <p>
 * A usable row has at least three tab-separated columns: sample ID, property name and property value.
 * Rows with fewer than three columns are ignored, as are any columns past the third.  If a property name is
 * repeated for a sample, the last value read wins.
 * <p>
 * When {@code cache} is false the file is scanned for {@code sampleId} only and scanning stops at the first row
 * for a different sample that follows the requested sample's rows, so all rows for a sample must be consecutive.
 *
 * @param sampleMetadataFile the TSV file to read
 * @param sampleId the sample to look up; {@code null} yields {@code null}
 * @param cache if true, the whole file is parsed once and kept in memory for later lookups; if false, nothing is
 * retained
 * @return the sample's properties (name to value), or {@code null} if the file has no usable rows for the sample
 * @throws IOException if the file cannot be read
 */
public synchronized @Nullable Map<String, String> getSampleMetadata(Path sampleMetadataFile, String sampleId,
    boolean cache) throws IOException {

  if (sampleId == null) {
    // TreeMap with natural ordering rejects null keys; no sample means no metadata
    return null;
  }
  if (!cache) {
    return readSampleMetadata(sampleMetadataFile, sampleId).get(sampleId);
  }

  Map<String, Map<String, String>> fileMap = m_sampleDataMap.get(sampleMetadataFile);
  if (fileMap == null) {
    // parse completely before publishing so that a read failure never leaves a partial entry in the cache,
    // and so that a file without usable rows is still only read once
    fileMap = readSampleMetadata(sampleMetadataFile, null);
    m_sampleDataMap.put(sampleMetadataFile, fileMap);
  }
  return fileMap.get(sampleId);
}

/**
 * Parses a sample metadata TSV into a map of sample ID to (property name to property value).
 *
 * @param sampleMetadataFile the TSV file to read
 * @param onlySampleId if non-null, only rows for this sample are collected and reading stops once its
 * consecutive run of rows has ended; if null, every usable row is collected
 * @return map of sample ID to properties; empty if no usable rows matched
 * @throws IOException if the file cannot be read
 */
private static Map<String, Map<String, String>> readSampleMetadata(Path sampleMetadataFile,
    @Nullable String onlySampleId) throws IOException {

  Map<String, Map<String, String>> fileMap = new TreeMap<>();
  try (BufferedReader reader = Files.newBufferedReader(sampleMetadataFile)) {
    String line;
    while ((line = reader.readLine()) != null) {
      String[] row = line.split("\t");
      if (row.length < 3) {
        // not a (sample, key, value) row
        continue;
      }
      String sid = row[0];
      if (onlySampleId == null || sid.equals(onlySampleId)) {
        fileMap.computeIfAbsent(sid, k -> new HashMap<>())
            .put(row[1], row[2]);
      } else if (!fileMap.isEmpty()) {
        // requested sample already collected and a different sample has started:
        // all values for a single sample must be consecutive
        break;
      }
    }
  }
  return fileMap;
}
}
1 change: 1 addition & 0 deletions src/main/java/org/pharmgkb/pharmcat/PharmCAT.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public static void main(String[] args) {
// inputs
.addOption("s", "samples", "Comma-separated list of samples", false, "samples")
.addOption("S", "sample-file", "File containing a list of sample, one per line", false, "file")
.addOption("sm", "sample-metadata", "TSV containing sample metadata", false, "file")

// named allele matcher args
.addOption("matcher", "matcher", "Run named allele matcher independently")
Expand Down
10 changes: 7 additions & 3 deletions src/main/java/org/pharmgkb/pharmcat/Pipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public enum Mode {
private Path m_matcherHtmlFile;
/** True if the VCF file only contains a single sample. */
private final boolean m_singleSample;
private final Path m_sampleMetadataFile;

private final boolean m_runPhenotyper;
private Path m_phenotyperInputFile;
Expand Down Expand Up @@ -97,7 +98,8 @@ public Pipeline(Env env, BaseConfig config, @Nullable VcfFile vcfFile,
config.reporterSources, config.reporterCompact,
config.reporterJson, config.reporterHtml, config.reporterCallsOnlyTsv,
config.outputDir, config.baseFilename, config.deleteIntermediateFiles,
Pipeline.Mode.CLI, null, config.verbose);
Pipeline.Mode.CLI, null, config.verbose,
config.sampleMetadataFile);
}


Expand All @@ -109,11 +111,13 @@ public Pipeline(Env env,
@Nullable List<PrescribingGuidanceSource> reporterSources, boolean reporterCompact,
boolean reporterJson, boolean reporterHtml, boolean reporterCallsOnlyTsv,
@Nullable Path outputDir, @Nullable String baseFilename, boolean deleteIntermediateFiles,
Mode mode, @Nullable String displayCount, boolean verbose) throws ReportableException {
Mode mode, @Nullable String displayCount, boolean verbose,
@Nullable Path sampleMetadataFile) throws ReportableException {
m_env = env;

m_runMatcher = runMatcher;
m_baseDir = outputDir;
m_sampleMetadataFile = sampleMetadataFile;
if (runMatcher) {
m_vcfFile = Objects.requireNonNull(vcfFile);
m_sampleId = sampleId;
Expand Down Expand Up @@ -268,7 +272,7 @@ public PipelineResult call() throws IOException {
if (!batchDisplayMode) {
namedAlleleMatcher.printWarnings();
}
matcherResult = namedAlleleMatcher.call(m_vcfFile, m_sampleId);
matcherResult = namedAlleleMatcher.call(m_vcfFile, m_sampleId, m_sampleMetadataFile);

if (matcherResult.getVcfWarnings() != null &&
!matcherResult.getVcfWarnings().isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ private void test(String gene, NamedAlleleMatcher namedAlleleMatcher, VcfFile vc

boolean hasUnknownCall = expectedAlleles.contains("?");
boolean hasComboCall = !hasUnknownCall && vcfFile.getFile().getFileName().toString().contains("noCall");
Result result = namedAlleleMatcher.call(vcfFile, null);
Result result = namedAlleleMatcher.call(vcfFile, null, null);

if (isLowestFunctionGene(gene)) {
if (result.getGeneCalls().get(0).getDiplotypes().isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ public static void main(String[] args) {
NamedAlleleMatcher namedAlleleMatcher =
new NamedAlleleMatcher(new Env(), definitionReader, findCombinations, topCandidateOnly, callCyp2d6)
.printWarnings();
Result result = namedAlleleMatcher.call(new VcfFile(vcfFile), null);
Result result = namedAlleleMatcher.call(new VcfFile(vcfFile), null, null);

Path jsonFile = CliUtils.getOutputFile(cliHelper, vcfFile, "json", BaseConfig.MATCHER_SUFFIX + ".json");
ResultSerializer resultSerializer = new ResultSerializer();
Expand Down Expand Up @@ -182,10 +182,18 @@ private boolean getTopCandidateOnly(String gene) {
* Calls diplotypes for the given VCF file for all genes for which a definition exists.
*/
public Result call(VcfFile vcfFile, @Nullable String sampleId) throws IOException {
  // Convenience overload: delegates to the three-argument form with no sample metadata file.
  Path noSampleMetadataFile = null;
  return call(vcfFile, sampleId, noSampleMetadataFile);
}

/**
* Calls diplotypes for the given VCF file for all genes for which a definition exists.
*/
public Result call(VcfFile vcfFile, @Nullable String sampleId, @Nullable Path sampleMetadataFile) throws IOException {
VcfReader vcfReader = vcfFile.getReader(m_definitionReader, sampleId, m_findCombinations);
SortedMap<String, SampleAllele> alleleMap = vcfReader.getAlleleMap();
ResultBuilder resultBuilder = new ResultBuilder(m_definitionReader, m_topCandidateOnly, m_findCombinations, m_callCyp2d6)
.forFile(vcfFile, vcfReader.getWarnings().asMap());
.forFile(vcfFile, vcfReader.getWarnings().asMap(), vcfReader.getSampleId(), sampleMetadataFile);

if (m_printWarnings) {
vcfReader.getWarnings().keySet()
.forEach(key -> {
Expand All @@ -207,7 +215,7 @@ public Result call(VcfFile vcfFile, @Nullable String sampleId) throws IOExceptio
callAssumingReference(vcfReader.getSampleId(), alleleMap, gene, resultBuilder);
}
}
return resultBuilder.build();
return resultBuilder.build(m_env);
}

/**
Expand Down
31 changes: 23 additions & 8 deletions src/main/java/org/pharmgkb/pharmcat/haplotype/ResultBuilder.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.pharmgkb.pharmcat.haplotype;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Date;
import java.util.List;
Expand All @@ -10,6 +12,7 @@
import com.google.common.base.Preconditions;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.pharmgkb.common.util.PathUtils;
import org.pharmgkb.pharmcat.Env;
import org.pharmgkb.pharmcat.VcfFile;
import org.pharmgkb.pharmcat.definition.DefinitionReader;
import org.pharmgkb.pharmcat.definition.model.DefinitionFile;
Expand All @@ -35,6 +38,7 @@ public class ResultBuilder {
private final boolean m_topCandidatesOnly;
private final boolean m_findCombinations;
private final boolean m_callCyp2d6;
private Path m_sampleMetadataFile;


public ResultBuilder(DefinitionReader definitionReader, boolean topCandidatesOnly, boolean findCombinations,
Expand All @@ -46,19 +50,30 @@ public ResultBuilder(DefinitionReader definitionReader, boolean topCandidatesOnl
m_callCyp2d6 = callCyp2d6;
}

public Result build() {
/**
 * Finishes the result, attaching sample properties when a sample metadata file was supplied.
 * <p>
 * Properties are looked up through {@code env} (with caching enabled) using the sample ID stored in the
 * result's metadata, and are only set when at least one property was found.
 *
 * @param env the environment used to read the sample metadata file
 * @return the built result
 * @throws IOException if the sample metadata file cannot be read
 */
public Result build(Env env) throws IOException {
  if (m_sampleMetadataFile == null) {
    // nothing to attach
    return m_result;
  }

  Metadata resultMetadata = m_result.getMetadata();
  Map<String, String> props = env.getSampleMetadata(m_sampleMetadataFile, resultMetadata.getSampleId(), true);
  boolean hasProps = props != null && !props.isEmpty();
  if (hasProps) {
    resultMetadata.setSampleProps(props);
  }
  return m_result;
}


public ResultBuilder forFile(VcfFile vcfFile, Map<String, Collection<String>> warnings) {
public ResultBuilder forFile(VcfFile vcfFile, Map<String, Collection<String>> warnings, String sampleId,
@Nullable Path sampleMetadataFile) {
Preconditions.checkNotNull(vcfFile);

m_result.setMetadata(new Metadata(NamedAlleleMatcher.VERSION, m_definitionReader.getGenomeBuild(),
PathUtils.getFilename(vcfFile.getFile()), new Date(), m_topCandidatesOnly, m_findCombinations, m_callCyp2d6));
Metadata metadata = new Metadata(NamedAlleleMatcher.VERSION, m_definitionReader.getGenomeBuild(),
PathUtils.getFilename(vcfFile.getFile()), new Date(), m_topCandidatesOnly, m_findCombinations, m_callCyp2d6,
sampleId);
m_result.setMetadata(metadata);
if (warnings != null) {
m_result.setVcfWarnings(warnings);
}
m_sampleMetadataFile = sampleMetadataFile;
return this;
}

Expand All @@ -74,15 +89,15 @@ protected ResultBuilder gene(String gene, MatchData matchData) {


/**
* Adds diplotype results for specified gene.
* Adds diplotype results for a specified gene.
*/
protected ResultBuilder diplotypes(String gene, MatchData matchData, SortedSet<DiplotypeMatch> matches) {
Preconditions.checkNotNull(gene);
return diplotypes(gene, matchData, matches, null);
}

/**
* Adds diplotype results for specified gene.
* Adds diplotype results for a specified gene.
*/
protected ResultBuilder diplotypes(String gene, MatchData matchData, SortedSet<DiplotypeMatch> matches,
@Nullable List<MessageAnnotation> warnings) {
Expand All @@ -100,7 +115,7 @@ protected ResultBuilder diplotypes(String gene, MatchData matchData, SortedSet<D


/**
* Add haplotype results for specified gene.
* Add haplotype results for a specified gene.
* <p>
* This should only be used when we can't get diplotypes but still need to track potential haplotypes (e.g. DPYD).
*/
Expand All @@ -109,7 +124,7 @@ protected ResultBuilder haplotypes(String gene, MatchData matchData, List<Haplot
}

/**
* Add haplotype results for specified gene.
* Add haplotype results for a specified gene.
* <p>
* This should only be used when we can't get diplotypes but still need to track potential haplotypes (e.g. DPYD).
*/
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/org/pharmgkb/pharmcat/haplotype/VcfReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ public VcfReader(DefinitionReader definitionReader, Path vcfFile) throws IOExcep
}


public @Nullable String getSampleId() {
/**
 * Gets the Sample ID of the data to read.
 *
 * @return the sample ID held by this reader
 */
public String getSampleId() {
// this should never be null after read() is called
// NOTE(review): read() is not visible in this view - confirm it always assigns m_sampleId
return m_sampleId;
}

Expand Down
36 changes: 31 additions & 5 deletions src/main/java/org/pharmgkb/pharmcat/haplotype/model/Metadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
package org.pharmgkb.pharmcat.haplotype.model;

import java.util.Date;
import java.util.Map;
import com.google.gson.annotations.Expose;
import com.google.gson.annotations.SerializedName;
import org.checkerframework.checker.nullness.qual.Nullable;


public class Metadata {
Expand All @@ -18,7 +20,7 @@ public class Metadata {
private String m_inputFilename;
@Expose
@SerializedName("timestamp")
private Date m_timetamp;
private Date m_timestamp;
@Expose
@SerializedName("topCandidatesOnly")
private boolean m_topCandidatesOnly;
Expand All @@ -28,17 +30,24 @@ public class Metadata {
@Expose
@SerializedName("callCyp2d")
private boolean m_callCyp2d6;
@Expose
@SerializedName("sampleId")
private String m_sampleId;
@Expose
@SerializedName("sampleProps")
private Map<String, String> m_sampleProps;


public Metadata(String namedAlleleMatcherVersion, String genomeBuild, String vcfFilename, Date date,
boolean topCandidatesOnly, boolean findCombinations, boolean callCyp2d6) {
boolean topCandidatesOnly, boolean findCombinations, boolean callCyp2d6, String sampleId) {
m_namedAlleleMatcherVersion = namedAlleleMatcherVersion;
m_genomeBuild = genomeBuild;
m_inputFilename = vcfFilename;
m_timetamp = date;
m_timestamp = date;
m_topCandidatesOnly = topCandidatesOnly;
m_findCombinations = findCombinations;
m_callCyp2d6 = callCyp2d6;
m_sampleId = sampleId;
}


Expand All @@ -54,8 +63,8 @@ public String getInputFilename() {
return m_inputFilename;
}

public Date getTimetamp() {
return m_timetamp;
public Date getTimestamp() {
return m_timestamp;
}

public boolean isTopCandidatesOnly() {
Expand All @@ -69,4 +78,21 @@ public boolean isFindCombinations() {
public boolean isCallCyp2d6() {
return m_callCyp2d6;
}


/**
 * Gets the sample ID stored in this metadata.
 *
 * @return the sample ID passed to the constructor or set by the latest call to {@link #setSampleId(String)}
 */
public String getSampleId() {
return m_sampleId;
}

/**
 * Replaces the sample ID stored in this metadata.
 *
 * @param sampleId the new sample ID
 */
public void setSampleId(String sampleId) {
m_sampleId = sampleId;
}

/**
 * Gets the sample properties stored in this metadata.
 *
 * @return the map given to {@link #setSampleProps(Map)}, or null if none is held
 */
public @Nullable Map<String, String> getSampleProps() {
return m_sampleProps;
}

/**
 * Sets the sample properties stored in this metadata.
 * <p>
 * The map is stored by reference, not copied.
 *
 * @param sampleProps the sample properties
 */
public void setSampleProps(Map<String, String> sampleProps) {
m_sampleProps = sampleProps;
}
}
Loading

0 comments on commit dae7a76

Please sign in to comment.