From 5136b817298e3d5c37ee87537b7b2d26cd9f929b Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 16 Jan 2025 09:13:45 -0500 Subject: [PATCH 1/7] Adding in refRangeFilter --- .../phgv2/pathing/ropebwt/MapReads.kt | 63 +++++++++++++++---- .../phgv2/pathing/ropebwt/MapReadsTest.kt | 51 ++++++++++++--- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt index 8fb3788b..17c67993 100644 --- a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt +++ b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt @@ -10,6 +10,8 @@ import com.github.ajalt.clikt.parameters.options.default import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.types.int +import net.maizegenetics.phgv2.api.HaplotypeGraph +import net.maizegenetics.phgv2.api.ReferenceRange import net.maizegenetics.phgv2.cli.logCommand import net.maizegenetics.phgv2.pathing.AlignmentUtils import net.maizegenetics.phgv2.pathing.KeyFileData @@ -17,6 +19,7 @@ import net.maizegenetics.phgv2.pathing.ReadInputFile import org.apache.logging.log4j.LogManager import java.io.BufferedInputStream import java.io.BufferedReader +import java.io.File import java.io.InputStreamReader /** @@ -46,6 +49,9 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT val outputDir by option("-o", "--output-dir", help = "Name for output ReadMapping file Directory (Required)") .required() + val hvcfDir by option(help = "Directory for the haplotype VCF files. These hvcfs will be used to filter the hits to only those that hit a single referenceRange.") + .required() + val threads by option(help = "Number of threads to use.") .int() .default(5) @@ -64,26 +70,37 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT override fun run() { logCommand(this) + + myLogger.info("Building the Graph") + //loop through all files in hvcfDir and create a list of hvcf files + val hvcfFiles = File(hvcfDir).walkTopDown().filter { it.isFile } + .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${hvcfDir}/${it.name}" } + .toList() + + val graph = HaplotypeGraph(hvcfFiles) + + val hapIdToRefRangeMap = graph.hapIdToRefRangeMap() + myLogger.info("Mapping reads to pangenome") - mapAllReadFiles(index, readInputFiles.getReadFiles(), outputDir, threads, minMemLength, maxNumHits, condaEnvPrefix) + mapAllReadFiles(index, readInputFiles.getReadFiles(), outputDir, threads, minMemLength, maxNumHits, condaEnvPrefix, hapIdToRefRangeMap) } /** * Function to map all the read files in the keyFileDataEntries to the index */ - fun mapAllReadFiles(index: String, keyFileDataEntries: List, outputDir: String, threads: Int, minMemLength: Int, maxNumHits: Int, condaEnvPrefix: String) { + fun mapAllReadFiles(index: String, keyFileDataEntries: List, outputDir: String, threads: Int, minMemLength: Int, maxNumHits: Int, condaEnvPrefix: String, hapIdToRefRangeMap : Map>) { //Loop through the keyFileDataEntries and map the reads //If there is a second file it processes that and makes a separate readMapping file. val readNameToFileMap = mutableMapOf>() for(readFile in keyFileDataEntries) { val fileList = readNameToFileMap[readFile.sampleName]?: mutableListOf() val outputFile1 = "$outputDir/${readFile.sampleName}_1_readMapping.txt" - mapSingleReadFile(index, readFile.sampleName, readFile.file1,outputFile1, threads, minMemLength, maxNumHits, condaEnvPrefix) + mapSingleReadFile(index, readFile.sampleName, readFile.file1,outputFile1, threads, minMemLength, maxNumHits, condaEnvPrefix, hapIdToRefRangeMap) fileList.add(outputFile1) if(readFile.file2 != "") { val outputFile2 = "$outputDir/${readFile.sampleName}_2_readMapping.txt" - mapSingleReadFile(index, readFile.sampleName, readFile.file2, outputFile2, threads, minMemLength, maxNumHits, condaEnvPrefix) + mapSingleReadFile(index, readFile.sampleName, readFile.file2, outputFile2, threads, minMemLength, maxNumHits, condaEnvPrefix, hapIdToRefRangeMap) fileList.add(outputFile2) } readNameToFileMap[readFile.sampleName] = fileList @@ -94,12 +111,12 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT /** * Function to map a single read file to the index and write the read mapping to the outputFile */ - fun mapSingleReadFile(index: String, sampleName: String,readFile: String, outputFile: String, threads: Int, minMemLength: Int, maxNumHits: Int, condaEnvPrefix: String) { + fun mapSingleReadFile(index: String, sampleName: String,readFile: String, outputFile: String, threads: Int, minMemLength: Int, maxNumHits: Int, condaEnvPrefix: String, hapIdToRefRangeMap: Map>) { myLogger.info("Mapping reads in $readFile to $index") val bedFileReader = setupMappingProcess(index, readFile, threads, minMemLength, maxNumHits, condaEnvPrefix) - val readMapping = createReadMappingsForFileReader(bedFileReader, maxNumHits) + val readMapping = createReadMappingsForFileReader(bedFileReader, maxNumHits, hapIdToRefRangeMap) bedFileReader.close() @@ -135,7 +152,8 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT */ fun createReadMappingsForFileReader( bedFileReader: BufferedReader, - maxNumHits: Int + maxNumHits: Int, + hapIdToRefRangeMap: Map> ): MutableMap, Int> { var currentLine = bedFileReader.readLine() val tempMems = mutableListOf() @@ -148,14 +166,14 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT val alignmentParsed = parseStringIntoMem(currentLine) if (tempMems.isNotEmpty() && tempMems[0].readName != alignmentParsed.readName) { //write out the tempMems - processMemsForRead(tempMems, readMapping, maxNumHits) + processMemsForRead(tempMems, readMapping, maxNumHits,hapIdToRefRangeMap) tempMems.clear() } tempMems.add(alignmentParsed) currentLine = bedFileReader.readLine() } - processMemsForRead(tempMems, readMapping, maxNumHits) + processMemsForRead(tempMems, readMapping, maxNumHits, hapIdToRefRangeMap) return readMapping } @@ -180,22 +198,43 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT * This will filter things based on the maxNumHits and retain the mems that are longest * Because MEMs are Maximal Exact Matches, the longest MEMs are the best hits */ - fun processMemsForRead(tempMems: List, readMapping: MutableMap, Int>, maxNumHits: Int) { + fun processMemsForRead(tempMems: List, readMapping: MutableMap, Int>, maxNumHits: Int, hapIdToRefRangeMap: Map>) { //get the longest hits val maxLength = tempMems.maxOf { it.readEnd - it.readStart } //remove any hits that are not the longest val bestHits = tempMems.filter { it.readEnd - it.readStart == maxLength } - val totalNumHits = bestHits.sumOf { it.numHits } + val bestHapIds = bestHits.flatMap { it.listMemHits.map{hits -> hits.contig} }.toSet() + + val filteredBestHapIds = filterToOneReferenceRange(bestHapIds, hapIdToRefRangeMap) + + val totalNumHits = filteredBestHapIds.size if(totalNumHits <= maxNumHits) { //if the total number of hits is less than the maxNumHits, add all the hits to the readMapping //First turn the hapIds into a set then back to a list and sort so it will be consistent - val hapIdsHit = bestHits.flatMap { it.listMemHits.map{hits -> hits.contig} }.toSet().toList().sorted() + val hapIdsHit = filteredBestHapIds.sorted() readMapping[hapIdsHit] = (readMapping[hapIdsHit]?:0) + 1 } } + /** + * Function to filter the hapIds to only those that hit a single reference range + * This will pick the highest occurring reference range and if there is a tie it will choose the first one. + */ + fun filterToOneReferenceRange(hapIds: Set, hapIdToRefRangeMap: Map>) : List { + //figure out which reference range is hit the most + val refRangeCounts = hapIds.map { hapIdToRefRangeMap[it] } + .filterNotNull() + .flatten() + .groupingBy { it } + .eachCount() + + val maxRefRange = refRangeCounts.maxByOrNull { it.value }?.key + //Then filter out any hapIds that don't have that reference range + return hapIds.filter { hapIdToRefRangeMap[it]?.contains(maxRefRange)?:false } + } + /** * Function to export the path key file for the readMapping files that have been processed. */ diff --git a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt index aeee0d3b..94463874 100644 --- a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt +++ b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt @@ -2,6 +2,8 @@ package net.maizegenetics.phgv2.pathing.ropebwt import biokotlin.util.bufferedReader import com.github.ajalt.clikt.testing.test +import net.maizegenetics.phgv2.api.HaplotypeGraph +import net.maizegenetics.phgv2.api.ReferenceRange import net.maizegenetics.phgv2.cli.TestExtension import net.maizegenetics.phgv2.pathing.KeyFileData import net.maizegenetics.phgv2.utils.setupDebugLogging @@ -108,18 +110,21 @@ class MapReadsTest { fun testProcessMemsForRead() { val mapReads = MapReads() + val hapIdToRefRangeMap = mapOf("hap1" to listOf(ReferenceRange("chr1",100,200)), "hap2" to listOf(ReferenceRange("chr1",100,200)), "hap3" to listOf(ReferenceRange("chr1",100,200)), + "hap4" to listOf(ReferenceRange("chr1",100,200)), "hap5" to listOf(ReferenceRange("chr1",100,200))) + //Make a simple 1 MEM hit val simpleMEMList = listOf(MEM("read1",0,150,3,listOf(MEMHit("hap1", "+", 100), MEMHit("hap2", "-", 200), MEMHit("hap3", "+", 300)))) var simpleReadMapping = mutableMapOf, Int>() - mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,5) + mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,5, hapIdToRefRangeMap) assertEquals(1, simpleReadMapping.size) assertEquals(listOf("hap1","hap2","hap3"), simpleReadMapping.keys.first()) assertEquals(1, simpleReadMapping[listOf("hap1","hap2","hap3")]) simpleReadMapping = mutableMapOf() - mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,2) + mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,2, hapIdToRefRangeMap) assertEquals(0, simpleReadMapping.size) @@ -136,10 +141,10 @@ class MapReadsTest { ) simpleReadMapping = mutableMapOf() - mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,6) - mapReads.processMemsForRead(simpleMEMList2,simpleReadMapping,6) - mapReads.processMemsForRead(simpleMEMList3,simpleReadMapping,6) - mapReads.processMemsForRead(simpleMEMList4,simpleReadMapping,6) + mapReads.processMemsForRead(simpleMEMList,simpleReadMapping,6, hapIdToRefRangeMap) + mapReads.processMemsForRead(simpleMEMList2,simpleReadMapping,6, hapIdToRefRangeMap) + mapReads.processMemsForRead(simpleMEMList3,simpleReadMapping,6, hapIdToRefRangeMap) + mapReads.processMemsForRead(simpleMEMList4,simpleReadMapping,6, hapIdToRefRangeMap) assertEquals(3, simpleReadMapping.size) assertTrue(simpleReadMapping.keys.contains(listOf("hap1","hap2","hap3"))) assertEquals(2, simpleReadMapping[listOf("hap1","hap2","hap3")]) @@ -151,15 +156,23 @@ class MapReadsTest { //pass in empty list simpleReadMapping = mutableMapOf() - assertThrows { mapReads.processMemsForRead(listOf(),simpleReadMapping,6) } + assertThrows { mapReads.processMemsForRead(listOf(),simpleReadMapping,6, hapIdToRefRangeMap) } } @Test fun testCreateReadMappingsForFileReader() { val reader = bufferedReader("data/test/ropebwt/alignment.bed") + + val hapIdToRefRangeMap = mapOf("hap1" to listOf(ReferenceRange("chr1",100,200)), + "hap2" to listOf(ReferenceRange("chr1",100,200)), + "hap3" to listOf(ReferenceRange("chr1",100,200)), + "hap4" to listOf(ReferenceRange("chr1",100,200)), + "hap5" to listOf(ReferenceRange("chr1",100,200))) + + val mapReads = MapReads() - val readMapping = mapReads.createReadMappingsForFileReader(reader, 5) + val readMapping = mapReads.createReadMappingsForFileReader(reader, 5, hapIdToRefRangeMap) assertEquals(2, readMapping.size) assertTrue(readMapping.keys.contains(listOf("hap1","hap2","hap3"))) @@ -192,7 +205,17 @@ class MapReadsTest { val readFile = "data/test/kmerReadMapping/simulatedReads/LineA_1.fq" val outputDir = "${TestExtension.tempDir}ropebwtTest/" val outputFile = "$outputDir/LineA_1_readMapping.txt" - mapReads.mapSingleReadFile(index, "LineA" ,readFile, outputFile ,5, 148, 5, "") + + + val hvcfFiles = File(TestExtension.smallSeqInputDir).walkTopDown().filter { it.isFile } + .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${TestExtension.smallSeqInputDir}/${it.name}" } + .toList() + + val graph = HaplotypeGraph(hvcfFiles) + + val hapIdToRefRangeMap = graph.hapIdToRefRangeMap() + + mapReads.mapSingleReadFile(index, "LineA" ,readFile, outputFile ,5, 148, 5, "", hapIdToRefRangeMap) val expectedLines = bufferedReader("data/test/ropebwt/LineA_1_readMapping.txt").readLines() val expectedReadMap = expectedLines.filter { !it.startsWith("#") }.filter { !it.startsWith("HapId") }.map { it.split("\t") }.associate { it[0] to it[1].toInt() } @@ -212,8 +235,16 @@ class MapReadsTest { val outputDir = "${TestExtension.tempDir}ropebwtTest/" val index = "data/test/ropebwt/testIndex.fmd" + val hvcfFiles = File(TestExtension.smallSeqInputDir).walkTopDown().filter { it.isFile } + .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${TestExtension.smallSeqInputDir}/${it.name}" } + .toList() + + val graph = HaplotypeGraph(hvcfFiles) + + val hapIdToRefRangeMap = graph.hapIdToRefRangeMap() + val keyFileDataEntry = listOf(KeyFileData("LineA", "data/test/kmerReadMapping/simulatedReads/LineA_1.fq", "data/test/kmerReadMapping/simulatedReads/LineA_2.fq")) - mapReads.mapAllReadFiles(index, keyFileDataEntry, outputDir, 5, 148, 5, "") + mapReads.mapAllReadFiles(index, keyFileDataEntry, outputDir, 5, 148, 5, "", hapIdToRefRangeMap ) val expectedLines = bufferedReader("data/test/ropebwt/LineA_1_readMapping.txt").readLines() From 3b333a062b9908d49bffe5fca80719ca0dee5509 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 16 Jan 2025 16:18:33 -0500 Subject: [PATCH 2/7] Fixing unit tests. --- data/test/ropebwt/LineA_1_readMapping.txt | 131 +++++------------- data/test/ropebwt/LineA_2_readMapping.txt | 129 +++++------------ .../phgv2/pathing/ropebwt/MapReads.kt | 12 +- .../phgv2/pathing/ropebwt/MapReadsTest.kt | 28 ++-- 4 files changed, 96 insertions(+), 204 deletions(-) diff --git a/data/test/ropebwt/LineA_1_readMapping.txt b/data/test/ropebwt/LineA_1_readMapping.txt index a145cd5a..f44321a4 100644 --- a/data/test/ropebwt/LineA_1_readMapping.txt +++ b/data/test/ropebwt/LineA_1_readMapping.txt @@ -2,66 +2,42 @@ #filename1=data/test/kmerReadMapping/simulatedReads/LineA_1.fq HapIds count 12f0cec9102e84a161866e37072443b7 853 -3149b3144f93134eb29661bade697fc6 4210 -25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6,d4c8b5505d7046b41d7f69b246063ebb 112 -164098aa7cfeff45d651fd5554538bbe,25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6,d4c8b5505d7046b41d7f69b246063ebb 8 -164098aa7cfeff45d651fd5554538bbe,25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6 20 -164098aa7cfeff45d651fd5554538bbe,3149b3144f93134eb29661bade697fc6 9 -164098aa7cfeff45d651fd5554538bbe,3149b3144f93134eb29661bade697fc6,bb4b6391a180e13f4e2448674cba6d43 32 +3149b3144f93134eb29661bade697fc6 4338 +d4c8b5505d7046b41d7f69b246063ebb 4171 +164098aa7cfeff45d651fd5554538bbe 41 3149b3144f93134eb29661bade697fc6,8967fabf10e55d881caa6fe192e7d4ca 40 1b568197f6f329ec5b71f66e49a732fb 808 05efe15d97db33185b64821791b01b0f,1b568197f6f329ec5b71f66e49a732fb 47 -369464a8743d2e40ad83d1375c196bdd 4296 +369464a8743d2e40ad83d1375c196bdd 4303 369464a8743d2e40ad83d1375c196bdd,66465399052d8ebe48b06329c60fee03 11 -369464a8743d2e40ad83d1375c196bdd,8967fabf10e55d881caa6fe192e7d4ca 7 369464a8743d2e40ad83d1375c196bdd,8f7de1a693aa15fb8fb7b85e7a8b5e95 41 f50fe6d6b3d9a9d305889db977969916 855 aff71f19de448514a6d9208b1fcb4e8a,d4c8b5505d7046b41d7f69b246063ebb 80 -d4c8b5505d7046b41d7f69b246063ebb 4034 -8e5dbf7df5b265ba5daf35c4dfdf8a39,d4c8b5505d7046b41d7f69b246063ebb 4 -073286a82fe47d6a370e8a7a3803f1d3,d4c8b5505d7046b41d7f69b246063ebb 90 -073286a82fe47d6a370e8a7a3803f1d3,bb4b6391a180e13f4e2448674cba6d43,d4c8b5505d7046b41d7f69b246063ebb 28 -073286a82fe47d6a370e8a7a3803f1d3,bb4b6391a180e13f4e2448674cba6d43,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 6 -073286a82fe47d6a370e8a7a3803f1d3,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 7 -073286a82fe47d6a370e8a7a3803f1d3,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 9 -073286a82fe47d6a370e8a7a3803f1d3,18dcf42d7b02fd603e05e3714fa71e95,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 10 -25413a93cd7622fa44d015d6914f344d,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb 10 -25413a93cd7622fa44d015d6914f344d,d4c8b5505d7046b41d7f69b246063ebb 7 -073286a82fe47d6a370e8a7a3803f1d3,8967fabf10e55d881caa6fe192e7d4ca,8f7de1a693aa15fb8fb7b85e7a8b5e95,d4c8b5505d7046b41d7f69b246063ebb 18 +8e5dbf7df5b265ba5daf35c4dfdf8a39 4240 +d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 32 +25413a93cd7622fa44d015d6914f344d 10 +8f7de1a693aa15fb8fb7b85e7a8b5e95 60 18498579d89483ac270e8cca57f34752 817 18498579d89483ac270e8cca57f34752,fca8c375152e127d322a99bbd10e9873 38 -8e5dbf7df5b265ba5daf35c4dfdf8a39 4223 8e5dbf7df5b265ba5daf35c4dfdf8a39,bcb3fa7872cb54537cdf34c96b01ed15 58 8e5dbf7df5b265ba5daf35c4dfdf8a39,90919d6d6f101925c82108fb86ec0621 4 -8e5dbf7df5b265ba5daf35c4dfdf8a39,8f7de1a693aa15fb8fb7b85e7a8b5e95 55 -66465399052d8ebe48b06329c60fee03,8e5dbf7df5b265ba5daf35c4dfdf8a39 13 +66465399052d8ebe48b06329c60fee03 13 cfa288e041fc4dbe299d20ae1920d258 828 a92a239d3a5652d3b9daba7de6febb97,cfa288e041fc4dbe299d20ae1920d258 27 -f3bb0c19ab2a88d82cf79f9311ffa61f 4255 +f3bb0c19ab2a88d82cf79f9311ffa61f 4258 164098aa7cfeff45d651fd5554538bbe,f3bb0c19ab2a88d82cf79f9311ffa61f 71 -8967fabf10e55d881caa6fe192e7d4ca,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a,f3bb0c19ab2a88d82cf79f9311ffa61f 75 -8967fabf10e55d881caa6fe192e7d4ca,f2e5593769729ebb8aab356ba78a7e3a,f3bb0c19ab2a88d82cf79f9311ffa61f 2 -8967fabf10e55d881caa6fe192e7d4ca,8f7de1a693aa15fb8fb7b85e7a8b5e95,f3bb0c19ab2a88d82cf79f9311ffa61f 3 +f2e5593769729ebb8aab356ba78a7e3a 4275 64747625066ac7d3477cca8ecfa46ed1 785 105c85412229b45439db1f03c3f064f4,64747625066ac7d3477cca8ecfa46ed1 67 64747625066ac7d3477cca8ecfa46ed1,ea27ac2def0e1d80c59cdb38b6dfa66a 3 -f2e5593769729ebb8aab356ba78a7e3a 4171 -7e388530fa03aa2ab21b3cd10a438f5a,90919d6d6f101925c82108fb86ec0621,f2e5593769729ebb8aab356ba78a7e3a 2 -90919d6d6f101925c82108fb86ec0621,f2e5593769729ebb8aab356ba78a7e3a 9 -bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 18 -66465399052d8ebe48b06329c60fee03,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 96 -66465399052d8ebe48b06329c60fee03,8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 20 -8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 18 -8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a 9 -8967fabf10e55d881caa6fe192e7d4ca,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a 38 +90919d6d6f101925c82108fb86ec0621 2 +bb4b6391a180e13f4e2448674cba6d43 4299 +8967fabf10e55d881caa6fe192e7d4ca 30 674dda50f4e91de9d4a88cc341173974,f2e5593769729ebb8aab356ba78a7e3a 49 5a2ff3fb844d5647987da5c194d1c728 855 -bb4b6391a180e13f4e2448674cba6d43 4130 9650638ee16a4853495610120e1323f8,bb4b6391a180e13f4e2448674cba6d43 76 25413a93cd7622fa44d015d6914f344d,9650638ee16a4853495610120e1323f8,bb4b6391a180e13f4e2448674cba6d43 5 25413a93cd7622fa44d015d6914f344d,bb4b6391a180e13f4e2448674cba6d43 14 -164098aa7cfeff45d651fd5554538bbe,bb4b6391a180e13f4e2448674cba6d43 16 -164098aa7cfeff45d651fd5554538bbe,bb4b6391a180e13f4e2448674cba6d43,f62948ca6d4385c880321727ba494bec 2 c255c2971f8d977c0407d6c7da877610 855 c4d18d9fa581d3a791a3f2cfd40aefd6 4308 bfb9ef2dde7090760392e8b2701e13b4 762 @@ -73,80 +49,47 @@ bfb9ef2dde7090760392e8b2701e13b4,ddbae40728836c4c245f9ccb7e651515 93 5be0e52ccfe573a6e42e4dd1e8658105,fafafee7c250c76b7e0571fde286022e 26 f62948ca6d4385c880321727ba494bec 122 13417ecbb38b9a159e3ca8c9dade7088 853 -c16ac825052c0456069f6408652aadf8 4203 +c16ac825052c0456069f6408652aadf8 4221 799371fe289fbe0046797729659cb5ff,c16ac825052c0456069f6408652aadf8 94 -51a185c08bf7fac70a60abfeaefc2c90,c16ac825052c0456069f6408652aadf8 18 -74ffef78af08b12ba1c7876206570e26,8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2,c16ac825052c0456069f6408652aadf8 12 -8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2,c16ac825052c0456069f6408652aadf8 43 +b8843efbd6adaa261a01518dc2a39aa2 80 +74ffef78af08b12ba1c7876206570e26 4192 +8f2731708b30e1c402c6d6a69a983fe4 28 50044914d5111c5b5ec58c9d720e3b2d 855 -c472bb8d63e19218a4089821ae666db3 4214 +c472bb8d63e19218a4089821ae666db3 4324 b8843efbd6adaa261a01518dc2a39aa2,c472bb8d63e19218a4089821ae666db3 11 -a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 37 -2c4b8564bbbdf70c6560fdefdbe3ef6a,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 5 -2c4b8564bbbdf70c6560fdefdbe3ef6a,51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 8 -51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 4 -261f7e63beed04e2500a9d8ae867e4ab,51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 28 -261f7e63beed04e2500a9d8ae867e4ab,51a185c08bf7fac70a60abfeaefc2c90,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 75 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 16 -51a185c08bf7fac70a60abfeaefc2c90,c472bb8d63e19218a4089821ae666db3 46 -105e63346a01d88e8339eddf9131c435,51a185c08bf7fac70a60abfeaefc2c90,c472bb8d63e19218a4089821ae666db3 12 -51a185c08bf7fac70a60abfeaefc2c90,5be09ea29709972af27bf90073843f8d,c472bb8d63e19218a4089821ae666db3 12 +a593b1c67f90cd58523762c11773dce3 29 +2c4b8564bbbdf70c6560fdefdbe3ef6a,a593b1c67f90cd58523762c11773dce3 13 +261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26 132 3ec680649615da0685b8c245e0f196e2 855 -184a72815a2ba5949635cc38769cedd0 4075 -184a72815a2ba5949635cc38769cedd0,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 10 -184a72815a2ba5949635cc38769cedd0,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc 1 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc 10 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3,e4f5f60483b68badd6caf4cc2812d2bc 4 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3 70 -184a72815a2ba5949635cc38769cedd0,74ffef78af08b12ba1c7876206570e26,c052e7e5a036bddc9f364324b203f1b3 3 -184a72815a2ba5949635cc38769cedd0,74ffef78af08b12ba1c7876206570e26 12 +184a72815a2ba5949635cc38769cedd0 4090 +c3415e16a664399ecec286ad7ae48e07 7 +c052e7e5a036bddc9f364324b203f1b3,e4f5f60483b68badd6caf4cc2812d2bc 14 +c052e7e5a036bddc9f364324b203f1b3 4266 184a72815a2ba5949635cc38769cedd0,6fb2de47c835bd9ab026c02d62f49807 134 184a72815a2ba5949635cc38769cedd0,cb86faf105b192fd9281b7f8e00f4fae 86 1e38bd82670c3f10982f70390c599a8d 831 1e38bd82670c3f10982f70390c599a8d,db22dfc14799b1aa666eb7d571cf04ec 24 dc3aea9f418eccd5ea6326c6d0d47e9c 4157 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,dc3aea9f418eccd5ea6326c6d0d47e9c 14 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,d8dd1f9c56425f5874687a1ec67a31e8,dc3aea9f418eccd5ea6326c6d0d47e9c 4 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,d8dd1f9c56425f5874687a1ec67a31e8,dc3aea9f418eccd5ea6326c6d0d47e9c 75 -9c467defbbea2e4be338eecd26c87e4c,dc3aea9f418eccd5ea6326c6d0d47e9c 6 -9c467defbbea2e4be338eecd26c87e4c,bc94073196b0b2c13e62b5fa47c76b51,dc3aea9f418eccd5ea6326c6d0d47e9c 5 +799371fe289fbe0046797729659cb5ff 49 +51a185c08bf7fac70a60abfeaefc2c90,d8dd1f9c56425f5874687a1ec67a31e8 158 +9c467defbbea2e4be338eecd26c87e4c 11 dc3aea9f418eccd5ea6326c6d0d47e9c,f9f80587acc7c2ac0e973ed343f69cfe 142 5be09ea29709972af27bf90073843f8d,dc3aea9f418eccd5ea6326c6d0d47e9c,f9f80587acc7c2ac0e973ed343f69cfe 11 cf0acc0bc840779c02b16245aa59d35c 855 -74ffef78af08b12ba1c7876206570e26 4118 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26 12 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3 2 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,b8843efbd6adaa261a01518dc2a39aa2 8 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,b8843efbd6adaa261a01518dc2a39aa2,c3415e16a664399ecec286ad7ae48e07 19 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2,bc94073196b0b2c13e62b5fa47c76b51 6 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2 4 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2,e4f5f60483b68badd6caf4cc2812d2bc 10 -74ffef78af08b12ba1c7876206570e26,b8843efbd6adaa261a01518dc2a39aa2,e4f5f60483b68badd6caf4cc2812d2bc 19 -74ffef78af08b12ba1c7876206570e26,8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2 38 -105e63346a01d88e8339eddf9131c435,74ffef78af08b12ba1c7876206570e26 58 +261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c 20 d915f009b3e02030ab68baf1c43c55ad 855 -c052e7e5a036bddc9f364324b203f1b3 4199 -261f7e63beed04e2500a9d8ae867e4ab,c052e7e5a036bddc9f364324b203f1b3 89 +261f7e63beed04e2500a9d8ae867e4ab 84 8f2731708b30e1c402c6d6a69a983fe4,c052e7e5a036bddc9f364324b203f1b3 24 90248144d7173c1f1481008c43e65129 855 -ac9583b43abdb40e8da76e4dedccd534 4144 +ac9583b43abdb40e8da76e4dedccd534 4189 a593b1c67f90cd58523762c11773dce3,ac9583b43abdb40e8da76e4dedccd534 40 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,d8dd1f9c56425f5874687a1ec67a31e8,f9f80587acc7c2ac0e973ed343f69cfe 4 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,f9f80587acc7c2ac0e973ed343f69cfe 32 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,f9f80587acc7c2ac0e973ed343f69cfe 2 -51a185c08bf7fac70a60abfeaefc2c90,ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 40 -ac9583b43abdb40e8da76e4dedccd534,c524a7c04519de87cb28b875146ac9ac 2 -ac9583b43abdb40e8da76e4dedccd534,c524a7c04519de87cb28b875146ac9ac,cb86faf105b192fd9281b7f8e00f4fae 3 -ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,c524a7c04519de87cb28b875146ac9ac,cb86faf105b192fd9281b7f8e00f4fae 17 +c3415e16a664399ecec286ad7ae48e07,c524a7c04519de87cb28b875146ac9ac 17 2c4b8564bbbdf70c6560fdefdbe3ef6a,ac9583b43abdb40e8da76e4dedccd534 67 539d334104090326aa72b38e4b353cc0 855 -51a185c08bf7fac70a60abfeaefc2c90 4039 -51a185c08bf7fac70a60abfeaefc2c90,d8dd1f9c56425f5874687a1ec67a31e8 75 -51a185c08bf7fac70a60abfeaefc2c90,bc94073196b0b2c13e62b5fa47c76b51,cb86faf105b192fd9281b7f8e00f4fae 5 -51a185c08bf7fac70a60abfeaefc2c90,bc94073196b0b2c13e62b5fa47c76b51 10 -51a185c08bf7fac70a60abfeaefc2c90,5be09ea29709972af27bf90073843f8d 41 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff 1 -51a185c08bf7fac70a60abfeaefc2c90,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 4 +51a185c08bf7fac70a60abfeaefc2c90 4049 +cb86faf105b192fd9281b7f8e00f4fae 5 +5be09ea29709972af27bf90073843f8d 41 +f9f80587acc7c2ac0e973ed343f69cfe 4 d706ac56d212d11e1c12ec391c4ea01d 855 565c967a46f12cd0bf458fe366cdc6db 4340 565c967a46f12cd0bf458fe366cdc6db,c3415e16a664399ecec286ad7ae48e07 15 diff --git a/data/test/ropebwt/LineA_2_readMapping.txt b/data/test/ropebwt/LineA_2_readMapping.txt index bbdb453f..94b1390a 100644 --- a/data/test/ropebwt/LineA_2_readMapping.txt +++ b/data/test/ropebwt/LineA_2_readMapping.txt @@ -2,68 +2,44 @@ #filename1=data/test/kmerReadMapping/simulatedReads/LineA_2.fq HapIds count 12f0cec9102e84a161866e37072443b7 403 -3149b3144f93134eb29661bade697fc6 4210 -25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6,d4c8b5505d7046b41d7f69b246063ebb 112 -164098aa7cfeff45d651fd5554538bbe,25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6,d4c8b5505d7046b41d7f69b246063ebb 8 -164098aa7cfeff45d651fd5554538bbe,25413a93cd7622fa44d015d6914f344d,3149b3144f93134eb29661bade697fc6 20 -164098aa7cfeff45d651fd5554538bbe,3149b3144f93134eb29661bade697fc6 9 -164098aa7cfeff45d651fd5554538bbe,3149b3144f93134eb29661bade697fc6,bb4b6391a180e13f4e2448674cba6d43 32 +3149b3144f93134eb29661bade697fc6 4217 +25413a93cd7622fa44d015d6914f344d 156 +164098aa7cfeff45d651fd5554538bbe 49 3149b3144f93134eb29661bade697fc6,8967fabf10e55d881caa6fe192e7d4ca 40 1b568197f6f329ec5b71f66e49a732fb 808 05efe15d97db33185b64821791b01b0f,1b568197f6f329ec5b71f66e49a732fb 47 -369464a8743d2e40ad83d1375c196bdd 4296 +369464a8743d2e40ad83d1375c196bdd 4303 369464a8743d2e40ad83d1375c196bdd,66465399052d8ebe48b06329c60fee03 11 -369464a8743d2e40ad83d1375c196bdd,8967fabf10e55d881caa6fe192e7d4ca 7 369464a8743d2e40ad83d1375c196bdd,8f7de1a693aa15fb8fb7b85e7a8b5e95 41 f50fe6d6b3d9a9d305889db977969916 855 aff71f19de448514a6d9208b1fcb4e8a,d4c8b5505d7046b41d7f69b246063ebb 80 -d4c8b5505d7046b41d7f69b246063ebb 4034 -8e5dbf7df5b265ba5daf35c4dfdf8a39,d4c8b5505d7046b41d7f69b246063ebb 4 -073286a82fe47d6a370e8a7a3803f1d3,d4c8b5505d7046b41d7f69b246063ebb 90 -073286a82fe47d6a370e8a7a3803f1d3,bb4b6391a180e13f4e2448674cba6d43,d4c8b5505d7046b41d7f69b246063ebb 28 -073286a82fe47d6a370e8a7a3803f1d3,bb4b6391a180e13f4e2448674cba6d43,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 6 -073286a82fe47d6a370e8a7a3803f1d3,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 7 -073286a82fe47d6a370e8a7a3803f1d3,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 9 -073286a82fe47d6a370e8a7a3803f1d3,18dcf42d7b02fd603e05e3714fa71e95,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 10 -25413a93cd7622fa44d015d6914f344d,57705b1e2541c7634ea59a48fc52026f,d4c8b5505d7046b41d7f69b246063ebb 10 -25413a93cd7622fa44d015d6914f344d,d4c8b5505d7046b41d7f69b246063ebb 7 -073286a82fe47d6a370e8a7a3803f1d3,8967fabf10e55d881caa6fe192e7d4ca,8f7de1a693aa15fb8fb7b85e7a8b5e95,d4c8b5505d7046b41d7f69b246063ebb 18 +d4c8b5505d7046b41d7f69b246063ebb 4152 +8e5dbf7df5b265ba5daf35c4dfdf8a39 4282 +d4c8b5505d7046b41d7f69b246063ebb,f2a26406aaa6aefdc20ccdf8cb313dff 32 +8f7de1a693aa15fb8fb7b85e7a8b5e95 21 18498579d89483ac270e8cca57f34752 817 18498579d89483ac270e8cca57f34752,fca8c375152e127d322a99bbd10e9873 38 -8e5dbf7df5b265ba5daf35c4dfdf8a39 4223 8e5dbf7df5b265ba5daf35c4dfdf8a39,bcb3fa7872cb54537cdf34c96b01ed15 58 8e5dbf7df5b265ba5daf35c4dfdf8a39,90919d6d6f101925c82108fb86ec0621 4 -8e5dbf7df5b265ba5daf35c4dfdf8a39,8f7de1a693aa15fb8fb7b85e7a8b5e95 55 -66465399052d8ebe48b06329c60fee03,8e5dbf7df5b265ba5daf35c4dfdf8a39 13 +66465399052d8ebe48b06329c60fee03 13 cfa288e041fc4dbe299d20ae1920d258 828 a92a239d3a5652d3b9daba7de6febb97,cfa288e041fc4dbe299d20ae1920d258 27 -f3bb0c19ab2a88d82cf79f9311ffa61f 4255 +f3bb0c19ab2a88d82cf79f9311ffa61f 4332 164098aa7cfeff45d651fd5554538bbe,f3bb0c19ab2a88d82cf79f9311ffa61f 71 -8967fabf10e55d881caa6fe192e7d4ca,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a,f3bb0c19ab2a88d82cf79f9311ffa61f 75 -8967fabf10e55d881caa6fe192e7d4ca,f2e5593769729ebb8aab356ba78a7e3a,f3bb0c19ab2a88d82cf79f9311ffa61f 2 -8967fabf10e55d881caa6fe192e7d4ca,8f7de1a693aa15fb8fb7b85e7a8b5e95,f3bb0c19ab2a88d82cf79f9311ffa61f 3 64747625066ac7d3477cca8ecfa46ed1 785 105c85412229b45439db1f03c3f064f4,64747625066ac7d3477cca8ecfa46ed1 67 64747625066ac7d3477cca8ecfa46ed1,ea27ac2def0e1d80c59cdb38b6dfa66a 3 -f2e5593769729ebb8aab356ba78a7e3a 4171 -7e388530fa03aa2ab21b3cd10a438f5a,90919d6d6f101925c82108fb86ec0621,f2e5593769729ebb8aab356ba78a7e3a 2 -90919d6d6f101925c82108fb86ec0621,f2e5593769729ebb8aab356ba78a7e3a 9 -bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 18 -66465399052d8ebe48b06329c60fee03,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 96 -66465399052d8ebe48b06329c60fee03,8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 20 -8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,f2e5593769729ebb8aab356ba78a7e3a 18 -8967fabf10e55d881caa6fe192e7d4ca,bb4b6391a180e13f4e2448674cba6d43,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a 9 -8967fabf10e55d881caa6fe192e7d4ca,c4d18d9fa581d3a791a3f2cfd40aefd6,f2e5593769729ebb8aab356ba78a7e3a 38 +f2e5593769729ebb8aab356ba78a7e3a 4285 +90919d6d6f101925c82108fb86ec0621 11 +8967fabf10e55d881caa6fe192e7d4ca 60 +c4d18d9fa581d3a791a3f2cfd40aefd6 4333 674dda50f4e91de9d4a88cc341173974,f2e5593769729ebb8aab356ba78a7e3a 49 5a2ff3fb844d5647987da5c194d1c728 855 -bb4b6391a180e13f4e2448674cba6d43 4130 +bb4b6391a180e13f4e2448674cba6d43 4134 9650638ee16a4853495610120e1323f8,bb4b6391a180e13f4e2448674cba6d43 76 25413a93cd7622fa44d015d6914f344d,9650638ee16a4853495610120e1323f8,bb4b6391a180e13f4e2448674cba6d43 5 25413a93cd7622fa44d015d6914f344d,bb4b6391a180e13f4e2448674cba6d43 14 -164098aa7cfeff45d651fd5554538bbe,bb4b6391a180e13f4e2448674cba6d43 16 -164098aa7cfeff45d651fd5554538bbe,bb4b6391a180e13f4e2448674cba6d43,f62948ca6d4385c880321727ba494bec 2 c255c2971f8d977c0407d6c7da877610 855 -c4d18d9fa581d3a791a3f2cfd40aefd6 4308 bfb9ef2dde7090760392e8b2701e13b4 762 bfb9ef2dde7090760392e8b2701e13b4,ddbae40728836c4c245f9ccb7e651515 93 5ae13475e95253ee352dda4875dc9f7c 4328 @@ -73,80 +49,47 @@ bfb9ef2dde7090760392e8b2701e13b4,ddbae40728836c4c245f9ccb7e651515 93 5be0e52ccfe573a6e42e4dd1e8658105,fafafee7c250c76b7e0571fde286022e 26 f62948ca6d4385c880321727ba494bec 122 13417ecbb38b9a159e3ca8c9dade7088 403 -c16ac825052c0456069f6408652aadf8 4203 +c16ac825052c0456069f6408652aadf8 4221 799371fe289fbe0046797729659cb5ff,c16ac825052c0456069f6408652aadf8 94 -51a185c08bf7fac70a60abfeaefc2c90,c16ac825052c0456069f6408652aadf8 18 -74ffef78af08b12ba1c7876206570e26,8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2,c16ac825052c0456069f6408652aadf8 12 -8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2,c16ac825052c0456069f6408652aadf8 43 +8f2731708b30e1c402c6d6a69a983fe4 93 50044914d5111c5b5ec58c9d720e3b2d 855 -c472bb8d63e19218a4089821ae666db3 4214 +c472bb8d63e19218a4089821ae666db3 4321 b8843efbd6adaa261a01518dc2a39aa2,c472bb8d63e19218a4089821ae666db3 11 -a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 37 -2c4b8564bbbdf70c6560fdefdbe3ef6a,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 5 -2c4b8564bbbdf70c6560fdefdbe3ef6a,51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 8 -51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 4 -261f7e63beed04e2500a9d8ae867e4ab,51a185c08bf7fac70a60abfeaefc2c90,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 28 -261f7e63beed04e2500a9d8ae867e4ab,51a185c08bf7fac70a60abfeaefc2c90,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 75 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,c472bb8d63e19218a4089821ae666db3 16 -51a185c08bf7fac70a60abfeaefc2c90,c472bb8d63e19218a4089821ae666db3 46 -105e63346a01d88e8339eddf9131c435,51a185c08bf7fac70a60abfeaefc2c90,c472bb8d63e19218a4089821ae666db3 12 -51a185c08bf7fac70a60abfeaefc2c90,5be09ea29709972af27bf90073843f8d,c472bb8d63e19218a4089821ae666db3 12 +2c4b8564bbbdf70c6560fdefdbe3ef6a,a593b1c67f90cd58523762c11773dce3 13 +261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26 132 +51a185c08bf7fac70a60abfeaefc2c90 4128 3ec680649615da0685b8c245e0f196e2 855 -184a72815a2ba5949635cc38769cedd0 4075 -184a72815a2ba5949635cc38769cedd0,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 10 -184a72815a2ba5949635cc38769cedd0,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc 1 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc 10 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3,e4f5f60483b68badd6caf4cc2812d2bc 4 -184a72815a2ba5949635cc38769cedd0,c052e7e5a036bddc9f364324b203f1b3 70 -184a72815a2ba5949635cc38769cedd0,74ffef78af08b12ba1c7876206570e26,c052e7e5a036bddc9f364324b203f1b3 3 -184a72815a2ba5949635cc38769cedd0,74ffef78af08b12ba1c7876206570e26 12 +184a72815a2ba5949635cc38769cedd0 4171 +c052e7e5a036bddc9f364324b203f1b3,e4f5f60483b68badd6caf4cc2812d2bc 14 184a72815a2ba5949635cc38769cedd0,6fb2de47c835bd9ab026c02d62f49807 134 184a72815a2ba5949635cc38769cedd0,cb86faf105b192fd9281b7f8e00f4fae 86 1e38bd82670c3f10982f70390c599a8d 831 1e38bd82670c3f10982f70390c599a8d,db22dfc14799b1aa666eb7d571cf04ec 24 -dc3aea9f418eccd5ea6326c6d0d47e9c 4157 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,dc3aea9f418eccd5ea6326c6d0d47e9c 14 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,d8dd1f9c56425f5874687a1ec67a31e8,dc3aea9f418eccd5ea6326c6d0d47e9c 4 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,d8dd1f9c56425f5874687a1ec67a31e8,dc3aea9f418eccd5ea6326c6d0d47e9c 75 -9c467defbbea2e4be338eecd26c87e4c,dc3aea9f418eccd5ea6326c6d0d47e9c 6 -9c467defbbea2e4be338eecd26c87e4c,bc94073196b0b2c13e62b5fa47c76b51,dc3aea9f418eccd5ea6326c6d0d47e9c 5 +dc3aea9f418eccd5ea6326c6d0d47e9c 4171 +51a185c08bf7fac70a60abfeaefc2c90,d8dd1f9c56425f5874687a1ec67a31e8 158 +9c467defbbea2e4be338eecd26c87e4c 6 +bc94073196b0b2c13e62b5fa47c76b51 5 dc3aea9f418eccd5ea6326c6d0d47e9c,f9f80587acc7c2ac0e973ed343f69cfe 142 5be09ea29709972af27bf90073843f8d,dc3aea9f418eccd5ea6326c6d0d47e9c,f9f80587acc7c2ac0e973ed343f69cfe 11 cf0acc0bc840779c02b16245aa59d35c 855 74ffef78af08b12ba1c7876206570e26 4118 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26 12 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3 2 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,b8843efbd6adaa261a01518dc2a39aa2 8 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,a593b1c67f90cd58523762c11773dce3,b8843efbd6adaa261a01518dc2a39aa2,c3415e16a664399ecec286ad7ae48e07 19 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2,bc94073196b0b2c13e62b5fa47c76b51 6 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2 4 -261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c,b8843efbd6adaa261a01518dc2a39aa2,e4f5f60483b68badd6caf4cc2812d2bc 10 -74ffef78af08b12ba1c7876206570e26,b8843efbd6adaa261a01518dc2a39aa2,e4f5f60483b68badd6caf4cc2812d2bc 19 -74ffef78af08b12ba1c7876206570e26,8f2731708b30e1c402c6d6a69a983fe4,b8843efbd6adaa261a01518dc2a39aa2 38 -105e63346a01d88e8339eddf9131c435,74ffef78af08b12ba1c7876206570e26 58 +261f7e63beed04e2500a9d8ae867e4ab,74ffef78af08b12ba1c7876206570e26,9c467defbbea2e4be338eecd26c87e4c 20 +b8843efbd6adaa261a01518dc2a39aa2 19 +105e63346a01d88e8339eddf9131c435 58 d915f009b3e02030ab68baf1c43c55ad 855 c052e7e5a036bddc9f364324b203f1b3 4199 -261f7e63beed04e2500a9d8ae867e4ab,c052e7e5a036bddc9f364324b203f1b3 89 +261f7e63beed04e2500a9d8ae867e4ab 89 8f2731708b30e1c402c6d6a69a983fe4,c052e7e5a036bddc9f364324b203f1b3 24 90248144d7173c1f1481008c43e65129 855 ac9583b43abdb40e8da76e4dedccd534 4144 a593b1c67f90cd58523762c11773dce3,ac9583b43abdb40e8da76e4dedccd534 40 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,d8dd1f9c56425f5874687a1ec67a31e8,f9f80587acc7c2ac0e973ed343f69cfe 4 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,f9f80587acc7c2ac0e973ed343f69cfe 32 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff,ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,f9f80587acc7c2ac0e973ed343f69cfe 2 -51a185c08bf7fac70a60abfeaefc2c90,ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 40 -ac9583b43abdb40e8da76e4dedccd534,c524a7c04519de87cb28b875146ac9ac 2 -ac9583b43abdb40e8da76e4dedccd534,c524a7c04519de87cb28b875146ac9ac,cb86faf105b192fd9281b7f8e00f4fae 3 -ac9583b43abdb40e8da76e4dedccd534,c3415e16a664399ecec286ad7ae48e07,c524a7c04519de87cb28b875146ac9ac,cb86faf105b192fd9281b7f8e00f4fae 17 +f9f80587acc7c2ac0e973ed343f69cfe 32 +c3415e16a664399ecec286ad7ae48e07 2 +e4f5f60483b68badd6caf4cc2812d2bc 44 +c524a7c04519de87cb28b875146ac9ac 5 +c3415e16a664399ecec286ad7ae48e07,c524a7c04519de87cb28b875146ac9ac 17 2c4b8564bbbdf70c6560fdefdbe3ef6a,ac9583b43abdb40e8da76e4dedccd534 67 539d334104090326aa72b38e4b353cc0 855 -51a185c08bf7fac70a60abfeaefc2c90 4039 -51a185c08bf7fac70a60abfeaefc2c90,d8dd1f9c56425f5874687a1ec67a31e8 75 -51a185c08bf7fac70a60abfeaefc2c90,bc94073196b0b2c13e62b5fa47c76b51,cb86faf105b192fd9281b7f8e00f4fae 5 -51a185c08bf7fac70a60abfeaefc2c90,bc94073196b0b2c13e62b5fa47c76b51 10 -51a185c08bf7fac70a60abfeaefc2c90,5be09ea29709972af27bf90073843f8d 41 -51a185c08bf7fac70a60abfeaefc2c90,799371fe289fbe0046797729659cb5ff 1 -51a185c08bf7fac70a60abfeaefc2c90,c3415e16a664399ecec286ad7ae48e07,e4f5f60483b68badd6caf4cc2812d2bc,f9f80587acc7c2ac0e973ed343f69cfe 4 d706ac56d212d11e1c12ec391c4ea01d 855 565c967a46f12cd0bf458fe366cdc6db 4340 565c967a46f12cd0bf458fe366cdc6db,c3415e16a664399ecec286ad7ae48e07 15 diff --git a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt index 17c67993..07dce236 100644 --- a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt +++ b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt @@ -204,15 +204,15 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT //remove any hits that are not the longest val bestHits = tempMems.filter { it.readEnd - it.readStart == maxLength } - val bestHapIds = bestHits.flatMap { it.listMemHits.map{hits -> hits.contig} }.toSet() - - val filteredBestHapIds = filterToOneReferenceRange(bestHapIds, hapIdToRefRangeMap) - - val totalNumHits = filteredBestHapIds.size + val totalNumHits = bestHits.sumOf { it.numHits } if(totalNumHits <= maxNumHits) { - //if the total number of hits is less than the maxNumHits, add all the hits to the readMapping + //if the total number of hits is less than the maxNumHits, Filter the haps down to a single ref range and then add all those to the readMapping //First turn the hapIds into a set then back to a list and sort so it will be consistent + val bestHapIds = bestHits.flatMap { it.listMemHits.map{hits -> hits.contig} }.toSet() + + val filteredBestHapIds = filterToOneReferenceRange(bestHapIds, hapIdToRefRangeMap) + val hapIdsHit = filteredBestHapIds.sorted() readMapping[hapIdsHit] = (readMapping[hapIdsHit]?:0) + 1 } diff --git a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt index 94463874..b889d80d 100644 --- a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt +++ b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt @@ -55,25 +55,30 @@ class MapReadsTest { fun testCliktParams() { val mapReads = MapReads() - val noIndex = mapReads.test("--read-files test1.fq --output-dir testDir") + val noIndex = mapReads.test("--read-files test1.fq --output-dir testDir --hvcf-dir ${TestExtension.smallSeqInputDir}") assertEquals(1, noIndex.statusCode) assertEquals("Usage: map-reads []\n\n" + "Error: missing option --index\n", noIndex.stderr) - val noReads = mapReads.test("--index testIndex --output-dir testDir") + val noReads = mapReads.test("--index testIndex --output-dir testDir --hvcf-dir ${TestExtension.smallSeqInputDir}") assertEquals(1, noReads.statusCode) assertEquals("Usage: map-reads []\n\n" + "Error: must provide one of --key-file, --read-files\n", noReads.stderr) - val bothReadInputs = mapReads.test("--index testIndex --key-file testKeyFile --read-files test1.fq --output-dir testDir") + val bothReadInputs = mapReads.test("--index testIndex --key-file testKeyFile --read-files test1.fq --output-dir testDir --hvcf-dir ${TestExtension.smallSeqInputDir}") assertEquals(1, bothReadInputs.statusCode) assertEquals("Usage: map-reads []\n\n" + "Error: option --key-file cannot be used with --read-files\n", bothReadInputs.stderr) - val noOutputDir = mapReads.test("--index testIndex --read-files test1.fq") + val noOutputDir = mapReads.test("--index testIndex --read-files test1.fq --hvcf-dir ${TestExtension.smallSeqInputDir}") assertEquals(1, noOutputDir.statusCode) assertEquals("Usage: map-reads []\n\n" + "Error: missing option --output-dir\n", noOutputDir.stderr) + + val noHvcfDir = mapReads.test("--index testIndex --read-files test1.fq --output-dir testDir") + assertEquals(1, noHvcfDir.statusCode) + assertEquals("Usage: map-reads []\n\n" + + "Error: missing option --hvcf-dir\n", noHvcfDir.stderr) } @Test @@ -174,6 +179,8 @@ class MapReadsTest { val mapReads = MapReads() val readMapping = mapReads.createReadMappingsForFileReader(reader, 5, hapIdToRefRangeMap) + println(readMapping) + assertEquals(2, readMapping.size) assertTrue(readMapping.keys.contains(listOf("hap1","hap2","hap3"))) assertEquals(2, readMapping[listOf("hap1","hap2","hap3")]) @@ -206,10 +213,7 @@ class MapReadsTest { val outputDir = "${TestExtension.tempDir}ropebwtTest/" val outputFile = "$outputDir/LineA_1_readMapping.txt" - - val hvcfFiles = File(TestExtension.smallSeqInputDir).walkTopDown().filter { it.isFile } - .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${TestExtension.smallSeqInputDir}/${it.name}" } - .toList() + val hvcfFiles = listOf("data/test/smallseq/LineA.h.vcf", "data/test/smallseq/LineB.h.vcf", "data/test/smallseq/Ref.h.vcf") val graph = HaplotypeGraph(hvcfFiles) @@ -222,6 +226,10 @@ class MapReadsTest { val observedLines = bufferedReader(outputFile).readLines() val observedReadMap = observedLines.filter { !it.startsWith("#") }.filter { !it.startsWith("HapId") }.map { it.split("\t") }.associate { it[0] to it[1].toInt() } + val sum = observedReadMap.values.sum() + println(sum) + println("expected: " + expectedReadMap.values.sum()) + assertEquals(expectedReadMap.size, observedReadMap.size) expectedReadMap.keys.forEach { assertEquals(expectedReadMap[it], observedReadMap[it]) } @@ -235,9 +243,7 @@ class MapReadsTest { val outputDir = "${TestExtension.tempDir}ropebwtTest/" val index = "data/test/ropebwt/testIndex.fmd" - val hvcfFiles = File(TestExtension.smallSeqInputDir).walkTopDown().filter { it.isFile } - .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${TestExtension.smallSeqInputDir}/${it.name}" } - .toList() + val hvcfFiles = listOf("data/test/smallseq/LineA.h.vcf", "data/test/smallseq/LineB.h.vcf", "data/test/smallseq/Ref.h.vcf") val graph = HaplotypeGraph(hvcfFiles) From a0a0c245e2927a3a604b8602d787b3cffadac00d Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Fri, 17 Jan 2025 09:38:52 -0500 Subject: [PATCH 3/7] Adding in unit test for refRange filter --- .../phgv2/pathing/ropebwt/MapReadsTest.kt | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt index b889d80d..47956c75 100644 --- a/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt +++ b/src/test/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReadsTest.kt @@ -286,4 +286,34 @@ class MapReadsTest { assertEquals(expectedLines.size, observedLines.size) expectedLines.indices.forEach { assertEquals(expectedLines[it], observedLines[it]) } } + + @Test + fun testFilterToOneReferenceRange() { + val mapReads = MapReads() + val hapIdToRefRangeMap = mapOf("hap1" to listOf(ReferenceRange("chr1",100,200)), + "hap2" to listOf(ReferenceRange("chr1",100,200)), + "hap3" to listOf(ReferenceRange("chr1",100,200)), + "hap4" to listOf(ReferenceRange("chr1",200,300)), + "hap5" to listOf(ReferenceRange("chr1",200,300))) + + + + //all one refRange + val allOneHapIdsSet = setOf("hap1","hap2","hap3") + val expectedAllOne = setOf("hap1","hap2","hap3") + val filteredAllOne = mapReads.filterToOneReferenceRange(allOneHapIdsSet, hapIdToRefRangeMap) + assertEquals(3, filteredAllOne.size) + assertTrue(filteredAllOne.toSet().containsAll(expectedAllOne)) + assertTrue(expectedAllOne.containsAll(filteredAllOne.toSet())) + + + val threeVTwo = setOf("hap1","hap2","hap3","hap4","hap5") + val expectedThree = setOf("hap1","hap2","hap3") + val filteredThree = mapReads.filterToOneReferenceRange(threeVTwo, hapIdToRefRangeMap) + assertEquals(3, filteredThree.size) + assertTrue(filteredThree.toSet().containsAll(expectedThree)) + assertTrue(expectedThree.containsAll(filteredThree.toSet())) + + + } } \ No newline at end of file From 8af4699557757d1929f29d33f4b9973906435ee5 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 23 Jan 2025 14:53:52 -0500 Subject: [PATCH 4/7] Initial docs for ropeBWT3 imputation. --- docs/imputation.md | 4 + docs/imputation_ropebwt.md | 683 +++++++++++++++++++++++++++++++++++++ 2 files changed, 687 insertions(+) create mode 100644 docs/imputation_ropebwt.md diff --git a/docs/imputation.md b/docs/imputation.md index b73da671..4b25c22e 100644 --- a/docs/imputation.md +++ b/docs/imputation.md @@ -10,6 +10,10 @@ PHGv2 is under active development and we expect to encounter bugs. We aim to report bugs as soon as they are discovered. +!!! note + We are currently working on a new imputation pipeline that will be more efficient and accurate. + You can test this out by following the steps outlined in the "[RopeBWT3 Imputation](imputation_ropebwt.md)" documentation. + In this document, we will discuss the steps needed to perform imputation using the PHG: diff --git a/docs/imputation_ropebwt.md b/docs/imputation_ropebwt.md new file mode 100644 index 00000000..263f70aa --- /dev/null +++ b/docs/imputation_ropebwt.md @@ -0,0 +1,683 @@ +# Imputation using RopeBWT3 + +!!! note +This document is a work in progress and will be updated as the +imputation pipeline is further developed. Please check back for +updates. As of now the pipeline is in a BETA state but is functional. + +In this document, we will discuss the steps needed to perform +imputation using the ropebwt3 tool within the PHG. The steps: + +1. Export hVCF data +2. Index the pangenome from exported hVCF data +3. Map short reads +4. Find paths + +!!! note +The steps detailed in this document build on the materials from +the "[Building and Loading](build_and_load.md)" documentation. +Please review this if you have not worked with the PHG before! + +## Quick start + +* Get list of samples to export: + ```shell + phg list-samples \ + --db-path /my/db/uri \ + --data-set hvcf \ + --output-file /my/sample_names.txt + ``` + +* Export hVCF data: + ```shell + phg export-vcf \ + --db-path /my/db/uri \ + --dataset-type hvcf \ + --sample-file /my/sample_names.txt \ + --output-dir /my/hvcf/dir + ``` + +* Index pangenome: + ```shell + phg rope-bwt-index \ + --db-path /my/db/uri \ + --hvcf-dir /my/hvcf/dir \ + --output-dir /my/index/dir \ + --index-file-prefix myindex + ``` + +* Map short reads + ```shell + phg map-reads \ + --hvcf-dir /my/hvcf/dir \ + --index /my/index/dir/myindex.fmd \ + --key-file /my/path/keyfile \ + --output-dir /my/mapping/dir + ``` + +* Find paths + ```shell + phg find-paths \ + --path-keyfile /my/path/keyfile \ + --hvcf-dir /my/hvcf/dir \ + --reference-genome /my/ref/genome \ + --path-type haploid \ + --output-dir /my/imputed/hvcfs + ``` + +* OPTIONAL: Get SNPs from imputed hVCFs + ```shell + phg hvcf2gvcf \ + --hvcf-dir /my/imputed/hvcfs \ + --db-path /my/db/uri \ + --reference-file /my/ref/genome \ + --output-dir /my/gvcf/dir + ``` + +## Detailed walkthrough + +### hVCF export + +!!! note +This step is currently needed, but will be removed in the future as +we can do direct connections to the hVCF TileDB instance. + +Where we last left off in the "[Build and Load](build_and_load.md)" +steps, we had an example directory that looks similar to the +following: + +``` +phg_v2_example/ +├── data +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ └── vcf_files/ +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + +!!! note +The following steps in the rest of this document will assume we are +in the top level of our `phg_v2_example` working directory. + + +For this example imputation pipeline, we will be using haplotypes +generated from two samples in our database: + +* `LineA` +* `LineB` + +If you do not have [hVCF files](hvcf_specifications.md) for samples +you wish to impute against already generated, we will first need to +export this haplotype data in the form of hVCF data from the TileDB +instance. This is done using the `export-vcf` command: + +```shell +./phg export-vcf \ + --db-path vcf_dbs \ + --dataset-type hvcf \ + --sample-file /my/sample_names.txt \ + --output-dir output/vcf_files +``` +This command takes in 4 parameters: + +* `--db-path` - path to directory storing the TileDB instances. The + AGC compressed genomes will be placed here on completion. +* `--dataset-type` - the type of dataset to export. In this case, we + are exporting the hVCF data. +* `--sample-file` - text file with list of sample names to export, one per line. +* `--output-dir` - the directory to place the exported hVCF files. + +In our above example, we use the `--sample-file` parameter, but if +your sample list is small, you can use the `--sample-names` parameter. +An example of a sample file is as follows: + +``` +$ cat sample_names.txt + +LineA +LineB +``` + +Example command using --sample-names parameter: + +```shell +./phg export-vcf \ + --db-path vcf_dbs \ + --dataset-type hvcf \ + --sample-names LineA,LineB \ + --output-dir output/vcf_files +``` + +After the export command, our directory structure will have new +files added to the `vcf_files` directory: + +``` +phg_v2_example/ +├── data +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ └── vcf_files +│ ├── LineA.h.vcf * +│ └── LineB.h.vcf * +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + + + +### Pangenome indexing + +In order to run the later [ropebwt3 read mapping steps](#read-mapping) +, +we will first need to **build** a ropebwt3 index. +This is performed using the `rope-bwt-index` command: + + ```shell + phg rope-bwt-index \ + --db-path vcf_dbs \ + --hvcf-dir output/vcf_files \ + --output-dir output/index_files \ + --index-file-prefix myindex + ``` + + +This command has **4** required parameters: + +* `--db-path` - path to directory storing the TileDB instances and + `assemblies.agc` file made in the + "[Compress FASTA files](build_and_load.md#compress-fasta-files)" + section of the "[Build and Load](build_and_load.md)" documentation. +* `--hvcf-dir` - the directory containing the hVCF files. This is the + output directory from the `export-vcf` command. Right now this is + required, but will be optional in the future. +* `--output-dir` - the directory to place the index files. +* `--index-file-prefix` - the prefix for the index file + name. A number of files will be created all of which start with `--index-file-prefix`. + +Running rope-bwt-index creates an index file with the extension `.fmd` and some additional files for a sequence suffix array(`.ssa`) and a contig length file (`.len.gz`). +Both of these are needed to map reads to this index. +Since we have used defaults, the index files will show up in the +following `output/index_files` directory of our example: + +``` +phg_v2_example/ +├── data +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ └── vcf_files +│ │ ├── LineA.h.vcf +│ │ └── LineB.h.vcf +│ └── index_files +│ ├── myindex.fmd * +│ ├── myindex.fmd.len.gz * +│ └── myindex.fmd.ssa * +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + + +#### Optional parameters +In addition to `--index-file`, this command can take other optional +parameters: + +| Parameter name | Description | Default value | +|----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| +| `--threads` | Number of threads to use during indexing. | `3` | +| `--delete-fmr-index` | RopeBWT3 will originally output a .fmr index file. This is converted to fmd for efficiency. If this flag is set to false it will keep that original file. | `true` | +| `--no-diagnostics` or `-n` | A flag that eliminates the diagnostic report | Disabled (report is written) | + + +### Read mapping + +Now that we have a generated ropeBWT3 index file, we can efficiently +align short reads against the PHG using the `map-reads` command. To +demonstrate this, I will retrieve some example data from our +[PHGv2 GitHub test directory](https://github.com/maize-genetics/phg_v2/tree/main/data/test/kmerReadMapping/simulatedReads). +The files I will be using from this directory for this walkthrough +will be the following: + +* `LineA_LineB_1.fq` +* `LineA_LineB_2.fq` + +These files are simulated paired-end (e.g., `_1.fq` and `_2.fq`) +short reads with a length of 150 bps in +[FASTQ](https://en.wikipedia.org/wiki/FASTQ_format) format. I will +place these files under the `data` directory in sub folder called +`short_reads`: + +``` +phg_v2_example/ +├── data +│ └── short_reads * +│ │ ├── LineA_LineB_1.fq * +│ │ └── LineA_LineB_2.fq * +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ └── vcf_files +│ │ ├── LineA.h.vcf +│ │ └── LineB.h.vcf +│ └── index_files +│ ├── myindex.fmd +│ ├── myindex.fmd.len.gz +│ └── myindex.fmd.ssa +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + +Now that we have both short read data and our k-mer index file, we +can pass these to the `map-reads` command: + +```shell +./phg map-reads \ + --hvcf-dir output/vcf_files \ + --index output/index_files/myindex.fmd \ + --key-file data/key_files/read_mapping_data.txt \ + --output-dir output/read_mappings +``` + +This command has the following parameters: + +* `--hvcf-dir` - the directory containing the hVCF files. +* `--key-file` - a tab-delimited list of FASTQ files for a collection + of samples. + + In the above example, I have made a keyfile and placed it in a + subdirectory under the `data` folder called `key_files`. + + My example keyfile would look like the following: + ``` + sampleName filename filename2 + LineA_B data/short_reads/LineA_LineB_1.fq data/short_reads/LineA_LineB_2.fq + ``` + + If you have more than one sample, you would place additional + lines at the bottom of the keyfile. For example: + ``` + sampleName filename filename2 + LineA_B data/short_reads/LineA_LineB_1.fq data/short_reads/LineA_LineB_2.fq + CrossC data/short_reads/cross_c_1.fq data/short_reads/cross_c_2.fq + CrossD data/short_reads/cross_d_1.fq data/short_reads/cross_d_2.fq + ``` + + > ℹ️ **Note** + The keyfile for this parameter needs **column names**. If you + are using single-reads, the column names would be: + `sampleName` and `filename`. If you have paired-end reads (like + the example above), the column names would be `sampleName`, + `filename`, and `filename2`. + + > ℹ️ **Note** + File names must be of type "FASTQ". In other words, files must + end in the permitted extensions: `.fq`, `.fq.gz`, `.fastq`, and + `.fastq.gz`. +* `--output-dir` - the directory to place the read-mapping files. +* `--index` - the ropeBWT3 index file created by the + `ropebwt-index` command. + + + +!!! tip +If you do not want to create a keyfile and only have one sample, +you can replace the `--key-file` parameter with the `--read-files` +parameter. This parameter will take the paths to the FASTQ reads as +a comma separated list. + + For example, if we were to modify the prior example, the command + structure would look like the following: + + ```shell + ./phg map-reads \ + --hvcf-dir output/vcf_files \ + --index output/index_files/myindex.fmd \ + --read-files data/short_reads/LineA_LineB_1.fq,data/short_reads/LineA_LineB_2.fq \ + --output-dir output/read_mappings + ``` + + +#### Optional parameters +The Map Reads command can take the following optional parameters: + +| Parameter name | Description | Default value | +|--------------------|-------------------------------------------------------------------------------------------------------------------------------------------|---------------| +| `--threads` | Number of threads used for mapping | `5` | +| `--min-mem-length` | Minimum length of the Maximal Exact Match(MEM) to retain. Any alignments below this value are ignored | `148` | +| `--max-num-hits` | Maximum number of alignments to consider for a given read. If there are more than `--max-num-hits` alignments, the read will be ignored. | `50` | + + + + +Now that we have performed read mapping, we will have a new +read-mapping file in our example working directory that will be +used for the path finding algorithm in the next step: + +``` +phg_v2_example/ +├── data +│ └── short_reads +│ │ ├── LineA_LineB_1.fq +│ │ └── LineA_LineB_2.fq +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ ├── vcf_files +│ │ ├── LineA.h.vcf +│ │ └── LineB.h.vcf +│ └── read_mappings * +│ └── LineA_LineB_1_readMapping.txt * +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + +!!! note +The naming scheme for the read-mapping files follows the first +sample in the keyfile or comma-delimited list and: +1. strips the path and FASTQ extension text +2. adds the `readMapping.txt` suffix to the stripped sample name + + Additionally, $n$ amount of files will be generated for $n$ amount + of short read samples found in the keyfile or comma-delimited list. + For example, if you have 3 samples you are trying to impute, 3 read + mapping files will be generated in the given output. + + + +### Find Paths +`find-paths` is the command used to impute paths through a +[`HaplotypeGraph`](https://github.com/maize-genetics/phg_v2/blob/main/src/main/kotlin/net/maizegenetics/phgv2/api/HaplotypeGraph.kt) +object based on a set of read-mappings. The method uses read-mapping +files generated by the `map-reads` command. Running `find-paths` uses +the [Viterbi algorithm](https://en.wikipedia.org/wiki/Viterbi_algorithm) +to solve a [hidden Markov model (HMM)](https://en.wikipedia.org/wiki/Hidden_Markov_model) +to identify the set of haplotypes most likely to have generated the +observed read-mappings. Two path types are implemented: + +* `haploid` - A haploid path is a **single path** through the graph. +* `diploid` - A diploid path is **two paths** through the graph that + jointly explain the read-mappings. + +!!! note +`find-paths` can run the old kmer read mapping pipeline. +Eventually this will be transitioned over to using the ropebwt3 index +and alignment process. For now run the index and mapping step manually. + +#### Input files +`find-paths` can take either read-mapping files generated by the command +`map-kmers` or FASTQ files of sequence reads. If FASTQ files are used, +`map-kmers` is run to generate read-mappings but the intermediate +read-mapping files are not saved. **To save a copy of the read-mapping +files, `map-kmers` and `find-paths` must be run separately.** For +more information, please refer to the prior "[Read Mapping](#read-mapping)" +section for additional input details. + +!!! note +If you using read-mapping files, the files **must** end with the +`_readMapping.txt` suffix or errors will occur. + + +#### Example usage +Using our prior working example, we can set up the path finding +commands in the following code block. For reference, I have made +a new subdirectory in the `output` folder called `vcf_files_imputed` +and will place imputed hVCF files there: + +```shell +./phg find-paths \ + --path-keyfile data/key_files/path_finding_data.txt \ + --hvcf-dir output/vcf_files \ + --reference-genome output/updated_assemblies/Ref.fa \ + --path-type haploid \ + --output-dir output/vcf_files_imputed +``` + +This command has the following required parameters: + +* `--path-keyfile` **or** `--read-file` **but not both**: + + `--path-keyfile` - name and path of a tab-delimited keyfile that + contains a list of the read-mapping files or read files (FASTQ). + + In the working example, I have made a keyfile and placed it in + the `key_files` folder under the `data` directory. The keyfile + looks something like this: + ``` + sampleName filename + LineA_B output/read_mappings/LineA_LineB_1_readMapping.txt + ``` + + If you wish to skip the prior manual `map-reads` step, the data + in the keyfile can also be paths to FASTQ data (_see the + "[Read Mapping](#read-mapping)" section for further details_) + + > ℹ️ **Note** + The keyfile must have two columns labeled `sampleName` and + `filename` if you are specifying paths to read-mapping files. + If you are using paths to FASTQ data, you may add another + column to your keyfile, labelled `filename2`, only if your + FASTQ data is paired-end. + + > ℹ️ **Note** + The samples in the `sampleName` must be unique and must match + prior keyfile data from the "[Read Mapping](#read-mapping)" + steps. + + > ℹ️ **Note** + If an output hVCF for a sample name already exists in the + output directory, the sample will be skipped. + + `--read-file` - FASTQ samples or read-mapping files as a + comma-separated list. + + If bypassing the manual `map-reads` step, paths to FASTQ + files can be used. Either 1 (for single-end) or 2 (for + paired-end) comma-separated files can be input this way. + + If specifying read-mapping files, comma-separated paths to + files must have the `_readMapping.txt` suffix to work. +* `--hvcf-dir` - The directory containing the hvcf used to build the + haplotype graph used for imputation. +* `--reference-genome` - The name and path to the reference genome + fasta or fastq file. +* `--output-dir` - The directory where the output hVCFs will be + written. One file will be written for each sample, and the output + file name will be `.h.vcf`. +* `--path-type`: The type of path to be imputed. The value must be + either `haploid` or `diploid` + + +#### Optional parameters +Additional optional parameters may also be set to optimize your +imputation pipeline. Please see the proceeding section +("[Likely Ancestors](#likely-ancestors)") for more information on +how to leverage a set of the following parameters. + +| Parameter name | Description | Default value | +|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------| +| `--out-parents-dir` | The directory where the imputed parents (ancestors) will be written for each sample. File names will be _imputed_parents.txt. If no directory name is supplied, the files will not be written. | `""` | +| `--prob-correct` | The probability that a mapped read was mapped correctly. | `0.99` | +| `--prob-same-gamete` | The probability of transitioning to the same gamete (sample) in the next reference range. This should be equal to 1 - (probability of a recombination). Probability of a recombination can be estimated as the total number of expected recombinations in a sample divided by the number of reference ranges | `0.99` | +| `--min-gametes` | The minimum number of gametes with a haplotype in a reference range. Reference ranges with fewer gametes will not be imputed. | `1` | +| `--min-reads` | The minimum number of reads per reference range. Reference ranges with fewer reads will not be imputed. If `--min-reads = 0`, all reference ranges will be imputed | `0` | +| `--inbreeding-coefficient` | The estimated coefficient of inbreeding for the samples being evaluated. Only used for diploid path type. **The value must be between `0.0` and `1.0`** | `0.0` | +| `--max-reads-per-kb` | ReferenceRanges with more than max-reads-per-kb will not be imputed. | `1000` | +| `--use-likely-ancestors` | The value must be `true` or `false`. This indicates whether the most likely ancestors of each sample will be used for path finding. | `false` | +| `--max-ancestors` | If `--use-likely-ancestors = true`, use at most max-ancestors. | `Integer.MAX_VALUE` | +| `--min-coverage` | If `--use-likely-ancestors = true`, use the fewest number of ancestors that together have this proportion of mappable reads. The values must be between `0.5` and `1.0` | `1.0` | +| `--likely-ancestor-file` | If `--use-likely-ancestors = true`, a record of the ancestors used for each sample will be written to this file, if a file name is provided. | `""` | +| `--threads` | The number of threads used to find paths. | `3` | + +#### Likely ancestors +The term "ancestors" refers to the taxa or samples (i.e., assemblies) +used to construct the haplotype graph used for imputation. The term +"ancestors" is used because the model considers the haplotypes in the +graph to represent the potential ancestral haplotypes from which the +samples to be imputed were derived. + +In some scenarios, **limiting the number of ancestors** used to +impute a sample will be useful. Some examples include: + +* if some samples were derived from an F1 cross and the individuals + used to build the graph include those parents, using only those + parents for imputation can improve accuracy. +* Also, because imputing diploid + paths is more computationally intensive than haploid paths, limiting + the number of ancestors used per sample may be necessary to control + the amount of time required to impute each sample. + +To restrict the number of ancestors used, set the +`--use-likely-ancestors` parameter to `true`, and provide a value for +either `--max-ancestors` or `--min-coverage`. For the case where the +samples have been derived from a limited number of ancestors setting +`--min-coverage` to `0.95` or `--max-ancestors` to the expected +number of ancestors is a useful strategy. In either case, providing a +name for the output file saves a record of the ancestors used for +each sample and should be checked to make sure the samples behaved as +expected. + +When using this method, ancestors are chosen by first counting the +number of reads for a sample that map to each ancestor in the +haplotype graph. The ancestor with the most mapped reads is chosen. +Next the reads not mapping to first chosen ancestor are used to count +reads mapping to the remaining ancestors and the ancestor with the +most reads is selected. This is repeated until either the proportion +of reads mapping to the selected ancestors >= `--min-coverage` or the +number ancestors selected equals `--max-ancestors`. If a name is +provided for the `--likely-ancestor-file`, the chosen ancestors are +written to that file in the order they were chosen along with the +cumulative coverage, which is the proportion of reads mapping to each +ancestor and the previously chosen ones. + + +#### General output and next steps +Now that we have newly imputed hVCF data, our example working +directory looks like the following: + +``` +phg_v2_example/ +├── data +│ └── short_reads +│ │ ├── LineA_LineB_1.fq +│ │ └── LineA_LineB_2.fq +│ ├── anchors.gff +│ ├── Ref-v5.fa +│ ├── LineA-final-01.fa +│ └── LineB-final-04.fa +├── output +│ ├── alignment_files/ +│ ├── ref_ranges.bed +│ ├── updated_assemblies +│ │ ├── Ref.fa +│ │ ├── LineA.fa +│ │ └── LineB.fa +│ ├── vcf_files +│ │ ├── LineA.h.vcf +│ │ └── LineB.h.vcf +│ │ vcf_files_imputed * +│ │ └── LineA_B.h.vcf * +│ └── read_mappings +│ └── LineA_LineB_1_readMapping.txt +└── vcf_dbs + ├── assemblies.agc + ├── gvcf_dataset/ + ├── hvcf_dataset/ + ├── hvcf_files/ + ├── reference/ + └── temp/ +``` + +Similar to the "[Build and Load](build_and_load.md#load-vcf-data-into-dbs)" +steps, we can now load this data into our TileDB instance using the +`load-vcf` command: + +```shell +./phg load-vcf \ + --vcf-dir output/vcf_files_imputed \ + --db-path vcf_dbs \ + --threads 10 +``` + +This command takes three parameters: + +* `--vcf-dir` - Directory containing hVCF data. Since I have imputed + data in a specific subdirectory, I will use the path + `output/vcf_files_imputed` +* `--db-path` - Path to the directory containing the TileDB instances. +* `--threads` - Number of threads for use by the TileDB loading + procedure. + +### Create g.vcf files (OPTIONAL) +Our imputed hVCF files provide data on a haplotype level. If desired we can take +the hVCF files and create gVCF files. This provides SNP level data and is done using +the `hvcf2gvcf` command: + +```shell +./phg hvcf2gvcf \ + --hvcf-dir output/vcf_files_imputed \ + --db-path vcf_dbs \ + --reference-file output/updated_assemblies/Ref.fa \ + --output-dir output/gvcf_files +``` + +This command takes 4 parameters: + +* `--hvcf-dir` - Directory containing hVCF data. Since I have imputed + data in a specific subdirectory, I will use the path + `output/vcf_files_imputed` +* `--db-path` - Path to the directory containing the TileDB instances. +* `--reference-file` - The reference genome fasta file. +* `--output-dir` - The directory to place the gVCF files. From 0a9b0776e6f96548bee9845118b2d4855fb0ad12 Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 23 Jan 2025 15:25:20 -0500 Subject: [PATCH 5/7] Fixing doc indents. --- docs/imputation_ropebwt.md | 40 +++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/imputation_ropebwt.md b/docs/imputation_ropebwt.md index 263f70aa..21e90f34 100644 --- a/docs/imputation_ropebwt.md +++ b/docs/imputation_ropebwt.md @@ -1,12 +1,12 @@ # Imputation using RopeBWT3 !!! note -This document is a work in progress and will be updated as the -imputation pipeline is further developed. Please check back for -updates. As of now the pipeline is in a BETA state but is functional. + This document is a work in progress and will be updated as the + imputation pipeline is further developed. Please check back for + updates. As of now the pipeline is in a BETA state but is functional. In this document, we will discuss the steps needed to perform -imputation using the ropebwt3 tool within the PHG. The steps: +imputation using the [ropebwt3](https://github.com/lh3/ropebwt3) tool within the PHG. The steps: 1. Export hVCF data 2. Index the pangenome from exported hVCF data @@ -14,9 +14,9 @@ imputation using the ropebwt3 tool within the PHG. The steps: 4. Find paths !!! note -The steps detailed in this document build on the materials from -the "[Building and Loading](build_and_load.md)" documentation. -Please review this if you have not worked with the PHG before! + The steps detailed in this document build on the materials from + the "[Building and Loading](build_and_load.md)" documentation. + Please review this if you have not worked with the PHG before! ## Quick start @@ -79,8 +79,8 @@ Please review this if you have not worked with the PHG before! ### hVCF export !!! note -This step is currently needed, but will be removed in the future as -we can do direct connections to the hVCF TileDB instance. + This step is currently needed, but will be removed in the future as + we can do direct connections to the hVCF TileDB instance. Where we last left off in the "[Build and Load](build_and_load.md)" steps, we had an example directory that looks similar to the @@ -111,8 +111,8 @@ phg_v2_example/ ``` !!! note -The following steps in the rest of this document will assume we are -in the top level of our `phg_v2_example` working directory. + The following steps in the rest of this document will assume we are + in the top level of our `phg_v2_example` working directory. For this example imputation pipeline, we will be using haplotypes @@ -434,10 +434,10 @@ phg_v2_example/ ``` !!! note -The naming scheme for the read-mapping files follows the first -sample in the keyfile or comma-delimited list and: -1. strips the path and FASTQ extension text -2. adds the `readMapping.txt` suffix to the stripped sample name + The naming scheme for the read-mapping files follows the first + sample in the keyfile or comma-delimited list and: + 1. strips the path and FASTQ extension text + 2. adds the `readMapping.txt` suffix to the stripped sample name Additionally, $n$ amount of files will be generated for $n$ amount of short read samples found in the keyfile or comma-delimited list. @@ -461,9 +461,9 @@ observed read-mappings. Two path types are implemented: jointly explain the read-mappings. !!! note -`find-paths` can run the old kmer read mapping pipeline. -Eventually this will be transitioned over to using the ropebwt3 index -and alignment process. For now run the index and mapping step manually. + `find-paths` can run the old kmer read mapping pipeline. + Eventually this will be transitioned over to using the ropebwt3 index + and alignment process. For now run the index and mapping step manually. #### Input files `find-paths` can take either read-mapping files generated by the command @@ -475,8 +475,8 @@ more information, please refer to the prior "[Read Mapping](#read-mapping)" section for additional input details. !!! note -If you using read-mapping files, the files **must** end with the -`_readMapping.txt` suffix or errors will occur. + If you using read-mapping files, the files **must** end with the + `_readMapping.txt` suffix or errors will occur. #### Example usage From 6b0d03b1951560ff6f6927929fbb7eb74fb312be Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 23 Jan 2025 15:35:03 -0500 Subject: [PATCH 6/7] Using new graph constructor --- .../net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt index 07dce236..5c9a2ca8 100644 --- a/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt +++ b/src/main/kotlin/net/maizegenetics/phgv2/pathing/ropebwt/MapReads.kt @@ -72,12 +72,8 @@ class MapReads : CliktCommand(help="BETA: Map reads to a pangenome using ropeBWT logCommand(this) myLogger.info("Building the Graph") - //loop through all files in hvcfDir and create a list of hvcf files - val hvcfFiles = File(hvcfDir).walkTopDown().filter { it.isFile } - .filter { it.name.endsWith("h.vcf") || it.name.endsWith("h.vcf.gz") }.map { "${hvcfDir}/${it.name}" } - .toList() - val graph = HaplotypeGraph(hvcfFiles) + val graph = HaplotypeGraph(hvcfDir) val hapIdToRefRangeMap = graph.hapIdToRefRangeMap() From 3cb3fa8005994742495ae6201ac59f88436564be Mon Sep 17 00:00:00 2001 From: Zack Miller Date: Thu, 23 Jan 2025 15:53:52 -0500 Subject: [PATCH 7/7] Updating the docs builder --- mkdocs.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index d25c7d34..acd2a8e9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,7 +82,9 @@ nav: - Getting Started: - Installation: installation.md - Building and loading: build_and_load.md - - Imputation: imputation.md + - Imputation: + - k-mer Imputation (Original): imputation.md + - RopeBWT3 Imputation (BETA): ropebwt3_imputation.md - Resequencing: resequencing.md - Exporting data: export_data.md - Reference: