open_or_gzopen -> compressed_open. Add support for zstd. #937

Open · wants to merge 1 commit into base: master
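This PR swaps the gzip-only `util.file.open_or_gzopen` helper for a `util.file.compressed_open` that dispatches on file extension, and extends the supported formats to zstandard (`.zst`). The helper's implementation is not part of this diff; the sketch below only illustrates the dispatch pattern, assuming the third-party `zstandard` package (the name, signature, and extension list are assumptions, not the PR's actual code):

```python
# Hypothetical sketch of an extension-dispatching opener. The real
# util.file.compressed_open lives in util/file.py and is not shown here.
import bz2
import gzip
import lzma

def compressed_open(path, mode='rb', **kwargs):
    """Open a file, transparently (de)compressing based on its extension."""
    if path.endswith('.gz'):
        return gzip.open(path, mode, **kwargs)
    if path.endswith('.bz2'):
        return bz2.open(path, mode, **kwargs)
    if path.endswith(('.xz', '.lzma')):
        return lzma.open(path, mode, **kwargs)
    if path.endswith('.zst'):
        import zstandard  # third-party: pip install zstandard
        return zstandard.open(path, mode, **kwargs)
    return open(path, mode, **kwargs)
```

Note that the call sites below also switch to explicit text modes (`'rt'`/`'wt'`), which such a helper can pass straight through to the underlying openers. Many of the paired, identical-looking deletion and addition lines in the illumina.py hunks further down are whitespace-only changes (trailing spaces stripped).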
8 changes: 4 additions & 4 deletions assembly.py
@@ -1080,8 +1080,8 @@ def main_filter_short_seqs(args):
'''Check sequences in inFile, retaining only those that are at least minLength'''
# orig by rsealfon, edited by dpark
# TO DO: make this more generalized to accept multiple minLengths (for multiple chromosomes/segments)
- with util.file.open_or_gzopen(args.inFile) as inf:
- with util.file.open_or_gzopen(args.outFile, 'w') as outf:
+ with util.file.compressed_open(args.inFile, 'rt') as inf:
+ with util.file.compressed_open(args.outFile, 'wt') as outf:
Bio.SeqIO.write(
[
s for s in Bio.SeqIO.parse(inf, args.format)
@@ -1619,8 +1619,8 @@ def deambig_fasta(inFasta, outFasta):
random unambiguous base from among the possibilities described by the ambiguity
code. Write output to fasta file.
'''
- with util.file.open_or_gzopen(outFasta, 'wt') as outf:
- with util.file.open_or_gzopen(inFasta, 'rt') as inf:
+ with util.file.compressed_open(outFasta, 'wt') as outf:
+ with util.file.compressed_open(inFasta, 'rt') as inf:
for record in Bio.SeqIO.parse(inf, 'fasta'):
for line in util.file.fastaMaker([(record.id, ''.join(map(deambig_base, str(record.seq))))]):
outf.write(line)
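The `deambig_fasta` change above keeps the same logic and only swaps the opener. For readers unfamiliar with the deambiguation step it wraps: each IUPAC ambiguity code stands for a set of bases, and `deambig_base` picks one at random. A minimal sketch of that idea, assuming Biopython's IUPAC tables (the repo's actual `deambig_base` may differ):

```python
# Sketch: map an IUPAC ambiguity code to a random compatible base.
import random
from Bio.Data import IUPACData

def deambig_base(base):
    """Return a random unambiguous base consistent with an IUPAC code."""
    return random.choice(IUPACData.ambiguous_dna_values[base.upper()])

assert deambig_base('A') == 'A'     # unambiguous codes map to themselves
assert deambig_base('N') in 'ACGT'  # N may be any base
```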
8 changes: 4 additions & 4 deletions file_utils.py
@@ -27,16 +27,16 @@ def merge_tarballs(out_tarball, in_tarballs, threads=None, extract_to_disk_path=
return 0
def parser_merge_tarballs(parser=argparse.ArgumentParser()):
parser.add_argument(
- 'out_tarball',
- help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|-);
+ 'out_tarball',
+ help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst|-);
compression is inferred by the file extension.
Note: if "-" is used, output will be written to stdout and
--pipeOutHint must be provided to indicate compression type
when compression type is not gzip (gzip is used by default).
''')
parser.add_argument(
'in_tarballs', nargs='+',
- help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2)')
+ help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst)')
)
parser.add_argument('--extractToDiskPath',
dest="extract_to_disk_path",
@@ -61,4 +61,4 @@ def full_parser():


if __name__ == '__main__':
- util.cmd.main_argparse(__commands__, __doc__)
+ util.cmd.main_argparse(__commands__, __doc__)
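The help text above states that compression is inferred from the output filename. The stdlib `tarfile` module handles gzip and bzip2 natively but (until very recent Python versions) has no built-in lz4 or zstd support, so those formats typically require wrapping an external decompression stream. A sketch of how such inference might look, assuming the third-party `zstandard` and `lz4` packages (illustrative only; the repo's actual `merge_tarballs` logic is not shown in this diff):

```python
# Illustrative: infer tar decompression from the filename extension.
import tarfile

def open_tarball(path):
    """Open a tarball for reading, inferring compression from the extension."""
    if path.endswith(('.tar.gz', '.tgz')):
        return tarfile.open(path, 'r:gz')
    if path.endswith('.tar.bz2'):
        return tarfile.open(path, 'r:bz2')
    if path.endswith('.tar.zst'):
        import zstandard  # tarfile has no native zstd support here
        return tarfile.open(fileobj=zstandard.open(path, 'rb'), mode='r|')
    if path.endswith('.tar.lz4'):
        import lz4.frame  # likewise, lz4 comes from a third-party package
        return tarfile.open(fileobj=lz4.frame.open(path, 'rb'), mode='r|')
    return tarfile.open(path, 'r')
```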
82 changes: 41 additions & 41 deletions illumina.py
@@ -44,7 +44,7 @@ def parser_illumina_demux(parser=argparse.ArgumentParser()):
help='Output ExtractIlluminaBarcodes metrics file. Default is to dump to a temp file.',
default=None)
parser.add_argument('--commonBarcodes',
- help='''Write a TSV report of all barcode counts, in descending order.
+ help='''Write a TSV report of all barcode counts, in descending order.
Only applicable for read structures containing "B"''',
default=None)
parser.add_argument('--sampleSheet',
@@ -89,7 +89,7 @@ def parser_illumina_demux(parser=argparse.ArgumentParser()):


def main_illumina_demux(args):
- ''' Read Illumina runs & produce BAM files, demultiplexing to one bam per sample, or
+ ''' Read Illumina runs & produce BAM files, demultiplexing to one bam per sample, or
for simplex runs, a single bam will be produced bearing the flowcell ID.
Wraps together Picard's ExtractBarcodes (for multiplexed samples) and IlluminaBasecallsToSam
while handling the various required input formats. Also can
@@ -126,13 +126,13 @@ def main_illumina_demux(args):
link_locs=False
# For HiSeq-4000/X runs, If Picard's CheckIlluminaDirectory is
# called with LINK_LOCS=true, symlinks with absolute paths
- # may be created, pointing from tile-specific *.locs to the
+ # may be created, pointing from tile-specific *.locs to the
# single s.locs file in the Intensities directory.
# These links may break if the run directory is moved.
# We should begin by removing broken links, if present,
# and call CheckIlluminaDirectory ourselves if a 's.locs'
# file is present, but only if the directory check fails
- # since link_locs=true tries to create symlinks even if they
+ # since link_locs=true tries to create symlinks even if they
# (or the files) already exist
try:
tools.picard.CheckIlluminaDirectoryTool().execute(
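The comment block above describes a cleanup step: remove broken tile-specific `*.locs` symlinks left behind by a previous LINK_LOCS=true run before invoking Picard's CheckIlluminaDirectory again. A minimal sketch of that cleanup, using only the standard library (the repo's actual handling may differ):

```python
# Sketch: drop symlinks whose targets no longer exist.
import os

def remove_broken_links(directory):
    """Delete dangling symlinks anywhere under the given directory."""
    for root, _dirs, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            # islink() is true for symlinks; exists() follows the link,
            # so a dangling symlink is islink() and not exists().
            if os.path.islink(path) and not os.path.exists(path):
                os.unlink(path)
```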
@@ -163,8 +163,8 @@ def main_illumina_demux(args):
link_locs=link_locs
)

- multiplexed_samples = True if 'B' in read_structure else False
+ multiplexed_samples = True if 'B' in read_structure else False

if multiplexed_samples:
assert samples is not None, "This looks like a multiplexed run since 'B' is in the read_structure: a SampleSheet must be given."
else:
@@ -308,19 +308,19 @@ def main_lane_metrics(args):
def parser_common_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('inDir', help='Illumina BCL directory (or tar.gz of BCL directory). This is the top-level run directory.')
parser.add_argument('lane', help='Lane number.', type=int)
- parser.add_argument('outSummary', help='''Path to the summary file (.tsv format). It includes several columns:
- (barcode1, likely_index_name1, barcode2, likely_index_name2, count),
- where likely index names are either the exact match index name for the barcode
+ parser.add_argument('outSummary', help='''Path to the summary file (.tsv format). It includes several columns:
+ (barcode1, likely_index_name1, barcode2, likely_index_name2, count),
+ where likely index names are either the exact match index name for the barcode
sequence, or those Hamming distance of 1 away.''')

parser.add_argument('--truncateToLength',
help='If specified, only this number of barcodes will be returned. Useful if you only want the top N barcodes.',
type=int,
default=None)
- parser.add_argument('--omitHeader',
+ parser.add_argument('--omitHeader',
help='If specified, a header will not be added to the outSummary tsv file.',
action='store_true')
- parser.add_argument('--includeNoise',
+ parser.add_argument('--includeNoise',
help='If specified, barcodes with periods (".") will be included.',
action='store_true')
parser.add_argument('--outMetrics',
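The `outSummary` help above says barcodes are annotated with index names that match exactly or are within Hamming distance 1. The repo's `illumina_reference.guess_index` implements this against its bundled index reference; below is a toy sketch of the matching rule (the `known_indices` mapping here is made up for illustration):

```python
# Sketch of Hamming-distance index matching, as described in the help text.
def hamming(a, b):
    """Count mismatched positions between two equal-length strings."""
    return sum(x != y for x, y in zip(a, b))

def guess_index(barcode, known_indices, distance=1):
    """Yield names of known index sequences within `distance` of `barcode`."""
    for name, seq in known_indices.items():
        if len(seq) == len(barcode) and hamming(seq, barcode) <= distance:
            yield name

known = {'N701': 'TAAGGCGA', 'N702': 'CGTACTAG'}  # hypothetical entries
print(list(guess_index('TAAGGCGT', known)))  # ['N701']: one mismatch away
```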
@@ -350,8 +350,8 @@ def parser_common_barcodes(parser=argparse.ArgumentParser()):
return parser

def main_common_barcodes(args):
- '''
- Extract Illumina barcodes for a run and write a TSV report
+ '''
+ Extract Illumina barcodes for a run and write a TSV report
of the barcode counts in descending order
'''

@@ -422,9 +422,9 @@ def sum_reducer(accumulator, element):
count_to_write = truncateToLength if truncateToLength else len(barcode_counts)
barcode_pairs_sorted_by_count = sorted(barcode_counts, key=barcode_counts.get, reverse=True)[:count_to_write]

- mapped_counts = ( (k[:8], ",".join([x for x in illumina_reference.guess_index(k[:8], distance=1)] or ["Unknown"]),
- k[8:], ",".join([x for x in illumina_reference.guess_index(k[8:], distance=1)] or ["Unknown"]),
- barcode_counts[k])
+ mapped_counts = ( (k[:8], ",".join([x for x in illumina_reference.guess_index(k[:8], distance=1)] or ["Unknown"]),
+ k[8:], ",".join([x for x in illumina_reference.guess_index(k[8:], distance=1)] or ["Unknown"]),
+ barcode_counts[k])
for k in barcode_pairs_sorted_by_count)

# write the barcodes and their corresponding counts
@@ -445,13 +445,13 @@ def sum_reducer(accumulator, element):
def parser_guess_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('in_barcodes', help='The barcode counts file produced by common_barcodes.')
parser.add_argument('in_picard_metrics', help='The demultiplexing read metrics produced by Picard.')
- parser.add_argument('out_summary_tsv', help='''Path to the summary file (.tsv format). It includes several columns:
- (sample_name, expected_barcode_1, expected_barcode_2,
- expected_barcode_1_name, expected_barcode_2_name,
- expected_barcodes_read_count, guessed_barcode_1,
- guessed_barcode_2, guessed_barcode_1_name,
- guessed_barcode_2_name, guessed_barcodes_read_count,
- match_type),
+ parser.add_argument('out_summary_tsv', help='''Path to the summary file (.tsv format). It includes several columns:
+ (sample_name, expected_barcode_1, expected_barcode_2,
+ expected_barcode_1_name, expected_barcode_2_name,
+ expected_barcodes_read_count, guessed_barcode_1,
+ guessed_barcode_2, guessed_barcode_1_name,
+ guessed_barcode_2_name, guessed_barcodes_read_count,
+ match_type),
where the expected values are those used by Picard during demultiplexing
and the guessed values are based on the barcodes seen among the data.''')

@@ -465,11 +465,11 @@ def parser_guess_barcodes(parser=argparse.ArgumentParser()):
help='If specified, only guess barcodes for these sample names.',
type=str,
default=None)
- parser.add_argument('--outlier_threshold',
+ parser.add_argument('--outlier_threshold',
help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
type=float,
default=0.675)
- parser.add_argument('--expected_assigned_fraction',
+ parser.add_argument('--expected_assigned_fraction',
help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
type=float,
default=0.7)
@@ -483,27 +483,27 @@
type=int,
help='''The number of rows to use from the in_barcodes.''')


util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, main_guess_barcodes, split_args=True)
return parser

- def main_guess_barcodes(in_barcodes,
- in_picard_metrics,
- out_summary_tsv,
- sample_names,
- outlier_threshold,
- expected_assigned_fraction,
- number_of_negative_controls,
- readcount_threshold,
+ def main_guess_barcodes(in_barcodes,
+ in_picard_metrics,
+ out_summary_tsv,
+ sample_names,
+ outlier_threshold,
+ expected_assigned_fraction,
+ number_of_negative_controls,
+ readcount_threshold,
rows_limit):
"""
Guess the barcode value for a sample name,
based on the following:
- a list is made of novel barcode pairs seen in the data, but not in the picard metrics
- - for the sample in question, get the most abundant novel barcode pair where one of the
+ - for the sample in question, get the most abundant novel barcode pair where one of the
barcodes seen in the data matches one of the barcodes in the picard metrics (partial match)
- - if there are no partial matches, get the most abundant novel barcode pair
+ - if there are no partial matches, get the most abundant novel barcode pair

Limitations:
- If multiple samples share a barcode with multiple novel barcodes, disentangling them
@@ -518,10 +518,10 @@ def main_guess_barcodes(in_barcodes,
"""

bh = util.illumina_indices.IlluminaBarcodeHelper(in_barcodes, in_picard_metrics, rows_limit)
- guessed_barcodes = bh.find_uncertain_barcodes(sample_names=sample_names,
- outlier_threshold=outlier_threshold,
- expected_assigned_fraction=expected_assigned_fraction,
- number_of_negative_controls=number_of_negative_controls,
+ guessed_barcodes = bh.find_uncertain_barcodes(sample_names=sample_names,
+ outlier_threshold=outlier_threshold,
+ expected_assigned_fraction=expected_assigned_fraction,
+ number_of_negative_controls=number_of_negative_controls,
readcount_threshold=readcount_threshold)
bh.write_guessed_barcodes(out_summary_tsv, guessed_barcodes)
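A toy sketch of the partial-match rule the docstring above describes, for a single sample (the real logic, including outlier detection and read-count thresholds, lives in `util.illumina_indices.IlluminaBarcodeHelper`):

```python
# Sketch: guess a sample's true barcode pair from novel pairs seen in the data.
def guess_barcode_pair(expected_pair, novel_pair_counts):
    """Return the best-guess (bc1, bc2) for a sample.

    novel_pair_counts maps (bc1, bc2) pairs seen in the data, but absent
    from the Picard demux metrics, to their read counts.
    """
    ranked = sorted(novel_pair_counts, key=novel_pair_counts.get, reverse=True)
    # Prefer the most abundant novel pair sharing a barcode with the expected pair.
    for pair in ranked:
        if pair[0] == expected_pair[0] or pair[1] == expected_pair[1]:
            return pair  # partial match
    # Otherwise fall back to the most abundant novel pair overall.
    return ranked[0] if ranked else None

print(guess_barcode_pair(('AAAAAAAA', 'CCCCCCCC'),
                         {('AAAAAAAA', 'GGGGGGGG'): 500,
                          ('TTTTTTTT', 'GGGGGGGG'): 900}))
# -> ('AAAAAAAA', 'GGGGGGGG'): it shares barcode 1 with the expected pair.
```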

@@ -718,7 +718,7 @@ def __init__(self, infile, use_sample_name=True, only_lane=None, allow_non_uniqu
def _detect_and_load_sheet(self, infile):
if infile.endswith(('.csv','.csv.gz')):
# one of a few possible CSV formats (watch out for line endings from other OSes)
- with util.file.open_or_gzopen(infile, 'rU') as inf:
+ with util.file.compressed_open(infile, 'rU') as inf:
header = None
miseq_skip = False
row_num = 0
2 changes: 1 addition & 1 deletion interhost.py
@@ -561,7 +561,7 @@ def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sample
outputFilenames = []

# open all files
- inputFilesList = [util.file.open_or_gzopen(x, 'r') for x in inputFilenamesList]
+ inputFilesList = [util.file.compressed_open(x, 'rt') for x in inputFilenamesList]
# get BioPython iterators for each of the FASTA files specified in the input
fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]

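`transposeChromosomeFiles` reads one multi-segment FASTA per sample and regroups records per chromosome, which amounts to walking all inputs in lockstep. A condensed sketch of that pattern (generic openers here; the function itself also handles sample-name mapping not shown in this hunk):

```python
# Sketch: take the i-th record from every input file on the i-th iteration.
from Bio import SeqIO

def transpose_fastas(filenames):
    handles = [open(fn) for fn in filenames]  # compressed_open in the repo
    try:
        parsers = [SeqIO.parse(h, 'fasta') for h in handles]
        # Each yielded tuple holds segment i from every sample.
        for records in zip(*parsers):
            yield records
    finally:
        for h in handles:
            h.close()
```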
20 changes: 10 additions & 10 deletions intrahost.py
@@ -146,7 +146,7 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads=None,
filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)

libraryFilteredIter = compute_library_bias(filteredIter, bam_to_process, inConsFasta)
- with util.file.open_or_gzopen(outTab, 'wt') as outf:
+ with util.file.compressed_open(outTab, 'wt') as outf:
for row in libraryFilteredIter:
outf.write('\t'.join(row) + '\n')

@@ -471,12 +471,12 @@ def merge_to_vcf(

samplenames_from_alignments = set()
for alignmentFile in alignments:
- with util.file.open_or_gzopen(alignmentFile, 'r') as inf:
+ with util.file.compressed_open(alignmentFile, 'rt') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
samplenames_from_alignments.add(sampleIDMatch(seq.id))

refnames = set()
- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
refnames.add(sampleIDMatch(seq.id))

@@ -512,7 +512,7 @@ def merge_to_vcf(
log.info(samp_to_isnv)

# get IDs and sequence lengths for reference sequence
- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))

# use the output filepath specified if it is a .vcf, otherwise if it is gzipped we need
@@ -566,10 +566,10 @@ def merge_to_vcf(
# copy the list of alignment files
alignmentFiles = list(alignments)

- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
for refSeq in Bio.SeqIO.parse(inf, 'fasta'):
for alignmentFile in alignmentFiles:
- with util.file.open_or_gzopen(alignmentFile, 'r') as inf2:
+ with util.file.compressed_open(alignmentFile, 'r') as inf2:
for seq in Bio.SeqIO.parse(inf2, 'fasta'):
if refSeq.id == seq.id:
ref_seq_id_to_alignment_file[seq.id] = alignmentFile
@@ -583,7 +583,7 @@ def merge_to_vcf(
"There must be an isnv file for each sample. %s samples, %s isnv files" % (len(samples), len(isnvs)))

for fileName in alignments:
- with util.file.open_or_gzopen(fileName, 'r') as inf:
+ with util.file.compressed_open(fileName, 'rt') as inf:
# get two independent iterators into the alignment file
number_of_aligned_sequences = count_iter_items(Bio.SeqIO.parse(inf, 'fasta'))
num_isnv_files = len(isnvs)
@@ -622,7 +622,7 @@ def merge_to_vcf(
samplesToUse = [x for x in cm.chrMaps.keys() if sampleIDMatch(x) in samples]

alignmentFile = ref_seq_id_to_alignment_file[ref_sequence.id]
- with util.file.open_or_gzopen(alignmentFile, 'r') as alignFileIn:
+ with util.file.compressed_open(alignmentFile, 'rt') as alignFileIn:
for seq in Bio.SeqIO.parse(alignFileIn, 'fasta'):
for sampleName in samplesToUse:
if seq.id == sampleName:
@@ -964,7 +964,7 @@ def compute_Fws(vcfrow):
def add_Fws_vcf(inVcf, outVcf):
'''Compute the Fws statistic on iSNV data. See Manske, 2012 (Nature)'''
with open(outVcf, 'wt') as outf:
- with util.file.open_or_gzopen(inVcf, 'rt') as inf:
+ with util.file.compressed_open(inVcf, 'rt') as inf:
for line in inf:
if line.startswith('##'):
outf.write(line)
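`compute_Fws` implements the Fws statistic from Manske et al. (2012, Nature), which compares within-host heterozygosity (Hw) to population-level heterozygosity (Hs); these are the `Hw` and `Hs` columns in the iSNV table below. The conventional definition is a one-liner, shown here for reference (the exact bookkeeping inside `compute_Fws` is not visible in this diff):

```python
def fws(hw, hs):
    """Fws = 1 - Hw/Hs; values near 1 indicate a near-clonal infection."""
    return 1.0 - (hw / hs)

print(fws(0.05, 0.50))  # 0.9: little within-host diversity vs. the population
```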
@@ -1123,7 +1123,7 @@ def main_iSNV_table(args):
'''Convert VCF iSNV data to tabular text'''
header = ['chr', 'pos', 'sample', 'patient', 'time', 'alleles', 'iSNV_freq', 'Hw', 'Hs', 'eff_type',
'eff_codon_dna', 'eff_aa', 'eff_aa_pos', 'eff_prot_len', 'eff_gene', 'eff_protein']
- with util.file.open_or_gzopen(args.outFile, 'wt') as outf:
+ with util.file.compressed_open(args.outFile, 'wt') as outf:
outf.write('\t'.join(header) + '\n')
for row in iSNV_table(util.file.read_tabfile_dict(args.inVcf)):
sample_parts = row['sample'].split('.')