open_or_gzopen -> compressed_open. Add support for zstd. #937

Open · wants to merge 1 commit into base: master
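This PR swaps the gzip-only `util.file.open_or_gzopen` helper for a `util.file.compressed_open` that dispatches on file extension, and extends the supported formats to zstandard (`.zst`). The helper's implementation is not part of this diff; the sketch below only illustrates the dispatch pattern, assuming the third-party `zstandard` package (the name, signature, and extension list are assumptions, not the PR's actual code):

```python
# Hypothetical sketch of an extension-dispatching opener. The real
# util.file.compressed_open lives in util/file.py and is not shown here.
import bz2
import gzip
import lzma

def compressed_open(path, mode='rb', **kwargs):
    """Open a file, transparently (de)compressing based on its extension."""
    if path.endswith('.gz'):
        return gzip.open(path, mode, **kwargs)
    if path.endswith('.bz2'):
        return bz2.open(path, mode, **kwargs)
    if path.endswith(('.xz', '.lzma')):
        return lzma.open(path, mode, **kwargs)
    if path.endswith('.zst'):
        import zstandard  # third-party: pip install zstandard
        return zstandard.open(path, mode, **kwargs)
    return open(path, mode, **kwargs)
```

Note that the call sites below also switch to explicit text modes (`'rt'`/`'wt'`), which such a helper can pass straight through to the underlying openers. Many of the paired, identical-looking deletion and addition lines in the illumina.py hunks further down are whitespace-only changes (trailing spaces stripped).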
8 changes: 4 additions & 4 deletions assembly.py
@@ -1080,8 +1080,8 @@ def main_filter_short_seqs(args):
'''Check sequences in inFile, retaining only those that are at least minLength'''
# orig by rsealfon, edited by dpark
# TO DO: make this more generalized to accept multiple minLengths (for multiple chromosomes/segments)
- with util.file.open_or_gzopen(args.inFile) as inf:
- with util.file.open_or_gzopen(args.outFile, 'w') as outf:
+ with util.file.compressed_open(args.inFile, 'rt') as inf:
+ with util.file.compressed_open(args.outFile, 'wt') as outf:
Bio.SeqIO.write(
[
s for s in Bio.SeqIO.parse(inf, args.format)
@@ -1619,8 +1619,8 @@ def deambig_fasta(inFasta, outFasta):
random unambiguous base from among the possibilities described by the ambiguity
code. Write output to fasta file.
'''
- with util.file.open_or_gzopen(outFasta, 'wt') as outf:
- with util.file.open_or_gzopen(inFasta, 'rt') as inf:
+ with util.file.compressed_open(outFasta, 'wt') as outf:
+ with util.file.compressed_open(inFasta, 'rt') as inf:
for record in Bio.SeqIO.parse(inf, 'fasta'):
for line in util.file.fastaMaker([(record.id, ''.join(map(deambig_base, str(record.seq))))]):
outf.write(line)
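The `deambig_fasta` change above keeps the same logic and only swaps the opener. For readers unfamiliar with the deambiguation step it wraps: each IUPAC ambiguity code stands for a set of bases, and `deambig_base` picks one at random. A minimal sketch of that idea, assuming Biopython's IUPAC tables (the repo's actual `deambig_base` may differ):

```python
# Sketch: map an IUPAC ambiguity code to a random compatible base.
import random
from Bio.Data import IUPACData

def deambig_base(base):
    """Return a random unambiguous base consistent with an IUPAC code."""
    return random.choice(IUPACData.ambiguous_dna_values[base.upper()])

assert deambig_base('A') == 'A'     # unambiguous codes map to themselves
assert deambig_base('N') in 'ACGT'  # N may be any base
```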
8 changes: 4 additions & 4 deletions file_utils.py
@@ -27,16 +27,16 @@ def merge_tarballs(out_tarball, in_tarballs, threads=None, extract_to_disk_path=
return 0
def parser_merge_tarballs(parser=argparse.ArgumentParser()):
parser.add_argument(
- 'out_tarball',
- help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|-);
+ 'out_tarball',
+ help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst|-);
compression is inferred by the file extension.
Note: if "-" is used, output will be written to stdout and
--pipeOutHint must be provided to indicate compression type
when compression type is not gzip (gzip is used by default).
''')
parser.add_argument(
'in_tarballs', nargs='+',
- help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2)')
+ help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst)')
)
parser.add_argument('--extractToDiskPath',
dest="extract_to_disk_path",
@@ -61,4 +61,4 @@ def full_parser():


if __name__ == '__main__':
- util.cmd.main_argparse(__commands__, __doc__)
+ util.cmd.main_argparse(__commands__, __doc__)
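The help text above states that compression is inferred from the output filename. The stdlib `tarfile` module handles gzip and bzip2 natively but (until very recent Python versions) has no built-in lz4 or zstd support, so those formats typically require wrapping an external decompression stream. A sketch of how such inference might look, assuming the third-party `zstandard` and `lz4` packages (illustrative only; the repo's actual `merge_tarballs` logic is not shown in this diff):

```python
# Illustrative: infer tar decompression from the filename extension.
import tarfile

def open_tarball(path):
    """Open a tarball for reading, inferring compression from the extension."""
    if path.endswith(('.tar.gz', '.tgz')):
        return tarfile.open(path, 'r:gz')
    if path.endswith('.tar.bz2'):
        return tarfile.open(path, 'r:bz2')
    if path.endswith('.tar.zst'):
        import zstandard  # tarfile has no native zstd support here
        return tarfile.open(fileobj=zstandard.open(path, 'rb'), mode='r|')
    if path.endswith('.tar.lz4'):
        import lz4.frame  # likewise, lz4 comes from a third-party package
        return tarfile.open(fileobj=lz4.frame.open(path, 'rb'), mode='r|')
    return tarfile.open(path, 'r')
```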
82 changes: 41 additions & 41 deletions illumina.py
@@ -44,7 +44,7 @@ def parser_illumina_demux(parser=argparse.ArgumentParser()):
help='Output ExtractIlluminaBarcodes metrics file. Default is to dump to a temp file.',
default=None)
parser.add_argument('--commonBarcodes',
- help='''Write a TSV report of all barcode counts, in descending order.
+ help='''Write a TSV report of all barcode counts, in descending order.
Only applicable for read structures containing "B"''',
default=None)
parser.add_argument('--sampleSheet',
@@ -89,7 +89,7 @@ def parser_illumina_demux(parser=argparse.ArgumentParser()):


def main_illumina_demux(args):
- ''' Read Illumina runs & produce BAM files, demultiplexing to one bam per sample, or
+ ''' Read Illumina runs & produce BAM files, demultiplexing to one bam per sample, or
for simplex runs, a single bam will be produced bearing the flowcell ID.
Wraps together Picard's ExtractBarcodes (for multiplexed samples) and IlluminaBasecallsToSam
while handling the various required input formats. Also can
@@ -126,13 +126,13 @@ def main_illumina_demux(args):
link_locs=False
# For HiSeq-4000/X runs, If Picard's CheckIlluminaDirectory is
# called with LINK_LOCS=true, symlinks with absolute paths
- # may be created, pointing from tile-specific *.locs to the
+ # may be created, pointing from tile-specific *.locs to the
# single s.locs file in the Intensities directory.
# These links may break if the run directory is moved.
# We should begin by removing broken links, if present,
# and call CheckIlluminaDirectory ourselves if a 's.locs'
# file is present, but only if the directory check fails
- # since link_locs=true tries to create symlinks even if they
+ # since link_locs=true tries to create symlinks even if they
# (or the files) already exist
try:
tools.picard.CheckIlluminaDirectoryTool().execute(
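The comment block above describes a cleanup step: remove broken tile-specific `*.locs` symlinks left behind by a previous LINK_LOCS=true run before invoking Picard's CheckIlluminaDirectory again. A minimal sketch of that cleanup, using only the standard library (the repo's actual handling may differ):

```python
# Sketch: drop symlinks whose targets no longer exist.
import os

def remove_broken_links(directory):
    """Delete dangling symlinks anywhere under the given directory."""
    for root, _dirs, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            # islink() is true for symlinks; exists() follows the link,
            # so a dangling symlink is islink() and not exists().
            if os.path.islink(path) and not os.path.exists(path):
                os.unlink(path)
```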
@@ -163,8 +163,8 @@ def main_illumina_demux(args):
link_locs=link_locs
)

- multiplexed_samples = True if 'B' in read_structure else False
+ multiplexed_samples = True if 'B' in read_structure else False

if multiplexed_samples:
assert samples is not None, "This looks like a multiplexed run since 'B' is in the read_structure: a SampleSheet must be given."
else:
@@ -308,19 +308,19 @@ def main_lane_metrics(args):
def parser_common_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('inDir', help='Illumina BCL directory (or tar.gz of BCL directory). This is the top-level run directory.')
parser.add_argument('lane', help='Lane number.', type=int)
- parser.add_argument('outSummary', help='''Path to the summary file (.tsv format). It includes several columns:
- (barcode1, likely_index_name1, barcode2, likely_index_name2, count),
- where likely index names are either the exact match index name for the barcode
+ parser.add_argument('outSummary', help='''Path to the summary file (.tsv format). It includes several columns:
+ (barcode1, likely_index_name1, barcode2, likely_index_name2, count),
+ where likely index names are either the exact match index name for the barcode
sequence, or those Hamming distance of 1 away.''')

parser.add_argument('--truncateToLength',
help='If specified, only this number of barcodes will be returned. Useful if you only want the top N barcodes.',
type=int,
default=None)
- parser.add_argument('--omitHeader',
+ parser.add_argument('--omitHeader',
help='If specified, a header will not be added to the outSummary tsv file.',
action='store_true')
- parser.add_argument('--includeNoise',
+ parser.add_argument('--includeNoise',
help='If specified, barcodes with periods (".") will be included.',
action='store_true')
parser.add_argument('--outMetrics',
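The `outSummary` help above says barcodes are annotated with index names that match exactly or are within Hamming distance 1. The repo's `illumina_reference.guess_index` implements this against its bundled index reference; below is a toy sketch of the matching rule (the `known_indices` mapping here is made up for illustration):

```python
# Sketch of Hamming-distance index matching, as described in the help text.
def hamming(a, b):
    """Count mismatched positions between two equal-length strings."""
    return sum(x != y for x, y in zip(a, b))

def guess_index(barcode, known_indices, distance=1):
    """Yield names of known index sequences within `distance` of `barcode`."""
    for name, seq in known_indices.items():
        if len(seq) == len(barcode) and hamming(seq, barcode) <= distance:
            yield name

known = {'N701': 'TAAGGCGA', 'N702': 'CGTACTAG'}  # hypothetical entries
print(list(guess_index('TAAGGCGT', known)))  # ['N701']: one mismatch away
```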
@@ -350,8 +350,8 @@ def parser_common_barcodes(parser=argparse.ArgumentParser()):
return parser

def main_common_barcodes(args):
- '''
- Extract Illumina barcodes for a run and write a TSV report
+ '''
+ Extract Illumina barcodes for a run and write a TSV report
of the barcode counts in descending order
'''

@@ -422,9 +422,9 @@ def sum_reducer(accumulator, element):
count_to_write = truncateToLength if truncateToLength else len(barcode_counts)
barcode_pairs_sorted_by_count = sorted(barcode_counts, key=barcode_counts.get, reverse=True)[:count_to_write]

- mapped_counts = ( (k[:8], ",".join([x for x in illumina_reference.guess_index(k[:8], distance=1)] or ["Unknown"]),
- k[8:], ",".join([x for x in illumina_reference.guess_index(k[8:], distance=1)] or ["Unknown"]),
- barcode_counts[k])
+ mapped_counts = ( (k[:8], ",".join([x for x in illumina_reference.guess_index(k[:8], distance=1)] or ["Unknown"]),
+ k[8:], ",".join([x for x in illumina_reference.guess_index(k[8:], distance=1)] or ["Unknown"]),
+ barcode_counts[k])
for k in barcode_pairs_sorted_by_count)

# write the barcodes and their corresponding counts
@@ -445,13 +445,13 @@ def sum_reducer(accumulator, element):
def parser_guess_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('in_barcodes', help='The barcode counts file produced by common_barcodes.')
parser.add_argument('in_picard_metrics', help='The demultiplexing read metrics produced by Picard.')
- parser.add_argument('out_summary_tsv', help='''Path to the summary file (.tsv format). It includes several columns:
- (sample_name, expected_barcode_1, expected_barcode_2,
- expected_barcode_1_name, expected_barcode_2_name,
- expected_barcodes_read_count, guessed_barcode_1,
- guessed_barcode_2, guessed_barcode_1_name,
- guessed_barcode_2_name, guessed_barcodes_read_count,
- match_type),
+ parser.add_argument('out_summary_tsv', help='''Path to the summary file (.tsv format). It includes several columns:
+ (sample_name, expected_barcode_1, expected_barcode_2,
+ expected_barcode_1_name, expected_barcode_2_name,
+ expected_barcodes_read_count, guessed_barcode_1,
+ guessed_barcode_2, guessed_barcode_1_name,
+ guessed_barcode_2_name, guessed_barcodes_read_count,
+ match_type),
where the expected values are those used by Picard during demultiplexing
and the guessed values are based on the barcodes seen among the data.''')

@@ -465,11 +465,11 @@ def parser_guess_barcodes(parser=argparse.ArgumentParser()):
help='If specified, only guess barcodes for these sample names.',
type=str,
default=None)
- parser.add_argument('--outlier_threshold',
+ parser.add_argument('--outlier_threshold',
help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
type=float,
default=0.675)
- parser.add_argument('--expected_assigned_fraction',
+ parser.add_argument('--expected_assigned_fraction',
help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
type=float,
default=0.7)
@@ -483,27 +483,27 @@
type=int,
help='''The number of rows to use from the in_barcodes.''')


util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, main_guess_barcodes, split_args=True)
return parser

- def main_guess_barcodes(in_barcodes,
- in_picard_metrics,
- out_summary_tsv,
- sample_names,
- outlier_threshold,
- expected_assigned_fraction,
- number_of_negative_controls,
- readcount_threshold,
+ def main_guess_barcodes(in_barcodes,
+ in_picard_metrics,
+ out_summary_tsv,
+ sample_names,
+ outlier_threshold,
+ expected_assigned_fraction,
+ number_of_negative_controls,
+ readcount_threshold,
rows_limit):
"""
Guess the barcode value for a sample name,
based on the following:
- a list is made of novel barcode pairs seen in the data, but not in the picard metrics
- - for the sample in question, get the most abundant novel barcode pair where one of the
+ - for the sample in question, get the most abundant novel barcode pair where one of the
barcodes seen in the data matches one of the barcodes in the picard metrics (partial match)
- - if there are no partial matches, get the most abundant novel barcode pair
+ - if there are no partial matches, get the most abundant novel barcode pair

Limitations:
- If multiple samples share a barcode with multiple novel barcodes, disentangling them
@@ -518,10 +518,10 @@ def main_guess_barcodes(in_barcodes,
"""

bh = util.illumina_indices.IlluminaBarcodeHelper(in_barcodes, in_picard_metrics, rows_limit)
- guessed_barcodes = bh.find_uncertain_barcodes(sample_names=sample_names,
- outlier_threshold=outlier_threshold,
- expected_assigned_fraction=expected_assigned_fraction,
- number_of_negative_controls=number_of_negative_controls,
+ guessed_barcodes = bh.find_uncertain_barcodes(sample_names=sample_names,
+ outlier_threshold=outlier_threshold,
+ expected_assigned_fraction=expected_assigned_fraction,
+ number_of_negative_controls=number_of_negative_controls,
readcount_threshold=readcount_threshold)
bh.write_guessed_barcodes(out_summary_tsv, guessed_barcodes)
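A toy sketch of the partial-match rule the docstring above describes, for a single sample (the real logic, including outlier detection and read-count thresholds, lives in `util.illumina_indices.IlluminaBarcodeHelper`):

```python
# Sketch: guess a sample's true barcode pair from novel pairs seen in the data.
def guess_barcode_pair(expected_pair, novel_pair_counts):
    """Return the best-guess (bc1, bc2) for a sample.

    novel_pair_counts maps (bc1, bc2) pairs seen in the data, but absent
    from the Picard demux metrics, to their read counts.
    """
    ranked = sorted(novel_pair_counts, key=novel_pair_counts.get, reverse=True)
    # Prefer the most abundant novel pair sharing a barcode with the expected pair.
    for pair in ranked:
        if pair[0] == expected_pair[0] or pair[1] == expected_pair[1]:
            return pair  # partial match
    # Otherwise fall back to the most abundant novel pair overall.
    return ranked[0] if ranked else None

print(guess_barcode_pair(('AAAAAAAA', 'CCCCCCCC'),
                         {('AAAAAAAA', 'GGGGGGGG'): 500,
                          ('TTTTTTTT', 'GGGGGGGG'): 900}))
# -> ('AAAAAAAA', 'GGGGGGGG'): it shares barcode 1 with the expected pair.
```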

@@ -718,7 +718,7 @@ def __init__(self, infile, use_sample_name=True, only_lane=None, allow_non_uniqu
def _detect_and_load_sheet(self, infile):
if infile.endswith(('.csv','.csv.gz')):
# one of a few possible CSV formats (watch out for line endings from other OSes)
- with util.file.open_or_gzopen(infile, 'rU') as inf:
+ with util.file.compressed_open(infile, 'rU') as inf:
header = None
miseq_skip = False
row_num = 0
2 changes: 1 addition & 1 deletion interhost.py
@@ -561,7 +561,7 @@ def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sample
outputFilenames = []

# open all files
- inputFilesList = [util.file.open_or_gzopen(x, 'r') for x in inputFilenamesList]
+ inputFilesList = [util.file.compressed_open(x, 'rt') for x in inputFilenamesList]
# get BioPython iterators for each of the FASTA files specified in the input
fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]

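`transposeChromosomeFiles` reads one multi-segment FASTA per sample and regroups records per chromosome, which amounts to walking all inputs in lockstep. A condensed sketch of that pattern (generic openers here; the function itself also handles sample-name mapping not shown in this hunk):

```python
# Sketch: take the i-th record from every input file on the i-th iteration.
from Bio import SeqIO

def transpose_fastas(filenames):
    handles = [open(fn) for fn in filenames]  # compressed_open in the repo
    try:
        parsers = [SeqIO.parse(h, 'fasta') for h in handles]
        # Each yielded tuple holds segment i from every sample.
        for records in zip(*parsers):
            yield records
    finally:
        for h in handles:
            h.close()
```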
20 changes: 10 additions & 10 deletions intrahost.py
@@ -146,7 +146,7 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads=None,
filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)

libraryFilteredIter = compute_library_bias(filteredIter, bam_to_process, inConsFasta)
- with util.file.open_or_gzopen(outTab, 'wt') as outf:
+ with util.file.compressed_open(outTab, 'wt') as outf:
for row in libraryFilteredIter:
outf.write('\t'.join(row) + '\n')

@@ -471,12 +471,12 @@ def merge_to_vcf(

samplenames_from_alignments = set()
for alignmentFile in alignments:
- with util.file.open_or_gzopen(alignmentFile, 'r') as inf:
+ with util.file.compressed_open(alignmentFile, 'rt') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
samplenames_from_alignments.add(sampleIDMatch(seq.id))

refnames = set()
- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
refnames.add(sampleIDMatch(seq.id))

@@ -512,7 +512,7 @@ def merge_to_vcf(
log.info(samp_to_isnv)

# get IDs and sequence lengths for reference sequence
- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta'))

# use the output filepath specified if it is a .vcf, otherwise if it is gzipped we need
@@ -566,10 +566,10 @@ def merge_to_vcf(
# copy the list of alignment files
alignmentFiles = list(alignments)

- with util.file.open_or_gzopen(refFasta, 'r') as inf:
+ with util.file.compressed_open(refFasta, 'rt') as inf:
for refSeq in Bio.SeqIO.parse(inf, 'fasta'):
for alignmentFile in alignmentFiles:
- with util.file.open_or_gzopen(alignmentFile, 'r') as inf2:
+ with util.file.compressed_open(alignmentFile, 'r') as inf2:
for seq in Bio.SeqIO.parse(inf2, 'fasta'):
if refSeq.id == seq.id:
ref_seq_id_to_alignment_file[seq.id] = alignmentFile
@@ -583,7 +583,7 @@ def merge_to_vcf(
"There must be an isnv file for each sample. %s samples, %s isnv files" % (len(samples), len(isnvs)))

for fileName in alignments:
- with util.file.open_or_gzopen(fileName, 'r') as inf:
+ with util.file.compressed_open(fileName, 'rt') as inf:
# get two independent iterators into the alignment file
number_of_aligned_sequences = count_iter_items(Bio.SeqIO.parse(inf, 'fasta'))
num_isnv_files = len(isnvs)
@@ -622,7 +622,7 @@ def merge_to_vcf(
samplesToUse = [x for x in cm.chrMaps.keys() if sampleIDMatch(x) in samples]

alignmentFile = ref_seq_id_to_alignment_file[ref_sequence.id]
- with util.file.open_or_gzopen(alignmentFile, 'r') as alignFileIn:
+ with util.file.compressed_open(alignmentFile, 'rt') as alignFileIn:
for seq in Bio.SeqIO.parse(alignFileIn, 'fasta'):
for sampleName in samplesToUse:
if seq.id == sampleName:
@@ -964,7 +964,7 @@ def compute_Fws(vcfrow):
def add_Fws_vcf(inVcf, outVcf):
'''Compute the Fws statistic on iSNV data. See Manske, 2012 (Nature)'''
with open(outVcf, 'wt') as outf:
- with util.file.open_or_gzopen(inVcf, 'rt') as inf:
+ with util.file.compressed_open(inVcf, 'rt') as inf:
for line in inf:
if line.startswith('##'):
outf.write(line)
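`compute_Fws` implements the Fws statistic from Manske et al. (2012, Nature), which compares within-host heterozygosity (Hw) to population-level heterozygosity (Hs); these are the `Hw` and `Hs` columns in the iSNV table below. The conventional definition is a one-liner, shown here for reference (the exact bookkeeping inside `compute_Fws` is not visible in this diff):

```python
def fws(hw, hs):
    """Fws = 1 - Hw/Hs; values near 1 indicate a near-clonal infection."""
    return 1.0 - (hw / hs)

print(fws(0.05, 0.50))  # 0.9: little within-host diversity vs. the population
```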
@@ -1123,7 +1123,7 @@ def main_iSNV_table(args):
'''Convert VCF iSNV data to tabular text'''
header = ['chr', 'pos', 'sample', 'patient', 'time', 'alleles', 'iSNV_freq', 'Hw', 'Hs', 'eff_type',
'eff_codon_dna', 'eff_aa', 'eff_aa_pos', 'eff_prot_len', 'eff_gene', 'eff_protein']
- with util.file.open_or_gzopen(args.outFile, 'wt') as outf:
+ with util.file.compressed_open(args.outFile, 'wt') as outf:
outf.write('\t'.join(header) + '\n')
for row in iSNV_table(util.file.read_tabfile_dict(args.inVcf)):
sample_parts = row['sample'].split('.')