From e258750f0f6c4e025aec480085cc8b9a96083706 Mon Sep 17 00:00:00 2001 From: js2264 Date: Fri, 16 Feb 2024 14:04:28 +0100 Subject: [PATCH] fix: do not write stats into new file with stats CLI --- hicstuff/commands.py | 9 +-- hicstuff/stats.py | 161 ++++++++++++++++++++++++++----------------- 2 files changed, 98 insertions(+), 72 deletions(-) diff --git a/hicstuff/commands.py b/hicstuff/commands.py index bacfc96..32cee7c 100644 --- a/hicstuff/commands.py +++ b/hicstuff/commands.py @@ -1670,13 +1670,8 @@ class Stats(AbstractCommand): def execute(self): log_file = self.args[""] - hcs.get_pipeline_stats(log_file) - prefix = re.sub(".hicstuff.*", "", basename(log_file)) - out_dir = dirname(log_file) - stats_file_path = join(out_dir, prefix + ".stats.txt") - with open(stats_file_path, 'r') as file: - lines = [line for line in file] - print(''.join(lines)) + stats = hcs.get_pipeline_stats(log_file) + hcs.print_pipeline_stats(stats) def parse_bin_str(bin_str): """Bin string parsing diff --git a/hicstuff/stats.py b/hicstuff/stats.py index 1cad2ac..a846b8e 100644 --- a/hicstuff/stats.py +++ b/hicstuff/stats.py @@ -2,9 +2,9 @@ # -*- coding: utf-8 -*- import re -from os.path import join -from os.path import dirname +import os from os.path import basename +from os.path import splitext import json def get_pipeline_stats(log_file): @@ -17,8 +17,8 @@ def get_pipeline_stats(log_file): Returns ------- - txt file: - A single text file containing stats about hicstuff pipeline execution: + list: + A single list containing stats about hicstuff pipeline execution: - Number of sequenced pairs from fastqs - Number (% of total reads) of unmapped reads - Number (% of total reads) of mapped reads @@ -34,8 +34,6 @@ def get_pipeline_stats(log_file): - Trans ratio """ - prefix = re.sub(".hicstuff.*", "", basename(log_file)) - out_dir = dirname(log_file) with open(log_file) as file: log_lines = [line.rstrip() for line in file] @@ -51,15 +49,12 @@ def get_pipeline_stats(log_file): tot_mapped = re.sub("/.*", "", tot_mapped) tot_mapped = int(float(tot_mapped)) tot_unmapped = fastq_pairs*2 - tot_mapped - pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2) - pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2) # 3. Number (% of total) of pairs tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0] tot_pairs = re.sub(".*INFO :: ", "", tot_pairs) tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs) tot_pairs = int(float(tot_pairs)) - pct_pairs = round(tot_pairs/fastq_pairs*100, 2) # 4. Number (% of total) of filtered pairs filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)] @@ -81,74 +76,110 @@ def get_pipeline_stats(log_file): uncuts_pairs=0 abnormal_pairs=0 filtered_pairs=0 - pct_filtered = round(filtered_pairs/tot_pairs*100, 2) - pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2) - pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2) - pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2) - # 5. Number (% of total) of PCR dups pairs - PCR_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)] - if (len(PCR_pairs) > 0): - PCR_pairs = PCR_pairs[0] - PCR_pairs = re.sub(".*have been filtered out \(", "", PCR_pairs) - PCR_pairs = re.sub(" / .*", "", PCR_pairs) - PCR_pairs = int(float(PCR_pairs)) + # 5. Number (% of total) of PCR duplicates pairs + pcr_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)] + if (len(pcr_pairs) > 0): + pcr_pairs = pcr_pairs[0] + pcr_pairs = re.sub(".*have been filtered out \(", "", pcr_pairs) + pcr_pairs = re.sub(" / .*", "", pcr_pairs) + pcr_pairs = int(float(pcr_pairs)) else: - PCR_pairs = 0 - pct_PCR = round(PCR_pairs/tot_pairs*100, 2) + pcr_pairs = 0 # 6. Number (%) of final pairs - kept_pairs=tot_pairs-filtered_pairs-PCR_pairs - pct_kept = round(kept_pairs/tot_pairs*100, 2) - - # Open the log file and read its contents - stats_file_path = join(out_dir, prefix + ".stats.txt") - with open(stats_file_path, 'w') as stats_file: - stats_file.write("Sample: {prefix}\n".format(prefix = prefix)) - stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs))) - stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format( - tot_mapped = "{:,}".format(tot_mapped), - pct_mapped = pct_mapped - ) - ) - stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format( - tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped - )) - stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format( - tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs - )) - stats_file.write("Final contacts: {kept_pairs} ({pct_kept}% of all contacts)\n".format( - kept_pairs = "{:,}".format(kept_pairs), pct_kept = pct_kept - )) - stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format( - filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered - )) - stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format( - loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs - )) - stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format( - uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs - )) - stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format( - abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs - )) - stats_file.write(" PCR duplicates: {PCR_pairs} ({pct_PCR}% of all contacts)\n".format( - PCR_pairs = "{:,}".format(PCR_pairs), pct_PCR = pct_PCR - )) - + removed_pairs=filtered_pairs+pcr_pairs + final_pairs=tot_pairs-removed_pairs + + # 7. Final stats stats = { + 'Sample': re.sub(".hicstuff.*", "", basename(log_file)), 'Total read pairs': fastq_pairs, 'Mapped reads': tot_mapped, 'Unmapped reads': tot_unmapped, 'Recovered contacts': tot_pairs, - 'Final contacts': kept_pairs, + 'Final contacts': final_pairs, + 'Removed contacts': removed_pairs, 'Filtered out': filtered_pairs, 'Loops': loops_pairs, 'Uncuts': uncuts_pairs, 'Weirds': abnormal_pairs, - 'PCR duplicates': PCR_pairs + 'PCR duplicates': pcr_pairs } - stats_json_path = join(out_dir, prefix + ".stats.json") - with open(stats_json_path, 'w') as json_file: - json.dump(stats, json_file, indent=4) + return(stats) + +def write_pipeline_stats(stats, out_file): + prefix = stats['Sample'] + fastq_pairs=stats['Total read pairs'] + tot_mapped=stats['Mapped reads'] + tot_unmapped=stats['Unmapped reads'] + tot_pairs=stats['Recovered contacts'] + final_pairs=stats['Final contacts'] + removed_pairs=stats['Removed contacts'] + filtered_pairs=stats['Filtered out'] + loops_pairs=stats['Loops'] + uncuts_pairs=stats['Uncuts'] + abnormal_pairs=stats['Weirds'] + pcr_pairs=stats['PCR duplicates'] + pct_pairs = round(tot_pairs/fastq_pairs*100, 2) + pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2) + pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2) + pct_filtered = round(filtered_pairs/tot_pairs*100, 2) + pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2) + pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2) + pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2) + pct_pcr = round(pcr_pairs/tot_pairs*100, 2) + pct_removed=round(removed_pairs/tot_pairs*100, 2) + pct_final = round(final_pairs/tot_pairs*100, 2) + + # Open the log file and read its contents + _, file_extension = splitext(out_file) + if (file_extension == '.json'): + with open(out_file, 'w') as json_file: + json.dump(stats, json_file, indent=4) + else: + with open(out_file, 'w') as stats_file: + stats_file.write("Sample: {prefix}\n".format(prefix = prefix)) + stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs))) + stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format( + tot_mapped = "{:,}".format(tot_mapped), + pct_mapped = pct_mapped + ) + ) + stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format( + tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped + )) + stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format( + tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs + )) + stats_file.write("Removed contacts: {removed_pairs} ({pct_removed}% of all contacts)\n".format( + removed_pairs = "{:,}".format(removed_pairs), pct_removed = pct_removed + )) + stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format( + filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered + )) + stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format( + loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs + )) + stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format( + uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs + )) + stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format( + abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs + )) + stats_file.write(" PCR duplicates: {pcr_pairs} ({pct_pcr}% of all contacts)\n".format( + pcr_pairs = "{:,}".format(pcr_pairs), pct_pcr = pct_pcr + )) + stats_file.write("Final contacts: {final_pairs} ({pct_final}% of all contacts)\n".format( + final_pairs = "{:,}".format(final_pairs), pct_final = pct_final + )) + +def print_pipeline_stats(stats): + tmp = '.'+stats['Sample']+'.txt' + write_pipeline_stats(stats, tmp) + with open(tmp, 'r') as file: + lines = [line for line in file] + print(''.join(lines)) + os.unlink(tmp) +