Merge pull request #9 from ANSES-Ploufragan/dev

Dev
ANSES-Ploufragan · Jan 20, 2025 · 2069216 · 2069216
2 parents f181dd9 + 81bc038
commit 2069216
Show file tree

Hide file tree

Showing 7 changed files with 95 additions and 8 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,9 @@
+0.2.3.3:
+  - in convert_tbl2json.py, now when no gene found in bed, deduce them from CDS and name them gene_1 .. gene_N (not ORFN 
+  because some prot are named ORF2 prot in PCV2, for an ORF that is not the second one!). It avoids to get a bug
+  for PCV2 viruses for instance
+  - in convert_vcffile_to_readablefile2.py, NOW handle genes in reverse orientation (did not display genes/prot labels previously)
+  - change version 0.2.3.2 -> 0.2.3.3
 0.2.3.2:
   - correct max number of args from 25 to 30 for vvv2_display.py
   - change version 0.2.3.1 -> 0.2.3.2

diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setuptools.setup(
     name="vvv2_display",  # Required
-    version="0.2.3.2",  # Required
+    version="0.2.3.3",  # Required
     description="Viral Variant Visualizer 2 display",  # Optional
     long_description=long_description,
     long_description_content_type="text/markdown",  # Optional (see note above)

diff --git a/src/convert_tbl2json.py b/src/convert_tbl2json.py
@@ -1466,16 +1466,37 @@ def indexlist(item2find, list_or_string):
         f.write("\"virus_id\": \"unkown\", ")
     b_first_cds_found = False
 
+    # **********************************************
     # for genes
     f.write("\"genes\": {")
+    b_gene_found = False
     for i in range(len(names)):
         if types[i] == 'gene':
             if b_first_cds_found:
                 f.write(", ")
 
             f.write("\""+names[i]+"\": ["+str(starts[i])+", "+str(ends[i])+"]")
             b_first_cds_found = True
+            b_gene_found = True
+
+    # ------------------------------------------
+    # special case of not recorded genes
+    # means genes not recorded, like in PCV2,
+    # therefore we must deduce them from proteins, calling them ORF1...ORFN
+    if not b_gene_found:
+        b_first_cds_found = False
+        for i in range(len(names)):
+            if types[i] == 'cds':
+                tmp_name = 'gene_'+str(i+1)
+                if b_first_cds_found:
+                    f.write(", ")
+
+                f.write("\""+tmp_name+"\": ["+str(starts[i])+", "+str(ends[i])+"]")
+                b_first_cds_found = True
+    # ------------------------------------------
+
     f.write("}")
+    # **********************************************
 
     b_first_cds_found = False
     # for proteins
@@ -1530,6 +1551,7 @@ def indexlist(item2find, list_or_string):
 with open(bed_vardict_annot_f, 'w+') as f:
     # no header otherwise bug in vardict
     for i in range(len(names)):
+        # print("for vardict, treat BED contig "+names[i])
         f.write("\t".join([
             chrs[i],
             str(starts_vardict[i]),

diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py
@@ -69,13 +69,21 @@ def find_key_genes(dico_json, genomepos):
     gene = "intergene"
     base_inf_allgenes = ''
     base_sup_allgenes = ''
+
     # print("find_key_genes call pos ("+str(genomepos)+")")
     for typ in dico_json:
         # print(f"dico_json key treated:{key}")
 
         if typ == "genes":
             for key in dico_json[typ]:
                 base_inf , base_sup = dico_json[typ][key][0] , dico_json[typ][key][1]
+
+                # if gene in reverse orientation, change coordinates to make base_inf < base_sup
+                if base_sup < base_inf:
+                    base_tmp = base_sup
+                    base_sup = base_inf
+                    base_inf = base_tmp
+
                 if (base_inf <= genomepos)and(genomepos <= base_sup):
                     # print("GENE base_inf:"+str(base_inf)+" <= genomepos:"+str(genomepos)+" <= base_sup:"+str(base_sup))
                     if gene == 'intergene':
@@ -91,6 +99,7 @@ def find_key_genes(dico_json, genomepos):
                 #     print(f"NOT( base_inf:{base_inf} <= genomepos:{genomepos} <= base_sup:{base_sup} )")
                 #     continue
     # print(f"return gene:{gene}\tbase_inf:{base_inf_allgenes}\tbase_sup:{base_sup_allgenes}")
+
     return gene, base_inf_allgenes, base_sup_allgenes
 
 def find_key_proteins(dico_json, genomepos, gene_res):
@@ -107,6 +116,13 @@ def find_key_proteins(dico_json, genomepos, gene_res):
         if typ == "proteins":
             for key in dico_json[typ]:
                 base_inf , base_sup = dico_json[typ][key][0] , dico_json[typ][key][1]
+
+                 # if gene in reverse orientation, change coordinates to make base_inf < base_sup
+                if base_sup < base_inf:
+                    base_tmp = base_sup
+                    base_sup = base_inf
+                    base_inf = base_tmp
+
                 if (base_inf <= genomepos)and(genomepos <= base_sup):
                     # print("PROT base_inf:"+str(base_inf)+" <= genomepos:"+str(genomepos)+" <= base_sup:"+str(base_sup))
                     if gene_res == 'intergene':
@@ -306,6 +322,7 @@ def is_homopolymer(lseq, rseq, ref, alt):
             pos = genomeposition[i]
             gene_id, base_inf, base_sup = find_key_genes(dico_json, pos)
             protein_id, pbase_inf, pbase_sup = find_key_proteins(dico_json, pos, gene_id)
+            # print("pos gene_id protein_id:"+str(pos)+"\t"+gene_id+"\t"+protein_id)
             line = write_line(temp_counts[i], pos, gene_id, protein_id, dico)
             filout.write(line)
 

diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R
@@ -1,6 +1,6 @@
 #!/usr/bin/env Rscript
 #
-# FT; last modification November 20th 2024
+# FT; last modification January 20th 2025
 # AF; last modification February 16th 2019
 #
 # Description:  This script has been written in order to generate a graph
@@ -226,7 +226,11 @@ genecols = append(genecols, complete_col_set)
 
 # needed to have color labels NOT ORDERED and to choose colors
 gene_id_labels = unique(density$gene_id)
+# print("gene_id_labels BEFORE numbering:")
+# print(gene_id_labels)
 density$gene_id_num = paste(  sprintf("%02d", match(density$gene_id, gene_id_labels) ), ":", density$gene_id) 
+# print("gene_id_labels AFTER numbering:")
+# print(density$gene_id_num)
 
 # if( b_verbose ){
 #   print("protein_id_labels BEFORE removing duplicates:")
@@ -329,7 +333,11 @@ ymm=as.numeric(unlist(ymm))
 
 gnames_label=lapply(nrange, FUN=function(x){ 
                                     glabel = ""
-                                    if( (xma[x] - xmi[x]) > min_glength){
+                                    if( (xma[x] > xmi[x]) &
+                                        ( (xma[x] - xmi[x]) > min_glength) ){
+                                      glabel=gnames[x]
+                                    } else if( (xma[x] < xmi[x]) &
+                                        ( (xmi[x] - xma[x]) > min_glength) ){
                                       glabel=gnames[x]
                                     }
                                     glabel

diff --git a/src/vvv2_display.py b/src/vvv2_display.py
@@ -38,7 +38,7 @@ def __main__():
     b_test_vvv2_display                    = False # ok 2022 05 05 complet, partial tc
     b_test_convert_tbl2json                = False # ok 2022 04 26 complete tc,
     b_test_correct_multicontig_vardict_vcf = False # ok 2022 04 29 partial tc
-    b_test_convert_vcffile_to_readable     = False # ok 2024 03 27 complete tc,
+    b_test_convert_vcffile_to_readable     = False # ok 2025 01 20 complete tc,
     b_test_correct_covdepth_f              = False # ok 2024 01 20 tc,
     b_test_visualize_snp_v4                = False # ok 2024 03 27 complete tc,
     b_test = False
@@ -175,6 +175,7 @@ def __main__():
     # parser.set_defaults(b_test_visualize_snp_v4=False)
     parser.set_defaults(b_verbose=False)
     parser.set_defaults(var_significant_threshold=7)
+    parser.set_defaults(b_log_scale=True)
     if var_significant_threshold is not None:
         var_significant_threshold_str = str(var_significant_threshold)
 
@@ -380,6 +381,39 @@ def __main__():
         print(prog_tag + " END")
         # --------------------------------------------------------------
 
+        # --------------------------------------------------------------
+        # COMPLETE GENOME PCV2
+        # in files
+        pass_annot_f  = test_dir + "/res3_vadr_pass.tbl" # from vadr results
+        fail_annot_f  = test_dir + "/res3_vadr_fail.tbl" # from vadr results
+        seq_stat_f    = test_dir + "/res3_vadr.seqstat"  # from vadr results
+        vardict_vcf_f = test_dir + "/res3_vardict.vcf"   # from lofreq results  
+        cov_depth_f   = test_dir + "/res3_covdepth.txt"  # from samtools results  
+        # tmp out files
+        json_annot_f  = test_dir + "/res3_vadr.json"     # from convert_tbl2json.py
+        contig_limits_f= test_dir + "/contig3_limits.txt"
+        contig_names_f = test_dir + "/contig3_names.txt"
+        cov_depth_corr_f= test_dir + "/res3_covdepth_corr.txt"
+        # final out file
+        png_var_f     = test_dir + "/res3_vvv2.png"     # from ...
+        cmd = ' '.join([ dir_path + "/vvv2_display.py",
+                            "--pass_tbl_f", pass_annot_f,
+                            "--fail_tbl_f", fail_annot_f,
+                            "--seq_stat_f", seq_stat_f,
+                            "--vcf_f", vardict_vcf_f,
+                            "--contig_limits_f", contig_limits_f,   
+                            "--contig_names_f", contig_names_f,   
+                            "--cov_depth_f", cov_depth_f,  
+                            "--cov_depth_corr_f", cov_depth_corr_f,              
+                            "--png_var_f", png_var_f,
+                            "--var_significant_threshold", var_significant_threshold_str
+                    ])
+        print(prog_tag + " START")    
+        print(prog_tag + " cmd:" + cmd)
+        os.system(cmd)
+        print(prog_tag + " END")
+        # --------------------------------------------------------------
+
         sys.exit()
     # ------------------------------------------------------------------    
 
@@ -483,8 +517,8 @@ def __main__():
         # vardict_vcf_f = test_dir + "/res_vardict.vcf"  # from vardict results
         correct_vcf_f     = test_dir + "/res_correct.vcf"                  
         json_annot_f      = test_dir + "/res_vadr.json"
-        snp_loc_f         =  test_dir + "/res_snp.txt"
-        snp_loc_summary_f =  test_dir + "/res_snp_summary.txt"
+        snp_loc_f         = test_dir + "/res_snp.txt"
+        snp_loc_summary_f = test_dir + "/res_snp_summary.txt"
 
     if(snp_loc_f == ''):
        snp_loc_f = vardict_vcf_f

diff --git a/vvv2_display.xml b/vvv2_display.xml
@@ -1,6 +1,6 @@
-<tool id="vvv2_display" name="vvv2_display: Display SNP proportions and CDS of an assembly in png image" version="0.2.3.2" python_template_version="3.9">
+<tool id="vvv2_display" name="vvv2_display: Display SNP proportions and CDS of an assembly in png image" version="0.2.3.3" python_template_version="3.9">
     <requirements>
-      <requirement type="package" version="0.2.3.2">vvv2_display</requirement>	      
+      <requirement type="package" version="0.2.3.3">vvv2_display</requirement>	      
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
        vvv2_display.py -f '$vadr_fail_annotation' -p '$vadr_pass_annotation' -s '$vadr_seqstat' -n '$vardict_vcf' -r '$snp_img' -w '$var_significant_thres' -o '$cov_depth' -e '$cov_depth_corr' -t '$snp_loc' -u '$snp_loc_summary' -j '$json_annot' -k '$bed_annot' -l '$correct_vcf' -m '$contig_limits' $cov_depth_scale