Merge branch 'dev'

ANSES-Ploufragan · Feb 4, 2025 · ab43578 · ab43578
2 parents e6c85c8 + 636d6ef
commit ab43578
Show file tree

Hide file tree

Showing 4 changed files with 105 additions and 15 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,6 @@
+0.2.3.5:
+  - bug correction: in convert_tbl2json.py, modified to take into account vadr flu annotations (sig_peptide, note just after gene, etc.)
+  - change version 0.2.3.4 -> 0.2.3.5
 0.2.3.4:
   - bug correction: in convert_tbl2json.py, handle case of ncRNA when gene is expected (for dengue example)
   - correct a bug when gene displayed between product and protein_id
@@ -6,8 +9,8 @@
   - change version 0.2.3.3 -> 0.2.3.4
 0.2.3.3:
   - in convert_tbl2json.py, now when no gene found in bed, deduce them from CDS and name them gene_1 .. gene_N (not ORFN 
-  because some prot are named ORF2 prot in PCV2, for an ORF that is not the second one!). It avoids to get a bug
-  for PCV2 viruses for instance
+    because some prot are named ORF2 prot in PCV2, for an ORF that is not the second one!). This avoids a bug
+    with PCV2 viruses, for instance
   - in convert_vcffile_to_readablefile2.py, NOW handle genes in reverse orientation (did not display genes/prot labels previously)
   - change version 0.2.3.2 -> 0.2.3.3
 0.2.3.2:
@@ -63,6 +66,5 @@
   - pyvcf replaced by pysam to improve compatibility / long-term maintenance
   - consequence: PYTHON_SCRIPTS/convert_vcffile_to_readablefile.py replaced by PYTHON_SCRIPTS/convert_vcffile_to_readablefile2.py
   - vvv2.py renamed vvv2_display.py (because this program replaces only the visualization part of vvv)
-
 0.1.0-alpha: first version
 
diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setuptools.setup(
     name="vvv2_display",  # Required
-    version="0.2.3.4",  # Required
+    version="0.2.3.5",  # Required
     description="Viral Variant Visualizer 2 display",  # Optional
     long_description=long_description,
     long_description_content_type="text/markdown",  # Optional (see note above)

diff --git a/src/convert_tbl2json.py b/src/convert_tbl2json.py
@@ -49,7 +49,8 @@
 
 material_type_list = ['gene',
                       'misc_feature',
-                      'mat_peptide']
+                      'mat_peptide',
+                      'sig_peptide']
 material_type = '|'.join(material_type_list)
 
 parser = argparse.ArgumentParser()
@@ -445,6 +446,26 @@ def indexlist(item2find, list_or_string):
                         gene_name = ''
                         continue
 
+                    # added 2025 01 29 to handle sig_peptide of vadr flu annotation
+                    elif line_fields[2] == 'sig_peptide':
+                        if b_verbose:
+                            print(' '.join(['sig_peptide',
+                                        gene_start,
+                                        gene_end,
+                                        gene_name])
+                                )
+                        b_next_is_gene = False
+                        b_next_is_product = True                        
+
+                        # ask to treat current sig_peptide info
+                        cds_start = line_fields[0]
+                        cds_end = line_fields[1]
+                        cds_start = re.sub(non_alphanum, '', cds_start)
+                        cds_end   = re.sub(non_alphanum, '', cds_end)
+                        curr_type = 'sig_peptide'
+                        gene_name = ''
+                        continue
+
                     # section added 2024 09 24, for when 2 gene sections follow each other
                     # without any other description
                     elif line_fields[2] == 'gene':
@@ -864,6 +885,37 @@ def indexlist(item2find, list_or_string):
                         b_next_is_gene = False
                         b_next_is_product = True
                         continue
+                    elif (len(line_fields) == 3) and (line_fields[2] == 'mat_peptide'):
+                        cds_start = line_fields[0]
+                        cds_end   = line_fields[1]
+                        cds_start = re.sub(non_alphanum, '', cds_start)
+                        cds_end   = re.sub(non_alphanum, '', cds_end)
+                        b_next_is_product = True
+                        b_next_is_gene = False
+                        curr_type = 'CDS'
+                        if b_verbose:
+                            print("\t".join([
+                                "TMP_MAT_PEPTIDE:"+line_fields[2],
+                                cds_start,
+                                cds_end
+                            ]))
+                        continue
+                    # added 2025 01 29 to handle sig_peptide in vadr flu annotations
+                    elif line_fields[2] == 'sig_peptide':
+                        cds_start = line_fields[0]
+                        cds_end   = line_fields[1]
+                        cds_start = re.sub(non_alphanum, '', cds_start)
+                        cds_end   = re.sub(non_alphanum, '', cds_end)
+                        b_next_is_product = True
+                        b_next_is_gene = False
+                        curr_type = 'CDS'
+                        if b_verbose:
+                            print("\t".join([
+                                "TMP_SIG_PEPTIDE:"+line_fields[2],
+                                cds_start,
+                                cds_end
+                            ]))
+                        continue
                     else:
                         print("line_fields:"+str(line_fields)+", line "+ str(sys._getframe().f_lineno) )
                         sys.exit(prog_tag + "[Error] Case not encountered line "+ str(sys._getframe().f_lineno) )
@@ -956,8 +1008,28 @@ def indexlist(item2find, list_or_string):
                             b_next_is_protein_id = False    
                             b_next_is_gene = True # added 2024 10 01                    
                             continue
-
-                    if line_fields[0] == 'ncRNA_class':
+                    # added 2025 01 29 to handle function provided in flu vadr annotations
+                    elif line_fields[0] == 'function':
+                        if b_verbose or b_check_gene_prot_rec:
+                            print("\t".join([
+                                    prog_tag,
+                                    "function found:",
+                                    ",".join(line_fields),
+                                    ", line "+str(frame.f_lineno)
+                            ]))
+                        continue
+                    # added 2025 01 29 to handle note provided in flu vadr annotations just after gene
+                    # (such notes do not bring useful info, so the line is only reported and skipped)
+                    elif line_fields[0] == 'note':
+                        if b_verbose or b_check_gene_prot_rec:
+                            print("\t".join([
+                                    prog_tag,
+                                    "note found:",
+                                    ",".join(line_fields),
+                                    ", line "+str(frame.f_lineno)
+                            ]))
+                        continue
+                    elif line_fields[0] == 'ncRNA_class':
                         protein_id = line_fields[1]
                         tmp_name = protein_id
                         b_next_is_protein_id = False
@@ -1316,7 +1388,7 @@ def indexlist(item2find, list_or_string):
 
                         continue
 
-                elif line_fields[2] == 'gene':
+                elif (len(line_fields) == 3)and(line_fields[2] == 'gene'):
                     gene_start = line_fields[0]
                     gene_end   = line_fields[1]
                     gene_start = re.sub(non_alphanum, '', gene_start)
@@ -1331,7 +1403,7 @@ def indexlist(item2find, list_or_string):
                             "for line "+str(line_fields)
                         ]))
 
-                elif line_fields[2] == 'CDS':
+                elif (len(line_fields) == 3)and(line_fields[2] == 'CDS'):
                     cds_start = line_fields[0]
                     cds_end   = line_fields[1]
                     cds_start = re.sub(non_alphanum, '', cds_start)
@@ -1346,7 +1418,7 @@ def indexlist(item2find, list_or_string):
                             cds_end
                         ]))
 
-                elif line_fields[2] == 'ncRNA':
+                elif (len(line_fields) == 3)and(line_fields[2] == 'ncRNA'):
                     cds_start = line_fields[0]
                     cds_end   = line_fields[1]
                     cds_start = re.sub(non_alphanum, '', cds_start)
@@ -1361,7 +1433,7 @@ def indexlist(item2find, list_or_string):
                             cds_end
                         ]))
 
-                elif line_fields[2] == 'misc_feature':
+                elif (len(line_fields) == 3)and(line_fields[2] == 'misc_feature'):
                     misc_feature_start = line_fields[0]
                     misc_feature_end   = line_fields[1]
                     misc_feature_start = re.sub(non_alphanum, '', misc_feature_start)
@@ -1375,7 +1447,7 @@ def indexlist(item2find, list_or_string):
                             misc_feature_end
                         ]))
 
-                elif line_fields[2] == 'mat_peptide':
+                elif (len(line_fields) == 3)and(line_fields[2] == 'mat_peptide'):
                     cds_start = line_fields[0]
                     cds_end   = line_fields[1]
                     cds_start = re.sub(non_alphanum, '', cds_start)
@@ -1390,7 +1462,23 @@ def indexlist(item2find, list_or_string):
                             cds_end
                         ]))
 
-                elif line_fields[2] == 'stem_loop':
+                # added 2025 01 29 to handle sig_peptide in vadr flu annotations
+                elif (len(line_fields) == 3)and(line_fields[2] == 'sig_peptide'):
+                    cds_start = line_fields[0]
+                    cds_end   = line_fields[1]
+                    cds_start = re.sub(non_alphanum, '', cds_start)
+                    cds_end   = re.sub(non_alphanum, '', cds_end)
+                    b_next_is_product = True
+                    b_next_is_gene = False
+                    curr_type = 'CDS'
+                    if b_verbose:
+                        print("\t".join([
+                            "TMP_SIG_PEPTIDE:"+line_fields[2],
+                            cds_start,
+                            cds_end
+                        ]))
+
+                elif (len(line_fields) == 3)and(line_fields[2] == 'stem_loop'):
                     misc_feature_start = line_fields[0]
                     misc_feature_end   = line_fields[1]
                     misc_feature_start = re.sub(non_alphanum, '', misc_feature_start)

diff --git a/vvv2_display.xml b/vvv2_display.xml
@@ -1,6 +1,6 @@
-<tool id="vvv2_display" name="vvv2_display: Display SNP proportions and CDS of an assembly in png image" version="0.2.3.4" python_template_version="3.9">
+<tool id="vvv2_display" name="vvv2_display: Display SNP proportions and CDS of an assembly in png image" version="0.2.3.5" python_template_version="3.9">
     <requirements>
-      <requirement type="package" version="0.2.3.4">vvv2_display</requirement>	      
+      <requirement type="package" version="0.2.3.5">vvv2_display</requirement>	      
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
        vvv2_display.py -f '$vadr_fail_annotation' -p '$vadr_pass_annotation' -s '$vadr_seqstat' -n '$vardict_vcf' -r '$snp_img' -w '$var_significant_thres' -o '$cov_depth' -e '$cov_depth_corr' -t '$snp_loc' -u '$snp_loc_summary' -j '$json_annot' -k '$bed_annot' -l '$correct_vcf' -m '$contig_limits' $cov_depth_scale