Skip to content

Commit

Permalink
Smaller bug fixes and improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
aljpetri committed Mar 21, 2023
1 parent d592410 commit b8946d5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 45 deletions.
10 changes: 0 additions & 10 deletions GraphGeneration.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,17 +774,7 @@ def generateGraphfromIntervals(all_intervals_for_graph, k, delta_len, read_len_d
#edge_support[previous_node, name] = []
#edge_support[previous_node, name].append(r_id)
DG.add_edge(previous_node, name, length=length)
cycle_added = isCyclic(DG)
cycle_added2 = cycle_finder(DG,previous_node)
#cycle_added3 = cycle_finder(DG,name)
#if cycle_added != cycle_added2 and cycle_added != cycle_added3:
# print(cycle_added,cycle_added2,cycle_added3)
# sys.exit(1)
#cycle_added = isCyclic2(DG, name)
#try:
# topo_sort=nx.topological_sort(DG)
# #print(topo_sort[0])
#except:
if cycle_added2:
DG.remove_edge(previous_node, name)
name = str(inter[0]) + ", " + str(inter[1]) + ", " + str(r_id)
Expand Down
17 changes: 6 additions & 11 deletions IsoformGeneration.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def compute_equal_reads2(DG,support):
node_support_left=set(support)
visited_nodes_isoforms={}
isoforms={}
all_supp=set(support)
#indicates whether we want to merge a true subisoform into another isoform (i.e. the isoform is a continuous sublist of the longer one)
merge_sub_isos=True
#we iterate as long as not all support has been allocated to a path
Expand Down Expand Up @@ -210,7 +209,7 @@ def run_spoa(reads, spoa_out_file, spoa_path):
curr_best_seqs is an array with q_id, pos1, pos2
the current read is on index 0 in curr_best_seqs array
"""
def generate_isoform_using_spoa(curr_best_seqs,reads, work_dir,outfolder,batch_id, max_seqs_to_spoa=200,iso_abundance=1):
def generate_isoform_using_spoa(curr_best_seqs,reads, work_dir,outfolder,batch_id,iso_abundance, max_seqs_to_spoa=200):
print("Generating the Isoforms")
mapping = {}
consensus_name="spoa"+str(batch_id)+"merged.fasta"
Expand Down Expand Up @@ -421,13 +420,8 @@ def parse_cigar_diversity_isoform_level_new(cigar_tuples, delta,delta_len,merge_
#we have a match
else:
if this_start_pos < first_match:
#print("before first")
#print(elem)

before_first_matches += cig_len
elif this_start_pos >= last_match:
#print("after last")
#print(elem)
after_last_matches+=cig_len
#we know we have a match, but is it significant?
""""#we have iterated over the full cigar string and now know where the last significant match is located
Expand Down Expand Up @@ -549,12 +543,13 @@ def align_to_merge(consensus1,consensus2,delta,delta_len,merge_sub_isoforms_3,me
#print(s2_alignment)
#print(cigar_tuples)
#print(overall_len)
windowsize=20
alignment_threshold=0.8
windowsize=30
alignment_threshold=0.7
start_match=find_first_significant_match(s1_alignment,s2_alignment,windowsize,alignment_threshold)
end_match=find_first_significant_match(s1_alignment[::-1],s2_alignment[::-1],windowsize,alignment_threshold)
#print("Start:",start_match," end:",end_match)
if not start_match and not end_match:
if start_match<0 or end_match<0:
#if not start_match and not end_match:
return False
end_match_pos=overall_len-end_match
#print("EMP",end_match_pos)
Expand Down Expand Up @@ -797,7 +792,7 @@ def generate_isoforms(DG,all_reads,reads,work_dir,outfolder,batch_id,merge_sub_i
generate_isoforms_new(equal_reads, all_reads, work_dir, outfolder, batch_id, max_seqs_to_spoa,
iso_abundance,new_consensuses)
else:
generate_isoform_using_spoa(equal_reads, all_reads, work_dir, outfolder, batch_id, max_seqs_to_spoa, iso_abundance)
generate_isoform_using_spoa(equal_reads, all_reads, work_dir, outfolder, batch_id, iso_abundance, max_seqs_to_spoa)


DEBUG=False
Expand Down
38 changes: 21 additions & 17 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,8 @@ def main(args):

tmp_cnt = 0
all_intervals_for_graph = {}
#profiler = Profiler()
#profiler.start()
for r_id in sorted(reads): # , reverse=True):

# for r_id in reads:
Expand Down Expand Up @@ -887,16 +889,17 @@ def main(args):
print("Generating the graph")
all_batch_reads_dict[batch_id] = new_all_reads
read_len_dict = get_read_lengths(all_reads)

#profiler.stop()
#profiler.print()
#print("Used for GraphGen",", ",k_size,", ",delta_len,", ",read_len_dict,", ",new_all_reads)
import time
start_time = time.time()
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
DG, known_intervals, node_overview_read, reads_for_isoforms, reads_list = generateGraphfromIntervals(
all_intervals_for_graph, k_size, delta_len, read_len_dict,new_all_reads)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#DG, known_intervals, node_overview_read, reads_for_isoforms, reads_list = generateGraphfromIntervalsOld(
# all_intervals_for_graph, k_size, delta_len, read_len_dict, new_all_reads)
print("--- %s seconds ---" % (time.time() - start_time))
Expand Down Expand Up @@ -935,11 +938,11 @@ def main(args):
#in_edges = DG.in_edges(node, data=True)
#print("in_edges:", in_edges)
mode=args.slow
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
simplifyGraph(DG, new_all_reads,work_dir,k_size,delta_len,mode)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#snapshot2 = tracemalloc.take_snapshot()
#print(snapshot2)
#print("#Nodes for DG: " + str(DG.number_of_nodes()) + " , #Edges for DG: " + str(DG.number_of_edges()))
Expand All @@ -965,11 +968,11 @@ def main(args):
merge_sub_isoforms_5=True
delta_iso_len_3=5
delta_iso_len_5=5"""
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
generate_isoforms(DG, new_all_reads, reads_for_isoforms, work_dir, outfolder,batch_id, merge_sub_isoforms_3,merge_sub_isoforms_5,delta,delta_len, delta_iso_len_3, delta_iso_len_5,iso_abundance,max_seqs_to_spoa)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#snapshot3 = tracemalloc.take_snapshot()
#print(snapshot3)
print("Isoforms generated")
Expand Down Expand Up @@ -998,15 +1001,16 @@ def main(args):
#with open(os.path.join(outfolder, "all_batches_reads.txt"), 'wb') as file:
# file.write(pickle.dumps(all_batch_reads_dict))
print("Starting batch merging")
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
#if max_batchid>0:
if not args.parallel:
print("Merging the batches with linear strategy")

merge_batches(max_batchid, work_dir, outfolder, new_all_reads, merge_sub_isoforms_3, merge_sub_isoforms_5, delta,
delta_len, max_seqs_to_spoa, delta_iso_len_3, delta_iso_len_5,iso_abundance,args.rc_identity_threshold)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
# eprint("tot_before:", tot_errors_before)
# eprint("tot_after:", sum(tot_errors_after.values()), tot_errors_after)
# eprint(len(corrected_reads))
Expand Down
14 changes: 7 additions & 7 deletions pipeline_simulations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ echo
echo "Running isONclust"
echo

isONclust --t $num_cores --ont --fastq $raw_reads \
--outfolder $outfolder/clustering --k 8 --w 8 --min_shared 3 #old k=10 oldw=12
isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
--fastq $raw_reads --outfolder $outfolder/clustering/fastq_files
#isONclust --t $num_cores --ont --fastq $raw_reads \
# --outfolder $outfolder/clustering --k 8 --w 8 --min_shared 3 #old k=10 oldw=12
#isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
# --fastq $raw_reads --outfolder $outfolder/clustering/fastq_files
echo
echo "Finished isONclust"
echo
Expand All @@ -48,9 +48,9 @@ echo
echo
echo "Running isONcorrect"
echo
#run_isoncorrect --t $num_cores --fastq_folder $raw_reads --outfolder $outfolder/correction/
run_isoncorrect --t $num_cores --fastq_folder $raw_reads --outfolder $outfolder/correction/
#the following line is without isONclust:
run_isoncorrect --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --outfolder $outfolder/correction/
#run_isoncorrect --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --outfolder $outfolder/correction/

echo
echo "Finished isONcorrect"
Expand All @@ -62,7 +62,7 @@ echo "Running isONform"
echo
#python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches
#python3.11 $isONform_folder/main.py --fastq $outfolder/correction/*/*.fastq --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/singleisoforms --iso_abundance $iso_abundance
python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --merge_sub_isoforms_3 --merge_sub_isoforms_5 --delta_iso_len_3 5 --delta_iso_len_5 5 --slow
python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --merge_sub_isoforms_3 --merge_sub_isoforms_5 --delta_iso_len_3 30 --delta_iso_len_5 50 --slow
echo
echo "Finished isONform"
echo
Expand Down

0 comments on commit b8946d5

Please sign in to comment.