Skip to content

Commit

Permalink
Smaller bug fixes and improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
aljpetri committed Mar 21, 2023
1 parent d592410 commit b8946d5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 45 deletions.
10 changes: 0 additions & 10 deletions GraphGeneration.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,17 +774,7 @@ def generateGraphfromIntervals(all_intervals_for_graph, k, delta_len, read_len_d
#edge_support[previous_node, name] = []
#edge_support[previous_node, name].append(r_id)
DG.add_edge(previous_node, name, length=length)
cycle_added = isCyclic(DG)
cycle_added2 = cycle_finder(DG,previous_node)
#cycle_added3 = cycle_finder(DG,name)
#if cycle_added != cycle_added2 and cycle_added != cycle_added3:
# print(cycle_added,cycle_added2,cycle_added3)
# sys.exit(1)
#cycle_added = isCyclic2(DG, name)
#try:
# topo_sort=nx.topological_sort(DG)
# #print(topo_sort[0])
#except:
if cycle_added2:
DG.remove_edge(previous_node, name)
name = str(inter[0]) + ", " + str(inter[1]) + ", " + str(r_id)
Expand Down
17 changes: 6 additions & 11 deletions IsoformGeneration.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def compute_equal_reads2(DG,support):
node_support_left=set(support)
visited_nodes_isoforms={}
isoforms={}
all_supp=set(support)
#indicates whether we want to merge a true subisoform into another isoform (i.e. the isoform is a continuous sublist of the longer one)
merge_sub_isos=True
#we iterate as long as not all support has been allocated to a path
Expand Down Expand Up @@ -210,7 +209,7 @@ def run_spoa(reads, spoa_out_file, spoa_path):
curr_best_seqs is an array with q_id, pos1, pos2
the current read is on index 0 in curr_best_seqs array
"""
def generate_isoform_using_spoa(curr_best_seqs,reads, work_dir,outfolder,batch_id, max_seqs_to_spoa=200,iso_abundance=1):
def generate_isoform_using_spoa(curr_best_seqs,reads, work_dir,outfolder,batch_id,iso_abundance, max_seqs_to_spoa=200):
print("Generating the Isoforms")
mapping = {}
consensus_name="spoa"+str(batch_id)+"merged.fasta"
Expand Down Expand Up @@ -421,13 +420,8 @@ def parse_cigar_diversity_isoform_level_new(cigar_tuples, delta,delta_len,merge_
#we have a match
else:
if this_start_pos < first_match:
#print("before first")
#print(elem)

before_first_matches += cig_len
elif this_start_pos >= last_match:
#print("after last")
#print(elem)
after_last_matches+=cig_len
#we know we have a match, but is it significant?
""""#we have iterated over the full cigar string and now know where the last significant match is located
Expand Down Expand Up @@ -549,12 +543,13 @@ def align_to_merge(consensus1,consensus2,delta,delta_len,merge_sub_isoforms_3,me
#print(s2_alignment)
#print(cigar_tuples)
#print(overall_len)
windowsize=20
alignment_threshold=0.8
windowsize=30
alignment_threshold=0.7
start_match=find_first_significant_match(s1_alignment,s2_alignment,windowsize,alignment_threshold)
end_match=find_first_significant_match(s1_alignment[::-1],s2_alignment[::-1],windowsize,alignment_threshold)
#print("Start:",start_match," end:",end_match)
if not start_match and not end_match:
if start_match<0 or end_match<0:
#if not start_match and not end_match:
return False
end_match_pos=overall_len-end_match
#print("EMP",end_match_pos)
Expand Down Expand Up @@ -797,7 +792,7 @@ def generate_isoforms(DG,all_reads,reads,work_dir,outfolder,batch_id,merge_sub_i
generate_isoforms_new(equal_reads, all_reads, work_dir, outfolder, batch_id, max_seqs_to_spoa,
iso_abundance,new_consensuses)
else:
generate_isoform_using_spoa(equal_reads, all_reads, work_dir, outfolder, batch_id, max_seqs_to_spoa, iso_abundance)
generate_isoform_using_spoa(equal_reads, all_reads, work_dir, outfolder, batch_id, iso_abundance, max_seqs_to_spoa)


DEBUG=False
Expand Down
38 changes: 21 additions & 17 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,8 @@ def main(args):

tmp_cnt = 0
all_intervals_for_graph = {}
#profiler = Profiler()
#profiler.start()
for r_id in sorted(reads): # , reverse=True):

# for r_id in reads:
Expand Down Expand Up @@ -887,16 +889,17 @@ def main(args):
print("Generating the graph")
all_batch_reads_dict[batch_id] = new_all_reads
read_len_dict = get_read_lengths(all_reads)

#profiler.stop()
#profiler.print()
#print("Used for GraphGen",", ",k_size,", ",delta_len,", ",read_len_dict,", ",new_all_reads)
import time
start_time = time.time()
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
DG, known_intervals, node_overview_read, reads_for_isoforms, reads_list = generateGraphfromIntervals(
all_intervals_for_graph, k_size, delta_len, read_len_dict,new_all_reads)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#DG, known_intervals, node_overview_read, reads_for_isoforms, reads_list = generateGraphfromIntervalsOld(
# all_intervals_for_graph, k_size, delta_len, read_len_dict, new_all_reads)
print("--- %s seconds ---" % (time.time() - start_time))
Expand Down Expand Up @@ -935,11 +938,11 @@ def main(args):
#in_edges = DG.in_edges(node, data=True)
#print("in_edges:", in_edges)
mode=args.slow
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
simplifyGraph(DG, new_all_reads,work_dir,k_size,delta_len,mode)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#snapshot2 = tracemalloc.take_snapshot()
#print(snapshot2)
#print("#Nodes for DG: " + str(DG.number_of_nodes()) + " , #Edges for DG: " + str(DG.number_of_edges()))
Expand All @@ -965,11 +968,11 @@ def main(args):
merge_sub_isoforms_5=True
delta_iso_len_3=5
delta_iso_len_5=5"""
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
generate_isoforms(DG, new_all_reads, reads_for_isoforms, work_dir, outfolder,batch_id, merge_sub_isoforms_3,merge_sub_isoforms_5,delta,delta_len, delta_iso_len_3, delta_iso_len_5,iso_abundance,max_seqs_to_spoa)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
#snapshot3 = tracemalloc.take_snapshot()
#print(snapshot3)
print("Isoforms generated")
Expand Down Expand Up @@ -998,15 +1001,16 @@ def main(args):
#with open(os.path.join(outfolder, "all_batches_reads.txt"), 'wb') as file:
# file.write(pickle.dumps(all_batch_reads_dict))
print("Starting batch merging")
profiler = Profiler()
profiler.start()
#profiler = Profiler()
#profiler.start()
#if max_batchid>0:
if not args.parallel:
print("Merging the batches with linear strategy")

merge_batches(max_batchid, work_dir, outfolder, new_all_reads, merge_sub_isoforms_3, merge_sub_isoforms_5, delta,
delta_len, max_seqs_to_spoa, delta_iso_len_3, delta_iso_len_5,iso_abundance,args.rc_identity_threshold)
profiler.stop()
print(profiler)
#profiler.stop()
#profiler.print()
# eprint("tot_before:", tot_errors_before)
# eprint("tot_after:", sum(tot_errors_after.values()), tot_errors_after)
# eprint(len(corrected_reads))
Expand Down
14 changes: 7 additions & 7 deletions pipeline_simulations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ echo
echo "Running isONclust"
echo

isONclust --t $num_cores --ont --fastq $raw_reads \
--outfolder $outfolder/clustering --k 8 --w 8 --min_shared 3 #old k=10 oldw=12
isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
--fastq $raw_reads --outfolder $outfolder/clustering/fastq_files
#isONclust --t $num_cores --ont --fastq $raw_reads \
# --outfolder $outfolder/clustering --k 8 --w 8 --min_shared 3 #old k=10 oldw=12
#isONclust write_fastq --N $iso_abundance --clusters $outfolder/clustering/final_clusters.tsv \
# --fastq $raw_reads --outfolder $outfolder/clustering/fastq_files
echo
echo "Finished isONclust"
echo
Expand All @@ -48,9 +48,9 @@ echo
echo
echo "Running isONcorrect"
echo
#run_isoncorrect --t $num_cores --fastq_folder $raw_reads --outfolder $outfolder/correction/
run_isoncorrect --t $num_cores --fastq_folder $raw_reads --outfolder $outfolder/correction/
#the following line is without isONclust:
run_isoncorrect --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --outfolder $outfolder/correction/
#run_isoncorrect --t $num_cores --fastq_folder $outfolder/clustering/fastq_files --outfolder $outfolder/correction/

echo
echo "Finished isONcorrect"
Expand All @@ -62,7 +62,7 @@ echo "Running isONform"
echo
#python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches
#python3.11 $isONform_folder/main.py --fastq $outfolder/correction/*/*.fastq --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/singleisoforms --iso_abundance $iso_abundance
python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 5 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --merge_sub_isoforms_3 --merge_sub_isoforms_5 --delta_iso_len_3 5 --delta_iso_len_5 5 --slow
python3.11 $isONform_folder/isONform_parallel.py --fastq_folder $outfolder/correction/ --exact_instance_limit 50 --k 20 --w 31 --xmin 14 --xmax 80 --max_seqs_to_spoa 200 --delta_len 10 --outfolder $outfolder/isoforms --iso_abundance $iso_abundance --split_wrt_batches --merge_sub_isoforms_3 --merge_sub_isoforms_5 --delta_iso_len_3 30 --delta_iso_len_5 50 --slow
echo
echo "Finished isONform"
echo
Expand Down

0 comments on commit b8946d5

Please sign in to comment.