From 4889f1c6d8846b82d68a62fac52091d066cd84a6 Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Sat, 7 Dec 2024 20:02:57 -0500 Subject: [PATCH] fix high-coverage issue for ONT --- CommandLines.h | 2 +- Overlaps.cpp | 31 ++++++++-------- anchor.cpp | 95 ++++++++++++++++++++++++++++++++++++++++++++------ ecovlp.cpp | 9 ++--- 4 files changed, 105 insertions(+), 32 deletions(-) diff --git a/CommandLines.h b/CommandLines.h index 7ab486e..951bc21 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.22.0-r689" +#define HA_VERSION "0.23.0-r691" #define VERBOSE 0 diff --git a/Overlaps.cpp b/Overlaps.cpp index 3c2005b..d665446 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -1136,7 +1136,7 @@ void set_reverse_overlap(ma_hit_t* dest, ma_hit_t* source) -void normalize_ma_hit_t_single_side_advance(ma_hit_t_alloc* sources, long long num_sources) +void normalize_ma_hit_t_single_side_advance(ma_hit_t_alloc* sources, long long num_sources, uint32_t recuse_el) { double startTime = Get_T(); @@ -1160,38 +1160,35 @@ void normalize_ma_hit_t_single_side_advance(ma_hit_t_alloc* sources, long long n ///if(index != -1 && sources[tn].buffer[index].del == 0) - if(index != -1) - { + if(index != -1) { is_del = 0; - if(sources[i].buffer[j].del || sources[tn].buffer[index].del) - { + if(sources[i].buffer[j].del || sources[tn].buffer[index].del) { is_del = 1; } qLen_0 = Get_qe(sources[i].buffer[j]) - Get_qs(sources[i].buffer[j]); qLen_1 = Get_qe(sources[tn].buffer[index]) - Get_qs(sources[tn].buffer[index]); - if(qLen_0 == qLen_1) - { + if(qLen_0 == qLen_1) { ///qn must be not equal to tn ///make sources[qn] = sources[tn] if qn > tn - if(qn < tn) - { + if(qn < tn) { set_reverse_overlap(&(sources[tn].buffer[index]), &(sources[i].buffer[j])); } - } - else if(qLen_0 > qLen_1) - { + } else if(qLen_0 > qLen_1) { set_reverse_overlap(&(sources[tn].buffer[index]), &(sources[i].buffer[j])); } + + if(recuse_el && sources[i].buffer[j].el && sources[tn].buffer[index].el) is_del = 0; sources[i].buffer[j].del = is_del; sources[tn].buffer[index].del = is_del; - } - else ///means this edge just occurs in one direction - { + } else {///means this edge just occurs in one direction set_reverse_overlap(&ele, &(sources[i].buffer[j])); sources[i].buffer[j].del = ele.del = 1; + if(recuse_el && sources[i].buffer[j].el && ele.el) { + sources[i].buffer[j].del = ele.del = 0; + } add_ma_hit_t_alloc(&(sources[tn]), &ele); } } @@ -38700,9 +38697,9 @@ ma_sub_t **coverage_cut_ptr, int debug_g) ///it's hard to say which function is better ///normalize_ma_hit_t_single_side(sources, n_read); - normalize_ma_hit_t_single_side_advance(sources, n_read); + normalize_ma_hit_t_single_side_advance(sources, n_read, asm_opt.is_ont); // normalize_ma_hit_t_single_side_advance_mult(sources, n_read, asm_opt.thread_num); - normalize_ma_hit_t_single_side_advance(reverse_sources, n_read); + normalize_ma_hit_t_single_side_advance(reverse_sources, n_read, 0); // normalize_ma_hit_t_single_side_advance_mult(reverse_sources, n_read, asm_opt.thread_num); if (ha_opt_triobin(&asm_opt)) diff --git a/anchor.cpp b/anchor.cpp index 0adaf37..aeff331 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -1920,10 +1920,10 @@ void lchain_qgen_mcopy(Candidates_list* cl, overlap_region_alloc* ol, uint32_t r void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, uint32_t apend_be, uint64_t max_n_chain, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, - uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) + uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp, uint64_t ocv_w) { // fprintf(stderr, "+[M::%s] chain_cutoff::%u\n", __func__, chain_cutoff); - uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch; overlap_region *r, t; ///srt = 0 + uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch, *cc = NULL, cwn = 0, cws, cwe, os, oe, rs, re, cw0, cw1/**, dbgn = 0**/; overlap_region *r, t; ///srt = 0 clear_overlap_region_alloc(ol); for (l = 0, k = 1, m = 0, lch = 0; k <= cn; k++) { @@ -1945,9 +1945,9 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 // fprintf(stderr, "[M::%s::] rn::%lu\tmax_n_chain::%lu\n", __func__, ol->length, max_n_chain); // for (k = 0; k < ol->length; k++) { - // fprintf(stderr, "---[M::%s::%.*s(qid::%u)] q[%d, %d), t[%d, %d), khit_off::%u\n", __func__, + // fprintf(stderr, "---[M::%s::%.*s(qid::%u)] q[%d, %d), t[%d, %d), sc::%d, type::%d\n", __func__, // (int32_t)Get_NAME_LENGTH(R_INF, ol->list[k].y_id), Get_NAME(R_INF, ol->list[k].y_id), ol->list[k].y_id, - // ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].non_homopolymer_errors); + // ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].shared_seed, ha_ov_type(&(ol->list[k]), rl)); // } k = ol->length; @@ -1961,14 +1961,43 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 ++n[w]; if (((uint64_t)n[w]) == max_n_chain) s[w] = r->shared_seed; } + // fprintf(stderr, "[M::%s::] s[0]::%d, s[1]::%d, s[2]::%d, s[3]::%d\n", __func__, s[0], s[1], s[2], s[3]); if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { - // n[0] = n[1] = n[2] = n[3] = 0; + if((((uint64_t)n[3]) >= max_n_chain) && (rl >= ocv_w)) { + cwn = (rl/ocv_w) + ((rl % ocv_w)?(1):(0)); + kv_resize(uint64_t, (*sp), (cwn)); + cc = sp->a; + for (i = cws = cwe = 0; i < cwn; i++) { + cwe = cws + ocv_w; if(cwe > rl) cwe = rl; + assert(cwe > cws); + cc[i] = (cwe - cws)*(max_n_chain>>1); + // fprintf(stderr, "[M::%s::] cw::[%lu, %lu), cc::%lu\n", __func__, cws, cwe, cc[i]); + if(cc[i] > UINT32_MAX) cc[i] = UINT32_MAX; cc[i] <<= 32; + cws += ocv_w; + } + } for (i = 0, k = 0, lch = 0; i < ol->length; ++i) { r = &(ol->list[i]); w = ha_ov_type(r, rl); // ++n[w]; // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { if (r->shared_seed >= s[w]) { + if(cwn) { + m = (ol->list[i].x_pos_s/ocv_w); + rs = ol->list[i].x_pos_s; re = ol->list[i].x_pos_e + 1; + for (cws = m*ocv_w; m < cwn; m++) { + cwe = cws + ocv_w; if(cwe > rl) cwe = rl; + os = ((rs >= cws)? rs : cws); + oe = ((re <= cwe)? re : cwe); + if(oe <= os) break; + if(((uint32_t)cc[m]) + (oe - os) < UINT32_MAX) { + cc[m] += (oe - os); + } else { + cc[m] >>= 32; cc[m] <<= 32; cc[m] |= UINT32_MAX; + } + cws += ocv_w; + } + } if (k != i) { t = ol->list[k]; ol->list[k] = ol->list[i]; @@ -1976,6 +2005,52 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 } if(ol->list[k].align_length < chain_cutoff) lch = 1; ++k; + } else if(w == 3 && cwn > 0) { + m = (ol->list[i].x_pos_s/ocv_w); cw0 = cw1 = 0; + rs = ol->list[i].x_pos_s; re = ol->list[i].x_pos_e + 1; + for (cws = m*ocv_w; m < cwn; m++) { + cwe = cws + ocv_w; if(cwe > rl) cwe = rl; + os = ((rs >= cws)? rs : cws); + oe = ((re <= cwe)? re : cwe); + if(oe <= os) break; + // fprintf(stderr, "+++[M::%s::%.*s(qid::%u)] o[%lu, %lu), cur::%lu, max::%lu\n", __func__, + // (int32_t)Get_NAME_LENGTH(R_INF, ol->list[i].y_id), Get_NAME(R_INF, ol->list[i].y_id), ol->list[i].y_id, + // os, oe, ((uint64_t)((uint32_t)cc[m])), (cc[m]>>32)); + if((oe - os) + ((uint64_t)((uint32_t)cc[m])) >= (cc[m]>>32)) { + cw1 += (oe - os); + } else { + cw0 += (oe - os); + } + cws += ocv_w; + } + + if(cw0 >= ((cw0 + cw1)*0.7)) { + m = (ol->list[i].x_pos_s/ocv_w); + rs = ol->list[i].x_pos_s; re = ol->list[i].x_pos_e + 1; + for (cws = m*ocv_w; m < cwn; m++) { + cwe = cws + ocv_w; if(cwe > rl) cwe = rl; + os = ((rs >= cws)? rs : cws); + oe = ((re <= cwe)? re : cwe); + if(oe <= os) break; + if(((uint32_t)cc[m]) + (oe - os) < UINT32_MAX) { + cc[m] += (oe - os); + } else { + cc[m] >>= 32; cc[m] <<= 32; cc[m] |= UINT32_MAX; + } + cws += ocv_w; + } + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + if(ol->list[k].align_length < chain_cutoff) lch = 1; + ++k; + // dbgn++; + // fprintf(stderr, "+++[M::%s::%.*s(qid::%u)] q[%d, %d), t[%d, %d), sc::%d, type::%d, cw0::%lu, cw1::%lu\n", __func__, + // (int32_t)Get_NAME_LENGTH(R_INF, ol->list[k].y_id), Get_NAME(R_INF, ol->list[k].y_id), ol->list[k].y_id, + // ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].shared_seed, ha_ov_type(&(ol->list[k]), rl), cw0, cw1); + } } } ol->length = k; @@ -2017,11 +2092,11 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 } l++; } - // fprintf(stderr, "+[M::%s] rid::%u, ol->length0::%lu, ol->length1::%lu\n", __func__, rid, ol->length, l); ol->length = l; } for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; + // fprintf(stderr, "+[M::%s] rid::%u, ol->length0::%lu, dbgn::%lu\n", __func__, rid, ol->length, dbgn); } void lchain_qgen_mcopy_fast_re1(Candidates_list* cl, uint32_t cl_beg, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, uint64_t tl, @@ -2225,7 +2300,7 @@ void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t } void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, uint64_t ocv_w) { extern void *ha_flt_tab; extern ha_pt_t *ha_idx; @@ -2236,11 +2311,11 @@ void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); ///no need to sort here, overlap_list has been sorted at lchain_gen - lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, mcopy_num, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); + lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, mcopy_num, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp, ocv_w); } void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, uint64_t ocv_w) { extern void *ha_flt_tab; extern ha_pt_t *ha_idx; @@ -2251,7 +2326,7 @@ void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_ // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); ///no need to sort here, overlap_list has been sorted at lchain_gen - lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); + lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp, ocv_w); } uint64_t recalu_minimizer0(char *s, uint64_t len, uint64_t is_hpc, int64_t mz_k, uint64_t mz_h, tiny_queue_t *tq, uint64_t *rpos, uint64_t *rspan) diff --git a/ecovlp.cpp b/ecovlp.cpp index 07e2f6f..ba112f1 100644 --- a/ecovlp.cpp +++ b/ecovlp.cpp @@ -13,6 +13,7 @@ #define CNS_DEL_V (0x1fffffffu) #define del_cns_nn(z, nn_i) ((z).a[(nn_i)].sc == CNS_DEL_V) #define REFRESH_N 128 +#define COV_W 3072 KDQ_INIT(uint32_t) @@ -81,9 +82,9 @@ typedef struct {size_t n, m; char *a; UC_Read z; asg8_v q;} sl_v; void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, uint64_t ocv_w); void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, uint64_t ocv_w); void h_ec_lchain_re_gen(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, UC_Read *tu, asg64_v *oidx, asg16_v *scc); @@ -3226,7 +3227,7 @@ static void worker_hap_ec(void *data, long i, int tid) recover_UC_Read(&b->self_read, &R_INF, i); qlen = b->self_read.length; - h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, ((asm_opt.is_ont)?(0.05):(0.02)), asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32);///ONT high error + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, ((asm_opt.is_ont)?(0.05):(0.02)), asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32, COV_W);///ONT high error // b->num_read_base += b->olist.length; b->cnt[0] += b->self_read.length; @@ -3817,7 +3818,7 @@ static void worker_hap_dc_ec_gen_new_idx(void *data, long i, int tid) recover_UC_Read(&b->self_read, &R_INF, i); qlen = b->self_read.length; - h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, /**0.02**/0.001, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32); + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, /**0.02**/0.001, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32, COV_W); overlap_region_sort_y_id(b->olist.list, b->olist.length);