Skip to content

Commit

Permalink
update for low coverage data
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Dec 16, 2024
1 parent 4889f1c commit 3067771
Show file tree
Hide file tree
Showing 8 changed files with 1,073 additions and 154 deletions.
2 changes: 1 addition & 1 deletion CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.23.0-r691"
#define HA_VERSION "0.24.0-r702"

#define VERBOSE 0

Expand Down
2 changes: 2 additions & 0 deletions Correct.h
Original file line number Diff line number Diff line change
Expand Up @@ -1415,4 +1415,6 @@ void get_wqual(uint64_t zid, uint64_t zpos, uint64_t zrev, asg8_v *v, uint8_t *v
#define HPC_RR 4
#define HPC_CC 2

// #define FORCE_CUT 1

#endif
937 changes: 804 additions & 133 deletions Overlaps.cpp

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions Overlaps.h
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ static inline int count_out_without_del(const asg_t *g, uint32_t v)

void build_string_graph_without_clean(
int min_dp, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources,
long long n_read, uint64_t* readLen, long long mini_overlap_length,
uint64_t n_read, uint64_t* readLen, long long mini_overlap_length,
long long max_hang_length, long long clean_round, long long gap_fuzz,
float min_ovlp_drop_ratio, float max_ovlp_drop_ratio, char* output_file_name,
long long bubble_dist, int read_graph, int write);
Expand Down Expand Up @@ -927,7 +927,7 @@ ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, R_to_U* ruIndex, int m
void rescue_bubble_by_chain(asg_t *sg, ma_sub_t *coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources,
long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex,
float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, uint32_t chainLenThres, long long gap_fuzz,
bub_label_t* b_mask_t, long long no_trio_recover);
bub_label_t* b_mask_t, long long no_trio_recover, uint8_t *cmk);

typedef struct{
double weight;
Expand Down Expand Up @@ -1242,6 +1242,9 @@ uint64_t infer_mmhap_copy(ma_ug_t *ug, asg_t *sg, ma_hit_t_alloc *src, uint8_t *
uint64_t trans_sec_cut0(kv_u_trans_t *ta, asg64_v *srt, uint32_t id, double sec_rate, uint64_t bd, ma_ug_t *ug);
void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src, ug_rid_cov_t *in);
void gen_ug_rid_cov_t_by_ovlp(kv_u_trans_t *ta, ug_rid_cov_t *cc);
void rescue_chimeric_reads_aggressive(ma_ug_t *i_ug, asg_t *rg, ma_hit_t_alloc* sources, ma_sub_t *coverage_cut,
R_to_U* ruIndex, int max_hang, int min_ovlp, uint32_t chainLenThres, uint32_t is_bubble_check, uint32_t is_primary_check, kvec_asg_arc_t_warp* new_rtg_edges,
kvec_t_u32_warp* new_rtg_nodes, bub_label_t* b_mask_t, uint8_t *cmk);

#define UC_Read_resize(v, s) do {\
if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\
Expand Down
153 changes: 143 additions & 10 deletions ecovlp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ typedef struct {
typedef struct {
ec_ovec_buf_t0 *a;
uint32_t n, rev;
uint8_t *cr;
} ec_ovec_buf_t;

ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n);
Expand Down Expand Up @@ -180,7 +181,7 @@ void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p)
destroy_cns_gfa(&(z->cns));

}
free(p->a); free(p);
free(p->a); free(p->cr); free(p);

// fprintf(stderr, "[M::%s-chains] #->%lld\n", __func__, asm_opt.num_bases);
// fprintf(stderr, "[M::%s-passed-chains-0] #->%lld\n", __func__, asm_opt.num_corrected_bases);
Expand Down Expand Up @@ -3931,10 +3932,11 @@ uint32_t is_chemical_r(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t co
}


uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cov, int64_t cut_len, double dup_rate)
uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cov, int64_t cut_len, double dup_rate, uint64_t is_del)
{
uint64_t k, s, e; int64_t dp, old_dp, st = 0, ed, s0, e0, rr, lt;
for (k = idx->n = 0; k < ov->length; k++) {
if(is_del && ov->buffer[k].del) continue;
s0 = (uint32_t)ov->buffer[k].qns; e0 = ov->buffer[k].qe;
if(s0 > 0) s0 += cut_len;
if(e0 < len) e0 -= cut_len;
Expand Down Expand Up @@ -3988,6 +3990,64 @@ uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_
return 0;
}

/**
 * Return the smallest overlap depth found on any non-empty stretch of a read.
 *
 * Every usable overlap in `ov` contributes a query interval. The interval is
 * shrunk by `cut_len` on each side that does not already touch the read
 * boundary (start > 0, end < len), and dropped if nothing is left. Interval
 * endpoints are stored in `idx` as sortable keys: (coordinate << 1) for a
 * start and (coordinate << 1) | 1 for an end. A linear sweep over the sorted
 * keys then tracks the running depth and records its minimum.
 *
 * @param ov        overlap list of the read being examined
 * @param idx       scratch vector; its previous content is discarded
 * @param len       length of the read
 * @param cut_len   amount trimmed from interior interval ends
 * @param dup_rate  tolerance (as a fraction of read length) used by the
 *                  near-identical `rev` overlap filter below
 * @param is_del    when non-zero, overlaps whose `del` flag is set are ignored
 * @return the minimum depth over all positive-length segments of [0, len);
 *         INT64_MAX if no such segment exists
 */
int64_t cal_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cut_len, double dup_rate, uint64_t is_del)
{
    uint64_t i;
    int64_t q_beg, q_end, t_len, len_gap, q_rest, t_rest;
    int64_t depth = 0, prev_depth, prev_pos = 0, pos, lowest = INT64_MAX;

    idx->n = 0;
    for (i = 0; i < ov->length; i++) {
        if (is_del && ov->buffer[i].del) continue;

        q_beg = (uint32_t)ov->buffer[i].qns;
        q_end = ov->buffer[i].qe;
        if (q_beg > 0) q_beg += cut_len;
        if (q_end < len) q_end -= cut_len;
        if (q_end <= q_beg) continue;

        t_len = Get_READ_LENGTH((R_INF), ov->buffer[i].tn);
        len_gap = (t_len >= len) ? (t_len - len) : (len - t_len);
        /* Skip `rev` overlaps between two reads of nearly equal length in
         * which almost all of both reads is aligned.
         * NOTE(review): presumably these are duplicate/foldback-like reads
         * that should not count as supporting coverage -- confirm. */
        if ((len_gap <= (len*dup_rate)) && (len_gap <= (t_len*dup_rate)) && (ov->buffer[i].rev)) {
            q_rest = (ov->buffer[i].qe) - ((uint32_t)ov->buffer[i].qns);
            q_rest = len - q_rest;
            t_rest = ov->buffer[i].te - ov->buffer[i].ts;
            t_rest = t_len - t_rest;
            if ((q_rest <= (len*dup_rate)) && (t_rest <= (t_len*dup_rate))) continue;
        }

        kv_push(uint64_t, (*idx), (((uint64_t)q_beg)<<1));
        kv_push(uint64_t, (*idx), ((((uint64_t)q_end)<<1)|1));
    }

    /* The sweep below assumes the keys come back in ascending order. */
    radix_sort_ec64(idx->a, idx->a + idx->n);

    for (i = 0; i < idx->n; ++i) {
        prev_depth = depth;
        /* Low bit set marks an interval end, clear marks a start. */
        depth += (idx->a[i]&1) ? -1 : 1;
        pos = idx->a[i]>>1;
        /* [prev_pos, pos) was covered at the depth held before this event. */
        if ((pos > prev_pos) && (prev_depth <= lowest)) lowest = prev_depth;
        prev_pos = pos;
    }

    /* Trailing stretch from the last event up to the end of the read. */
    if ((len > prev_pos) && (depth <= lowest)) lowest = depth;

    return lowest;
}

void prt_dbg_rid_paf(ma_hit_t_alloc *ov, UC_Read *ra, asg8_v *qa)
{
if(!(ov->length)) return;
Expand Down Expand Up @@ -4051,7 +4111,7 @@ static void worker_hap_dc_ec_chemical_r(void *data, long i, int tid)
if(b->cnt[1] == 0) {
// if(i == 6204620) prt_dbg_rid_paf(&(R_INF.paf[i]), &(b->self_read), &(b->v8q));
// if(is_chemical_r(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), 3, 16)) {
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02)) {
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02, 0)) {
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
R_INF.paf[i].length = 0; b->cnt[0]++;
}
Expand All @@ -4075,11 +4135,56 @@ static void worker_hap_dc_ec_chemical_r(void *data, long i, int tid)
static void worker_hap_dc_ec_chemical_arc(void *data, long i, int tid)
{
ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]);
ma_hit_t_alloc *paf = &(R_INF.paf[i]); uint64_t k;
ma_hit_t_alloc *paf = &(R_INF.paf[i]), *rev; uint64_t k, z;

if(b->cnt[1] == 0) {
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02, 1)) {
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
}
} else if(b->cnt[1] == 1) {
for (k = 0; k < paf->length; k++) {
if((Get_qn(paf->buffer[k])) > (Get_tn(paf->buffer[k]))) continue;
rev = &(R_INF.paf[paf->buffer[k].tn]);
for (z = 0; z < rev->length; z++) {
if((rev->buffer[z].tn == (Get_qn(paf->buffer[k])))) {
if(paf->buffer[k].del != rev->buffer[z].del) {
paf->buffer[k].del = rev->buffer[z].del = 1;
}
}
}
}
}

if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02)) {
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
refresh_ec_ovec_buf_t0(b, REFRESH_N);
}

static void worker_hap_dc_ec_chemical_arc_mark(void *data, long i, int tid)
{
ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]);
ma_hit_t_alloc *paf = &(R_INF.paf[i]), *rev; uint64_t k, z; int64_t cov, msk_cut = asm_opt.chemical_cov;
uint8_t *msk = ((ec_ovec_buf_t*)data)->cr;

if(b->cnt[1] == 0) {
msk[i] = (uint8_t)-1;
cov = cal_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_flank, 0.02, 1);
if(cov <= msk_cut) msk[i] = cov;
if(cov <= msk_cut/**FORCE_CUT**/) {
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
}
} else if(b->cnt[1] == 1) {
for (k = 0; k < paf->length; k++) {
if((Get_qn(paf->buffer[k])) > (Get_tn(paf->buffer[k]))) continue;
rev = &(R_INF.paf[paf->buffer[k].tn]);
for (z = 0; z < rev->length; z++) {
if((rev->buffer[z].tn == (Get_qn(paf->buffer[k])))) {
if((paf->buffer[k].del != rev->buffer[z].del) || (msk[Get_qn(paf->buffer[k])] <= msk_cut/**FORCE_CUT**/) || (msk[Get_tn(paf->buffer[k])] <= msk_cut/**FORCE_CUT**/)) {
paf->buffer[k].del = rev->buffer[z].del = 1;
}
}
}
}
}

refresh_ec_ovec_buf_t0(b, REFRESH_N);
Expand Down Expand Up @@ -6066,7 +6171,7 @@ void handle_chemical_r(uint64_t n_thre, uint64_t n_a)

kt_for(n_thre, worker_hap_dc_ec_chemical_r, b, n_a);

fprintf(stderr, "[M::%s] # chemical reads: %lu, # arcs:: %lu\n", __func__, chem_n, dedup);
fprintf(stderr, "[M::%s] # chimeric reads: %lu, # arcs:: %lu\n", __func__, chem_n, dedup);

destroy_ec_ovec_buf_t(b);
}
Expand All @@ -6076,16 +6181,44 @@ void handle_chemical_arc(uint64_t n_thre, uint64_t n_a)
ec_ovec_buf_t *b = NULL; uint64_t k, chem_n = 0;
b = gen_ec_ovec_buf_t(n_thre);
for (k = 0; k < n_thre; ++k) {
b->a[k].cnt[0] = 0;
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 0;
}

kt_for(n_thre, worker_hap_dc_ec_chemical_arc, b, n_a);

for (k = 0; k < n_thre; ++k) {
chem_n += b->a[k].cnt[0];
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 1;
}

fprintf(stderr, "[M::%s] # chemical reads: %lu\n", __func__, chem_n);
kt_for(n_thre, worker_hap_dc_ec_chemical_arc, b, n_a);

fprintf(stderr, "[M::%s] # chimeric reads: %lu\n", __func__, chem_n);

destroy_ec_ovec_buf_t(b);
}

/**
 * Run the two-pass worker_hap_dc_ec_chemical_arc_mark over all reads and
 * return the per-read byte mask it fills in.
 *
 * Pass 0 (cnt[1] == 0) and pass 1 (cnt[1] == 1) are both dispatched through
 * kt_for; the per-thread cnt[0] counters of pass 0 are summed and reported on
 * stderr after the second pass.
 *
 * @param n_thre  number of worker threads / per-thread buffers
 * @param n_a     number of reads; also the length of the returned mask
 * @return the mask array of n_a bytes allocated here. It is detached from the
 *         worker buffer before that buffer is destroyed, so the caller owns
 *         it and is responsible for releasing it.
 */
uint8_t* gen_chemical_arc_rf(uint64_t n_thre, uint64_t n_a)
{
    uint64_t t, n_marked = 0;
    uint8_t *mask = NULL;
    ec_ovec_buf_t *buf = gen_ec_ovec_buf_t(n_thre);

    /* Select pass 0 and clear the per-thread hit counters. */
    for (t = 0; t < n_thre; ++t) {
        buf->a[t].cnt[0] = 0;
        buf->a[t].cnt[1] = 0;
    }

    /* Left uninitialised here: the pass-0 worker writes msk[i] for each i. */
    MALLOC(mask, n_a);
    buf->cr = mask;

    kt_for(n_thre, worker_hap_dc_ec_chemical_arc_mark, buf, n_a);

    /* Collect the pass-0 counts, then switch every thread buffer to pass 1. */
    for (t = 0; t < n_thre; ++t) {
        n_marked += buf->a[t].cnt[0];
        buf->a[t].cnt[0] = 0;
        buf->a[t].cnt[1] = 1;
    }

    kt_for(n_thre, worker_hap_dc_ec_chemical_arc_mark, buf, n_a);

    fprintf(stderr, "[M::%s] # chimeric reads: %lu\n", __func__, n_marked);

    /* destroy_ec_ovec_buf_t frees ->cr, so detach the mask we are returning. */
    buf->cr = NULL;
    destroy_ec_ovec_buf_t(buf);
    return mask;
}
1 change: 1 addition & 0 deletions ecovlp.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ void sl_ec_r(uint64_t n_thre, uint64_t n_a);
void cal_ov_r(uint64_t n_thre, uint64_t n_a, uint64_t new_idx);
void handle_chemical_r(uint64_t n_thre, uint64_t n_a);
void handle_chemical_arc(uint64_t n_thre, uint64_t n_a);
uint8_t* gen_chemical_arc_rf(uint64_t n_thre, uint64_t n_a);

#endif
Loading

0 comments on commit 3067771

Please sign in to comment.