1 Star 0 Fork 0

Velcon-Zheng/wtdbg2

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
wtpoa.h 32.94 KB
一键复制 编辑 原始数据 按行查看 历史
ruanjue 提交于 2019-09-01 16:23 . optimization of SAMBlock
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164
/*
*
* Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __WTDBG_POA_CNS_RJ_H
#define __WTDBG_POA_CNS_RJ_H
#include "tripoa.h"
#include "kswx.h"
#include "filereader.h"
typedef struct {
char *reftag;
u4i reflen, refoff;
u4i node1, node2;
} lay_blk_t;
typedef struct {
u4i chridx, bidx; // block idx
u4i rdidx, rdoff:31, rddir:1;
char *rdtag;
BaseBank *seq;
u4i rbeg, rend;
} lay_seq_t;
static inline void _init_lay_seq_t(lay_seq_t *sq){
ZEROS(sq);
sq->seq = init_basebank();
}
static inline void _free_lay_seq_t(lay_seq_t *sq){
free_basebank(sq->seq);
}
define_recycle_list(layseqr, lay_seq_t, u4i, _init_lay_seq_t(a), _free_lay_seq_t(a));
typedef lay_seq_t* (*iter_cns_block)(void *obj);
typedef void (*info_cns_block)(void *obj, lay_seq_t *sq, lay_blk_t *bk);
typedef struct {
u4i cidx, eidx;
u4i node1, node2, soff, slen;
int beg, end;
} edge_cns_t;
define_list(edgecnsv, edge_cns_t);
typedef struct {
u4i cidx;
edgecnsv *rs;
String *tag;
BaseBank *seq, *cns;
u4i cnt;
} ctg_cns_t;
define_list(ctgcnsv, ctg_cns_t*);
static inline ctg_cns_t* init_ctg(u4i cidx){
ctg_cns_t *ctg;
ctg = malloc(sizeof(ctg_cns_t));
ctg->cidx = cidx;
ctg->rs = init_edgecnsv(2);
ctg->tag = init_string(32);
ctg->seq = init_basebank(2048);
ctg->cns = init_basebank(2048);
ctg->cnt = 0;
return ctg;
}
static inline void clear_ctg(ctg_cns_t *ctg){
ctg->cidx = MAX_U4;
ctg->cnt = 0;
clear_edgecnsv(ctg->rs);
clear_string(ctg->tag);
clear_basebank(ctg->seq);
clear_basebank(ctg->cns);
}
static inline void free_ctg(ctg_cns_t *ctg){
free_edgecnsv(ctg->rs);
free_string(ctg->tag);
free_basebank(ctg->seq);
free_basebank(ctg->cns);
free(ctg);
}
typedef struct {
u4i cidx, eidx;
int type; // 0, none; 1, edge cns; 2, edges aligning; 3, merging into cns
} cns_evt_t;
define_list(cnsevtv, cns_evt_t);
struct CTGCNS;
thread_beg_def(mcns);
struct CTGCNS *cc;
TriPOG *g;
cns_evt_t task;
edge_cns_t edges[2];
u1v *seq1, *seq2;
thread_end_def(mcns);
typedef struct CTGCNS {
void *obj;
iter_cns_block itercns;
info_cns_block infocns;
lay_seq_t *sq;
lay_blk_t BK;
u4i ncpu;
ctgcnsv *ctgs, *cycs;
u4i nctg;
cnsevtv *evts;
u8i ridx, bases;
u4i chridx, bidx;
int state;
u4i seqmax, inpmax;
int winlen, winmin, fail_skip;
int M, X, I, D, E, reglen;
u8i tri_rets[2];
u4i print_progress;
thread_def_shared_vars(mcns);
} CTGCNS;
static inline int revise_joint_point(u32list *cigars, int *qe, int *te){
u4i i, op, ln, max;
int qq, q, tt, t;
q = t = 0;
qq = tt = 0;
max = 0;
for(i=1;i<=cigars->size;i++){
op = cigars->buffer[cigars->size - i] & 0xF;
ln = cigars->buffer[cigars->size - i] >> 4;
if(op == 0){
if(ln > max){
qq = q; tt = t;
max = ln;
}
q += ln;
t += ln;
} else if(op == 1){
q += ln;
} else {
t += ln;
}
}
*qe -= qq;
*te -= tt;
return 1;
}
thread_beg_func(mcns);
struct CTGCNS *cc;
TriPOG *g;
ctg_cns_t *ctg;
edge_cns_t *edge;
u1v *seq1, *seq2;
kswx_t XX, *xs[2];
u8list *mem_cache;
u32list *cigars[2];
u4i i;
int qb, qe, tb, te, b, e, ol;
cc = mcns->cc;
g = mcns->g;
seq1 = mcns->seq1;
seq2 = mcns->seq2;
mem_cache = init_u8list(1024);
cigars[0] = init_u32list(64);
cigars[1] = NULL;
xs[0] = &XX;
xs[1] = NULL;
thread_beg_loop(mcns);
if(mcns->task.type == 1){
if(g->seqs->nseq){
end_tripog(g);
}
} else if(mcns->task.type == 2){
qb = 0; qe = seq1->size;
tb = 0; te = seq2->size;
if(qe > cc->reglen) qb = qe - cc->reglen;
if(te > cc->reglen) te = cc->reglen;
ol = num_min(qe -qb, te - tb);
kswx_overlap_align_core(xs, cigars, qe - qb, seq1->buffer + qb, te - tb, seq2->buffer + tb, 1, cc->M, cc->X, (cc->I + cc->D) / 2, (cc->I + cc->D) / 2, cc->E, mem_cache);
if(XX.aln < Int(0.5 * cc->reglen) || XX.mat < Int(XX.aln * 0.9)){
// full length alignment
int maxl;
maxl = num_min(seq1->size, seq2->size);
maxl = num_min(maxl, cc->reglen * 4);
qb = 0; qe = seq1->size;
tb = 0; te = seq2->size;
if(qe > maxl) qb = qe - maxl;
if(te > maxl) te = maxl;
ol = num_min(qe -qb, te - tb);
kswx_overlap_align_core(xs, cigars, qe - qb, seq1->buffer + qb, te - tb, seq2->buffer + tb, 1, cc->M, cc->X, (cc->I + cc->D) / 2, (cc->I + cc->D) / 2, cc->E, mem_cache);
}
XX.qb += qb;
XX.qe += qb;
XX.tb += tb;
XX.te += tb;
b = XX.qe;
e = XX.te;
revise_joint_point(cigars[0], &b, &e);
mcns->edges[0].end = b;
mcns->edges[1].beg = e;
if(cns_debug){
thread_beg_syn(mcns);
fprintf(stderr, "JOINT\tC%u\tE%u\tqe = %d -> %d\tte = %d -> %d\t[%5d,%5d]\t[%5d,%5d,%d,%d,%d]\n", mcns->edges[0].cidx, mcns->edges[0].eidx, XX.qe, b, XX.te, e, (int)seq1->size, (int)seq2->size, XX.aln, XX.mat, XX.mis, XX.ins, XX.del);
fflush(stderr);
thread_end_syn(mcns);
}
} else if(mcns->task.type == 3){
ctg = get_ctgcnsv(cc->ctgs, mcns->task.cidx);
clear_basebank(ctg->cns);
for(i=0;i<ctg->rs->size;i++){
edge = ref_edgecnsv(ctg->rs, i);
if(edge->end > edge->beg){
fast_fwdbits2basebank(ctg->cns, ctg->seq->bits, edge->soff + edge->beg, edge->end - edge->beg);
} else if(Int(ctg->cns->size) + edge->end > edge->beg){
ctg->cns->size = ctg->cns->size + edge->end - edge->beg;
normalize_basebank(ctg->cns);
} else {
clear_basebank(ctg->cns);
}
}
}
thread_end_loop(mcns);
free_u8list(mem_cache);
free_u32list(cigars[0]);
thread_end_func(mcns);
static inline CTGCNS* init_ctgcns(void *obj, iter_cns_block itercns, info_cns_block infocns, u4i ncpu, int shuffle_rds, u4i seqmax, int winlen, int winmin, int fail_skip, int reglen, POGPar *par){
CTGCNS *cc;
thread_prepare(mcns);
cc = malloc(sizeof(CTGCNS));
cc->obj = obj;
cc->itercns = itercns;
cc->infocns = infocns;
cc->sq = NULL;
ZEROS(&cc->BK);
cc->ncpu = ncpu;
cc->ctgs = init_ctgcnsv(8);
cc->cycs = init_ctgcnsv(8);
cc->evts = init_cnsevtv(64);
cc->nctg = 0;
cc->ridx = 0;
cc->bases = 0;
cc->state = 1;
cc->seqmax = seqmax;
cc->inpmax = seqmax * 5;
cc->winlen = winlen;
cc->winmin = winmin;
cc->fail_skip = fail_skip;
cc->M = par->M;
cc->X = par->X;
cc->I = par->I;
cc->D = par->D;
cc->E = par->E;
cc->reglen = reglen;
thread_beg_init(mcns, ncpu);
mcns->cc = cc;
mcns->g = init_tripog(seqmax, shuffle_rds, winlen, winmin, fail_skip, par);
ZEROS(&mcns->task);
mcns->seq1 = init_u1v(1024);
mcns->seq2 = init_u1v(1024);
ZEROS(&(mcns->edges[0]));
ZEROS(&(mcns->edges[1]));
mcns->edges[0].eidx = MAX_U4;
mcns->edges[1].eidx = MAX_U4;
thread_end_init(mcns);
cc->tri_rets[0] = 0;
cc->tri_rets[1] = 0;
cc->print_progress = 0;
cc->chridx = MAX_U4;
cc->bidx = MAX_U4;
thread_export(mcns, cc);
return cc;
}
static inline void reset_ctgcns(CTGCNS *cc, void *obj, iter_cns_block itercns, info_cns_block infocns){
ctg_cns_t *ctg;
u4i i;
cc->obj = obj;
cc->itercns = itercns;
cc->infocns = infocns;
cc->sq = NULL;
ZEROS(&cc->BK);
cc->ridx = 0;
cc->state = 1;
cc->chridx = MAX_U4;
cc->bidx = MAX_U4;
for(i=0;i<cc->ctgs->size;i++){
ctg = get_ctgcnsv(cc->ctgs, i);
if(ctg == NULL) continue;
free_ctg(ctg);
}
clear_ctgcnsv(cc->ctgs);
cc->nctg = 0;
clear_cnsevtv(cc->evts);
cc->bases = 0;
}
static inline void free_ctgcns(CTGCNS *cc){
ctg_cns_t *ctg;
u4i i;
thread_prepare(mcns);
thread_import(mcns, cc);
thread_beg_close(mcns);
free_tripog(mcns->g);
free_u1v(mcns->seq1);
free_u1v(mcns->seq2);
thread_end_close(mcns);
for(i=0;i<cc->ctgs->size;i++){
ctg = get_ctgcnsv(cc->ctgs, i);
if(ctg == NULL) continue;
free_ctg(ctg);
}
free_ctgcnsv(cc->ctgs);
for(i=0;i<cc->cycs->size;i++){
ctg = get_ctgcnsv(cc->cycs, i);
if(ctg == NULL) continue;
free_ctg(ctg);
}
free_ctgcnsv(cc->cycs);
free_cnsevtv(cc->evts);
free(cc);
}
static inline void print_lays_ctgcns(CTGCNS *cc, FILE *out){
lay_blk_t *bk;
bk = &cc->BK;
while((cc->sq = cc->itercns(cc->obj))){
if(cc->sq->chridx != cc->chridx){
cc->chridx = cc->sq->chridx;
cc->bidx = MAX_U4;
cc->infocns(cc->obj, cc->sq, bk);
fflush(out);
fprintf(out, ">%s len=%u\n", bk->reftag, bk->reflen);
}
if(cc->sq->bidx != cc->bidx){
cc->bidx = cc->sq->bidx;
cc->infocns(cc->obj, cc->sq, bk);
fprintf(out, "E\t%u\tN%u\t+\tN%u\t+\n", bk->refoff, bk->node1, bk->node2);
}
{
if(cc->sq->rdtag){
fprintf(out, "S\t%s\t%c\t%u\t%u\t", cc->sq->rdtag, "+-"[cc->sq->rddir], cc->sq->rdoff, (u4i)cc->sq->seq->size);
} else {
fprintf(out, "S\tR%llu\t%c\t%u\t%u\t", (u8i)cc->sq->rdidx, "+-"[cc->sq->rddir], cc->sq->rdoff, (u4i)cc->sq->seq->size);
}
print_seq_basebank(cc->sq->seq, 0, cc->sq->seq->size, out);
fprintf(out, "\t%u\t%u\n", cc->sq->rbeg, cc->sq->rend);
}
}
}
static inline void repay_ctgcns(CTGCNS *cc, ctg_cns_t *ctg){
clear_ctg(ctg);
push_ctgcnsv(cc->cycs, ctg);
}
static inline ctg_cns_t* iter_ctgcns(CTGCNS *cc){
ctg_cns_t *ctg, *ret;
edge_cns_t *edge;
cns_evt_t EVT;
lay_blk_t *bk;
u8i eidx;
thread_prepare(mcns);
thread_import(mcns, cc);
bk = &cc->BK;
ret = NULL;
while(1){
cc->sq = cc->itercns(cc->obj);
if(cc->sq){
if(cc->sq->chridx != cc->chridx){
if(cc->cycs->size){
pop_ctgcnsv(cc->cycs, &ctg);
ctg->cidx = cc->ctgs->size;
} else {
ctg = init_ctg(cc->ctgs->size);
}
push_ctgcnsv(cc->ctgs, ctg);
cc->chridx = cc->sq->chridx;
cc->bidx = MAX_U4;
cc->infocns(cc->obj, cc->sq, bk);
if(bk->reftag){
append_string(ctg->tag, bk->reftag, strlen(bk->reftag));
} else {
append_string(ctg->tag, "anonymous", 10);
}
}
if(cc->sq->bidx != cc->bidx){
cc->ridx ++;
if(!cns_debug && cc->print_progress && (cc->ridx % cc->print_progress) == 0){
fprintf(stderr, "\r%u contigs %llu edges %llu bases", (u4i)cc->ctgs->size, cc->ridx, cc->bases); fflush(stderr);
}
} else {
if(mcns->g->seqs->nseq < cc->inpmax){
fwdbitpush_tripog(mcns->g, cc->sq->seq->bits, 0, cc->sq->seq->size, cc->sq->rbeg, cc->sq->rend);
}
continue;
}
}
if(mcns->task.type != 0){
thread_wake(mcns);
}
while(1){
thread_wait_one(mcns);
if(mcns->task.type == 2){
ctg = get_ctgcnsv(cc->ctgs, mcns->task.cidx);
ctg->rs->buffer[mcns->edges[0].eidx].end = mcns->edges[0].end;
ctg->rs->buffer[mcns->edges[1].eidx].beg = mcns->edges[1].beg;
mcns->edges[0].eidx = MAX_U4;
mcns->edges[1].eidx = MAX_U4;
mcns->task.type = 0;
ctg->cnt ++;
if(ctg->cnt + 1 == ctg->rs->size && (ctg->cidx + 1 < cc->ctgs->size || cc->sq == NULL)){
EVT.cidx = ctg->cidx;
EVT.eidx = MAX_U4;
EVT.type = 3;
array_heap_push(cc->evts->buffer, cc->evts->size, cc->evts->cap, cns_evt_t, EVT, num_cmpxx(b.type, a.type, a.cidx, b.cidx, a.eidx, b.eidx));
}
} else if(mcns->task.type == 1){
ctg = get_ctgcnsv(cc->ctgs, mcns->task.cidx);
cc->tri_rets[mcns->g->is_tripog] ++;
if(cns_debug){
thread_beg_syn(mcns);
fprintf(stderr, "%u_%s_N%u_N%u\t%d\t%d\t", mcns->task.eidx, ctg->tag->string, mcns->edges[0].node1, mcns->edges[0].node2, mcns->g->seqs->nseq, (u4i)mcns->g->cns->size);
println_seq_basebank(mcns->g->cns, 0, mcns->g->cns->size, stderr);
thread_end_syn(mcns);
}
mcns->edges[0].soff = ctg->seq->size;
mcns->edges[0].slen = mcns->g->cns->size;
mcns->edges[0].beg = 0;
mcns->edges[0].end = mcns->g->cns->size;
fast_fwdbits2basebank(ctg->seq, mcns->g->cns->bits, 0, mcns->g->cns->size);
eidx = mcns->task.eidx;
ctg->rs->buffer[eidx] = mcns->edges[0];
mcns->edges[0].eidx = MAX_U4;
mcns->task.type = 0;
if(eidx && ctg->rs->buffer[eidx - 1].eidx != MAX_U4){
EVT.cidx = ctg->cidx;
EVT.eidx = eidx - 1;
EVT.type = 2;
array_heap_push(cc->evts->buffer, cc->evts->size, cc->evts->cap, cns_evt_t, EVT, num_cmpxx(b.type, a.type, a.cidx, b.cidx, a.eidx, b.eidx));
}
if(eidx + 1 < ctg->rs->size && ctg->rs->buffer[eidx + 1].eidx != MAX_U4){
EVT.cidx = ctg->cidx;
EVT.eidx = eidx;
EVT.type = 2;
array_heap_push(cc->evts->buffer, cc->evts->size, cc->evts->cap, cns_evt_t, EVT, num_cmpxx(b.type, a.type, a.cidx, b.cidx, a.eidx, b.eidx));
}
if(ctg->rs->size == 1 && (ctg->cidx + 1 < cc->ctgs->size || cc->sq == NULL)){
EVT.cidx = ctg->cidx;
EVT.eidx = MAX_U4;
EVT.type = 3;
array_heap_push(cc->evts->buffer, cc->evts->size, cc->evts->cap, cns_evt_t, EVT, num_cmpxx(b.type, a.type, a.cidx, b.cidx, a.eidx, b.eidx));
}
} else if(mcns->task.type == 3){
ctg = get_ctgcnsv(cc->ctgs, mcns->task.cidx);
cc->bases += ctg->cns->size;
cc->nctg ++;
ret = ctg;
set_ctgcnsv(cc->ctgs, mcns->task.cidx, NULL);
mcns->task.type = 0;
break;
}
if(cc->evts->size){
EVT = cc->evts->buffer[0];
array_heap_remove(cc->evts->buffer, cc->evts->size, cc->evts->cap, cns_evt_t, 0, num_cmpxx(b.type, a.type, a.cidx, b.cidx, a.eidx, b.eidx));
mcns->task = EVT;
if(EVT.type == 2){
ctg = get_ctgcnsv(cc->ctgs, EVT.cidx);
mcns->edges[0] = ctg->rs->buffer[EVT.eidx];
mcns->edges[0].eidx = EVT.eidx;
clear_and_inc_u1v(mcns->seq1, mcns->edges[0].slen);
bitseq_basebank(ctg->seq, mcns->edges[0].soff, mcns->edges[0].slen, mcns->seq1->buffer);
mcns->edges[1] = ctg->rs->buffer[EVT.eidx + 1];
mcns->edges[1].eidx = EVT.eidx + 1;
clear_and_inc_u1v(mcns->seq2, mcns->edges[1].slen);
bitseq_basebank(ctg->seq, mcns->edges[1].soff, mcns->edges[1].slen, mcns->seq2->buffer);
}
thread_wake(mcns);
} else {
break;
}
}
if(cc->sq){
ctg = get_ctgcnsv(cc->ctgs, cc->ctgs->size - 1);
cc->bidx = cc->sq->bidx;
cc->infocns(cc->obj, cc->sq, bk);
mcns->task.cidx = ctg->cidx;
mcns->task.eidx = ctg->rs->size;
mcns->task.type = 1;
mcns->edges[1].eidx = MAX_U4;
edge = next_ref_edgecnsv(ctg->rs);
edge->cidx = ctg->cidx;
edge->eidx = MAX_U4;
edge->node1 = bk->node1;
edge->node2 = bk->node2;
edge->soff = 0;
edge->slen = 0;
edge->beg = 0;
edge->end = 0;
mcns->edges[0] = *edge;
mcns->edges[0].eidx = offset_edgecnsv(ctg->rs, edge);
beg_tripog(mcns->g);
fwdbitpush_tripog(mcns->g, cc->sq->seq->bits, 0, cc->sq->seq->size, cc->sq->rbeg, cc->sq->rend);
}
if(ret){
break;
} else if(cc->sq == NULL){
if(cc->nctg == cc->ctgs->size){ // all finished
if(!cns_debug && cc->print_progress){
fprintf(stderr, "\r%u contigs %llu edges %llu bases\n", (u4i)cc->ctgs->size, cc->ridx, cc->bases); fflush(stderr);
}
break;
}
}
}
thread_export(mcns, cc);
return ret;
}
/*
static inline int iter_ctgcns2(CTGCNS *cc){
edge_cns_t *edge;
lay_blk_t *bk;
u8i i, eidx;
int next, waitall;
thread_prepare(mcns);
thread_import(mcns, cc);
next = 0;
waitall = 0;
bk = &cc->BK;
while(cc->state){
if(cc->state == 1){
if(cc->sq == NULL){
cc->sq = cc->itercns(cc->obj);
}
if(cc->sq == NULL || cc->sq->chridx != cc->chridx){
if(cc->chridx != MAX_U4){
if(mcns->edges[0].idx != MAX_U8){
thread_wake(mcns);
cc->erun ++;
}
cc->state = 2;
next = 4;
waitall = 1;
} else if(cc->sq){
cc->state = 5;
} else {
thread_export(mcns, cc);
return 0;
}
} else if(cc->sq->bidx != cc->bidx){
cc->ridx ++;
if(mcns->edges[0].idx != MAX_U8){
thread_wake(mcns);
cc->erun ++;
}
cc->state = 2;
next = 3;
waitall = 0;
if(!cns_debug && cc->print_progress && (cc->ridx % cc->print_progress) == 0){
fprintf(stderr, "\r%u contigs %llu edges %llu bases", (u4i)cc->ctgs->size, cc->ridx, cc->bases); fflush(stderr);
}
} else {
if(0){
if(cc->sq->rdtag){
fprintf(stdout, "S\t%s\t%c\t%u\t%u\t", cc->sq->rdtag, "+-"[cc->sq->rddir], cc->sq->rdoff, (u4i)cc->sq->seq->size);
} else {
fprintf(stdout, "S\tR%llu\t%c\t%u\t%u\t", (u8i)cc->sq->rdidx, "+-"[cc->sq->rddir], cc->sq->rdoff, (u4i)cc->sq->seq->size);
}
print_seq_basebank(cc->sq->seq, 0, cc->sq->seq->size, stdout);
fprintf(stdout, "\t%u\t%u\n", cc->sq->rbeg, cc->sq->rend);
}
if(mcns->g->seqs->nseq < cc->inpmax){
fwdbitpush_tripog(mcns->g, cc->sq->seq->bits, 0, cc->sq->seq->size, cc->sq->rbeg, cc->sq->rend);
}
cc->sq = NULL;
}
} else if(cc->state == 2){
while(1){
if(waitall){
if(cc->tasks->size == 0){
thread_wait_done(mcns);
} else {
thread_wait_one(mcns);
}
} else {
thread_wait_one(mcns);
}
if(mcns->edges[1].idx != MAX_U8){
cc->erun --;
cc->rs->buffer[mcns->edges[0].idx].end = mcns->edges[0].end;
cc->rs->buffer[mcns->edges[1].idx].beg = mcns->edges[1].beg;
mcns->edges[0].idx = MAX_U8;
mcns->edges[1].idx = MAX_U8;
} else if(mcns->edges[0].idx != MAX_U8){
cc->erun --;
cc->tri_rets[mcns->g->is_tripog] ++;
if(cns_debug){
thread_beg_syn(mcns);
fprintf(stderr, "%llu_%s_N%u_N%u\t%d\t%d\t", mcns->edges[0].idx, cc->tag->string, mcns->edges[0].node1, mcns->edges[0].node2, mcns->g->seqs->nseq, (u4i)mcns->g->cns->size);
println_seq_basebank(mcns->g->cns, 0, mcns->g->cns->size, stderr);
thread_end_syn(mcns);
}
mcns->edges[0].soff = cc->seq->size;
mcns->edges[0].slen = mcns->g->cns->size;
mcns->edges[0].beg = 0;
mcns->edges[0].end = mcns->g->cns->size;
fast_fwdbits2basebank(cc->seq, mcns->g->cns->bits, 0, mcns->g->cns->size);
eidx = mcns->edges[0].idx;
cc->rs->buffer[eidx] = mcns->edges[0];
mcns->edges[0].idx = MAX_U8;
if(eidx && cc->rs->buffer[eidx - 1].idx != MAX_U8){
push_u8v(cc->tasks, eidx - 1);
}
if(eidx + 1 < cc->rs->size && cc->rs->buffer[eidx + 1].idx != MAX_U8){
push_u8v(cc->tasks, eidx);
}
}
if(pop_u8v(cc->tasks, &eidx)){
mcns->edges[0] = cc->rs->buffer[eidx];
mcns->edges[0].idx = eidx;
clear_and_inc_u1v(mcns->seq1, mcns->edges[0].slen);
bitseq_basebank(cc->seq, mcns->edges[0].soff, mcns->edges[0].slen, mcns->seq1->buffer);
mcns->edges[1] = cc->rs->buffer[eidx + 1];
mcns->edges[1].idx = eidx + 1;
clear_and_inc_u1v(mcns->seq2, mcns->edges[1].slen);
bitseq_basebank(cc->seq, mcns->edges[1].soff, mcns->edges[1].slen, mcns->seq2->buffer);
thread_wake(mcns);
cc->erun ++;
} else {
if(waitall){
if(cc->erun == 0){
break;
}
} else {
break;
}
}
}
cc->state = next;
} else if(cc->state == 3){
cc->bidx = cc->sq->bidx;
cc->infocns(cc->obj, cc->sq, bk);
mcns->edges[1].idx = MAX_U8;
edge = next_ref_edgecnsv(cc->rs);
edge->idx = MAX_U8;
edge->node1 = bk->node1;
edge->node2 = bk->node2;
edge->soff = 0;
edge->slen = 0;
edge->beg = 0;
edge->end = 0;
mcns->edges[0] = *edge;
mcns->edges[0].idx = offset_edgecnsv(cc->rs, edge);
beg_tripog(mcns->g);
cc->state = 1;
if(0){
fprintf(stdout, "E\t%u\tN%u\t+\tN%u\t+\n", bk->refoff, bk->node1, bk->node2);
}
} else if(cc->state == 4){
clear_basebank(cc->cns);
for(i=0;i<cc->rs->size;i++){
edge = ref_edgecnsv(cc->rs, i);
if(edge->end > edge->beg){
fast_fwdbits2basebank(cc->cns, cc->seq->bits, edge->soff + edge->beg, edge->end - edge->beg);
} else if(Int(cc->cns->size) + edge->end > edge->beg){
cc->cns->size = cc->cns->size + edge->end - edge->beg;
normalize_basebank(cc->cns);
} else {
clear_basebank(cc->cns);
}
}
cc->bases += cc->cns->size;
if(cc->sq == NULL){
if(!cns_debug && cc->print_progress){
fprintf(stderr, "\r%u contigs %llu edges %llu bases\n", (u4i)cc->ctgs->size, cc->ridx, cc->bases); fflush(stderr);
}
cc->state = 0;
} else {
cc->state = 5;
}
thread_export(mcns, cc);
return 1;
} else if(cc->state == 5){
cc->chridx = cc->sq->chridx;
//cc->cidx ++;
cc->bidx = MAX_U4;
cc->infocns(cc->obj, cc->sq, bk);
clear_string(cc->tag);
if(bk->reftag){
append_string(cc->tag, bk->reftag, strlen(bk->reftag));
} else {
append_string(cc->tag, "anonymous", 10);
}
clear_basebank(cc->seq);
clear_u8v(cc->tasks);
clear_edgecnsv(cc->rs);
cc->state = 1;
if(0){
fflush(stdout);
fprintf(stdout, ">%s len=%u\n", bk->reftag, bk->reflen);
}
}
}
thread_export(mcns, cc);
return 0;
}
*/
typedef struct {
u4i chridx;
u4v *chrs;
SeqBank *refs;
BitVec *vsts;
FileReader *fr;
layseqr *seqs;
u4v *heap;
u8i rdidx;
u4i cidx, bidx, sidx, lidx;
u2i bsize, bstep; // block size, and slide steps
int flags; // if flags & 0x1, only polish reference presented in SAM lines. 0x2: Don't filter secondary/supplementary alignments
u4i bidx2;
} SAMBlock;
static inline SAMBlock* init_samblock(SeqBank *refs, FileReader *fr, u2i bsize, u2i bovlp, int flags){
SAMBlock *sb;
lay_seq_t *stop;
u2i bstep;
bstep = bsize - bovlp;
assert(bstep <= bsize && 2 * bstep >= bsize);
sb = malloc(sizeof(SAMBlock));
sb->chridx = 0;
sb->chrs = init_u4v(32);
push_u4v(sb->chrs, MAX_U4);
sb->refs = refs;
sb->vsts = init_bitvec(sb->refs->nseq);
sb->fr = fr;
sb->seqs = init_layseqr(1024);
sb->heap = init_u4v(1024);
sb->rdidx = 0;
sb->cidx = 1;
sb->bidx = 0;
sb->bidx2 = MAX_U4;
sb->bsize = bsize;
sb->bstep = bstep;
sb->sidx = MAX_U4; // last output lay_seq
sb->lidx = MAX_U4; // last input lay_seq
sb->flags = flags;
stop = pop_layseqr(sb->seqs);
stop->chridx = refs->nseq + 1;
stop->bidx = 0;
return sb;
}
static inline void free_samblock(SAMBlock *sb){
free_u4v(sb->chrs);
free_bitvec(sb->vsts);
free_layseqr(sb->seqs);
free_u4v(sb->heap);
free(sb);
}
static inline lay_seq_t* _push_padding_ref_samblock(SAMBlock *sb, lay_seq_t *sq){
lay_seq_t *st;
u8i off;
u4i sqidx, len, i;
if(sb->cidx > sb->refs->nseq) return sq;
sqidx = offset_layseqr(sb->seqs, sq);
if((sb->flags & 0x1) && sb->bidx2 == MAX_U4){
sb->bidx = sq->bidx;
}
sb->bidx2 = sb->bidx;
while(sb->cidx < sq->chridx || (sb->cidx == sq->chridx && sb->bidx <= sq->bidx)){
st = pop_layseqr(sb->seqs);
st->chridx = sb->cidx;
st->bidx = sb->bidx;
st->rdidx = 0;
st->rddir = 0;
st->rdoff = st->bidx * sb->bstep;
st->rbeg = st->rdoff;
if(sb->chrs->size <= sb->cidx){
for(i=0;i<sb->refs->nseq;i++){
if(get_bitvec(sb->vsts, i) == 0) break;
}
if(i == sb->refs->nseq) break;
push_u4v(sb->chrs, i);
one_bitvec(sb->vsts, i);
}
off = sb->refs->rdoffs->buffer[sb->chrs->buffer[sb->cidx]];
len = sb->refs->rdlens->buffer[sb->chrs->buffer[sb->cidx]];
st->rend = num_min(st->rbeg + sb->bsize, len);
clear_basebank(st->seq);
fast_fwdbits2basebank(st->seq, sb->refs->rdseqs->bits, off + st->rdoff, st->rend - st->rbeg);
array_heap_push(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, offset_layseqr(sb->seqs, st),
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
if(st->rend >= len){
sb->cidx ++;
sb->bidx = 0;
sb->bidx2 = sb->bidx;
if(sb->cidx >= sb->refs->nseq) break;
} else {
sb->bidx ++;
}
sq = ref_layseqr(sb->seqs, sqidx);
}
sq = ref_layseqr(sb->seqs, sqidx);
return sq;
}
static inline lay_seq_t* iter_samblock(void *obj){
SAMBlock *sb;
lay_seq_t *sc, *sl;
u4i chr, chridx, scidx, off, minlen, rddir, rdoff, nxt, val, len, op;
char *ptr, *str;
int c, samflag;
sb = (SAMBlock*)obj;
if(sb->sidx != MAX_U4){
recyc_layseqr(sb->seqs, sb->sidx);
sb->sidx = MAX_U4;
}
do {
if(sb->heap->size == 0){
sb->lidx = MAX_U4;
}
while(sb->lidx == MAX_U4){
c = readtable_filereader(sb->fr);
if(c == -1){
if(sb->flags & 0x1){
} else {
sc = ref_layseqr(sb->seqs, 0);
sc = _push_padding_ref_samblock(sb, sc);
}
break;
}
if(sb->fr->line->string[0] == '@') continue;
if(c < 11) continue;
samflag = atoi(get_col_str(sb->fr, 1));
if(samflag & 0x004) continue;
//if(get_col_str(sb->fr, 9)[0] == '*') continue;
if(!(sb->flags & 0x2)){
if(samflag & (0x100 | 0x800)) continue; // filter secondary/supplementary alignments
}
// chr
if((chr = getval_cuhash(sb->refs->rdhash, get_col_str(sb->fr, 2))) == MAX_U4){
continue;
}
if(chr != sb->chrs->buffer[sb->chridx]){
chridx = ++ sb->chridx;
push_u4v(sb->chrs, chr);
one_bitvec(sb->vsts, chr);
} else {
chridx = sb->chridx;
}
off = atol(get_col_str(sb->fr, 3)) - 1;
ptr = get_col_str(sb->fr, 5);
if((*ptr) == '*') continue;
sb->rdidx ++;
rdoff = 0;
minlen = num_min(get_col_len(sb->fr, 9), sb->bsize) / 2;
rddir = (atol(get_col_str(sb->fr, 1)) & 0x10) >> 4;
str = get_col_str(sb->fr, 9);
{
encap_layseqr(sb->seqs, 2);
sc = pop_layseqr(sb->seqs);
sc->chridx = chridx;
sc->bidx = (off / sb->bstep);
sc->rdidx = sb->rdidx;
sc->rddir = rddir;
sc->rdoff = rdoff;
sc->rbeg = off;
sc->rend = 0;
clear_basebank(sc->seq);
}
sl = NULL;
// parse SAM cigar
{
nxt = (off / sb->bstep) * sb->bstep;
if(nxt && off - nxt < UInt(sb->bsize - sb->bstep)){
if(sc->bidx){
scidx = offset_layseqr(sb->seqs, sc);
sl = pop_layseqr(sb->seqs);
sc = ref_layseqr(sb->seqs, scidx);
sl->chridx = chridx;
sl->bidx = sc->bidx - 1;
sl->rdidx = sb->rdidx;
sl->rddir = rddir;
sl->rdoff = rdoff;
sl->rbeg = off;
sl->rend = 0;
clear_basebank(sl->seq);
}
nxt += sb->bsize - sb->bstep;
} else {
nxt += sb->bstep;
}
}
val = 0;
while(*ptr){
op = MAX_U4;
switch(*ptr){
case '0' ... '9': val = val * 10 + (*ptr) - '0'; break;
case 'X':
case '=':
case 'M': op = 0b111; break;
case 'I': op = 0b110; break;
case 'D': op = 0b101; break;
case 'N': op = 0b001; break;
case 'S': op = 0b010; break;
case 'H':
case 'P': op = 0b000; break;
default:
fprintf(stderr, " -- %s\n", get_col_str(sb->fr, 5));
fprintf(stderr, " -- Bad cigar %d '%c' in %s -- %s:%d --\n", (int)(ptr - get_col_str(sb->fr, 5)), *ptr, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
ptr ++;
if(op == MAX_U4) continue;
if(val == 0) val = 1;
while(val){
if(op & 0b001){
len = num_min(val, nxt - off);
off += len;
} else {
len = val;
}
val -= len;
if(op & 0b010){
rdoff += len;
if(op & 0b100){
if(sc){
if(sc->seq->size == 0){
sc->rbeg = (op & 0b001)? off - len : off;
}
seq2basebank(sc->seq, str, len);
sc->rend = off;
}
if(sl){
if(sl->seq->size == 0){
sl->rbeg = (op & 0b001)? off - len : off;
}
seq2basebank(sl->seq, str, len);
sl->rend = off;
}
}
str += len;
}
if(off == nxt){
if(sl){
if(sl->rend >= sl->rbeg + minlen){
scidx = offset_layseqr(sb->seqs, sc);
sl = _push_padding_ref_samblock(sb, sl);
sc = ref_layseqr(sb->seqs, scidx);
if(sb->lidx == MAX_U4){
sb->lidx = offset_layseqr(sb->seqs, sl);
}
array_heap_push(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, offset_layseqr(sb->seqs, sl),
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
} else {
push_layseqr(sb->seqs, sl);
}
sl = NULL;
nxt += 2 * sb->bstep - sb->bsize;
} else {
scidx = offset_layseqr(sb->seqs, sc);
encap_layseqr(sb->seqs, 1);
sl = ref_layseqr(sb->seqs, scidx);
sc = pop_layseqr(sb->seqs);
sc->chridx = chridx;
sc->bidx = sl->bidx + 1;
sc->rdidx = sb->rdidx;
sc->rddir = rddir;
sc->rdoff = rdoff;
sc->rbeg = off;
sc->rend = 0;
clear_basebank(sc->seq);
nxt += sb->bsize - sb->bstep;
}
}
}
}
if(sl){
if(sl->rend >= sl->rbeg + minlen){
u4i scidx;
scidx = sc? offset_layseqr(sb->seqs, sc) : MAX_U4;
sl = _push_padding_ref_samblock(sb, sl);
sc = scidx == MAX_U4? NULL : ref_layseqr(sb->seqs, scidx);
if(sb->lidx == MAX_U4){
sb->lidx = offset_layseqr(sb->seqs, sl);
}
array_heap_push(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, offset_layseqr(sb->seqs, sl),
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
} else {
push_layseqr(sb->seqs, sl);
}
}
if(sc){
if(sc->rend >= sc->rbeg + minlen){
scidx = sl? offset_layseqr(sb->seqs, sl) : MAX_U4;
sc = _push_padding_ref_samblock(sb, sc);
sl = scidx == MAX_U4? NULL : ref_layseqr(sb->seqs, scidx);
if(sb->lidx == MAX_U4){
sb->lidx = offset_layseqr(sb->seqs, sc);
}
array_heap_push(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, offset_layseqr(sb->seqs, sc),
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
} else {
push_layseqr(sb->seqs, sc);
}
}
}
if(sb->heap->size == 0) break;
if(sb->lidx != MAX_U4){
sl = ref_layseqr(sb->seqs, sb->lidx);
sb->sidx = sb->heap->buffer[0];
sc = ref_layseqr(sb->seqs, sb->sidx);
if(sc->chridx == sl->chridx && sc->bidx + 1 >= sl->bidx){
sb->lidx = MAX_U4;
continue;
}
array_heap_remove(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, 0,
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
sc->rbeg -= sc->bidx * sb->bstep;
sc->rend -= sc->bidx * sb->bstep;
} else {
sb->sidx = sb->heap->buffer[0];
array_heap_remove(sb->heap->buffer, sb->heap->size, sb->heap->cap, u4i, 0,
num_cmpxx(ref_layseqr(sb->seqs, a)->chridx, ref_layseqr(sb->seqs, b)->chridx, ref_layseqr(sb->seqs, a)->bidx,
ref_layseqr(sb->seqs, b)->bidx, ref_layseqr(sb->seqs, a)->rdidx, ref_layseqr(sb->seqs, b)->rdidx));
sc = ref_layseqr(sb->seqs, sb->sidx);
sc->rbeg -= sc->bidx * sb->bstep;
sc->rend -= sc->bidx * sb->bstep;
}
return sc;
} while(1);
sb->sidx = MAX_U4;
return NULL;
}
static inline void info_samblock(void *obj, lay_seq_t *sq, lay_blk_t *bk){
SAMBlock *sb;
sb = (SAMBlock*)obj;
if(sq == NULL) return;
bk->node1 = sq->bidx;
bk->node2 = sq->bidx + 1;
bk->reftag = sb->refs->rdtags->buffer[sb->chrs->buffer[sq->chridx]];
bk->reflen = sb->refs->rdlens->buffer[sb->chrs->buffer[sq->chridx]];
bk->refoff = sq->bidx * sb->bstep;
}
typedef struct {
String *tag;
FileReader *fr;
u4i chridx, bidx;
u8i rdidx;
lay_seq_t *key;
lay_blk_t *blk;
} WTLAYBlock;
static inline WTLAYBlock* init_wtlayblock(FileReader *fr){
WTLAYBlock *wb;
wb = malloc(sizeof(WTLAYBlock));
wb->tag = init_string(32);
append_string(wb->tag, "annonymous", 10);
wb->fr = fr;
wb->chridx = 0;
wb->bidx = 0;
wb->rdidx = 0;
wb->key = calloc(1, sizeof(lay_seq_t));
wb->key->seq = init_basebank();
wb->blk = calloc(1, sizeof(lay_blk_t));
return wb;
}
static inline void free_wtlayblock(WTLAYBlock *wb){
free_string(wb->tag);
free_basebank(wb->key->seq);
free(wb->key);
free(wb->blk);
free(wb);
}
static inline lay_seq_t* iter_wtlayblock(void *obj){
WTLAYBlock *wb;
char *ss;
int c, sl;
wb = (WTLAYBlock*)obj;
clear_basebank(wb->key->seq);
while((c = readtable_filereader(wb->fr)) != -1){
switch(wb->fr->line->string[0]){
case '>':
wb->chridx ++;
wb->bidx = 0;
clear_string(wb->tag);
ss = get_col_str(wb->fr, 0) + 1;
for(sl=0;ss[sl] && ss[sl] != ' ' && ss[sl] != '\t';sl++){
}
append_string(wb->tag, ss, sl);
wb->blk->node1 = 0;
wb->blk->node2 = 0;
ss = strstr(ss + sl, "len=");
if(ss){
wb->blk->reflen = atol(ss);
} else {
wb->blk->reflen = 0;
}
wb->blk->reftag = wb->tag->string;
break;
case 'E':
wb->bidx ++;
wb->blk->refoff = atoll(get_col_str(wb->fr, 1));
wb->blk->node1 = atoll(get_col_str(wb->fr, 2) + 1);
wb->blk->node2 = atoll(get_col_str(wb->fr, 4) + 1);
wb->blk->reftag = wb->tag->string;
break;
case 'S':
case 's':
ss = get_col_str(wb->fr, 5);
sl = get_col_len(wb->fr, 5);
wb->key->chridx = wb->chridx;
wb->key->bidx = wb->bidx;
wb->key->rdidx = wb->rdidx ++;
wb->key->rddir = (get_col_str(wb->fr, 2)[0] == '-');
wb->key->rdoff = atoi(get_col_str(wb->fr, 3));
if(c >= 8){
wb->key->rbeg = atoi(get_col_str(wb->fr, 6));
wb->key->rend = atoi(get_col_str(wb->fr, 7));
} else {
wb->key->rbeg = 0;
wb->key->rend = 0;
}
seq2basebank(wb->key->seq, ss, sl);
return wb->key;
}
}
return NULL;
}
static inline void info_wtlayblock(void *obj, lay_seq_t *sq, lay_blk_t *bk){
if(sq == NULL) return;
memcpy(bk, ((WTLAYBlock*)obj)->blk, sizeof(lay_blk_t));
}
#endif
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/Velcon-Zheng/wtdbg2.git
git@gitee.com:Velcon-Zheng/wtdbg2.git
Velcon-Zheng
wtdbg2
wtdbg2
master

搜索帮助