1 Star 0 Fork 0

Velcon-Zheng/wtdbg2

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
poacns.h 59.14 KB
一键复制 编辑 原始数据 按行查看 历史
ruanjue 提交于 2019-06-21 19:45 . check bad tailing alignment in POA
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342
/*
*
* Copyright (c) 2018, Jue Ruan <ruanjue@gmail.com>
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef PO_MSA_CNS_RJ_H
#define PO_MSA_CNS_RJ_H
#include "bit2vec.h"
#include "dna.h"
#include "chararray.h"
#include "list.h"
#include "hashset.h"
#include <emmintrin.h>
#include <tmmintrin.h>
#if __BYTE_ORDER == 1234
//#pragma message(" ** " __FILE__ " has been tested in LITTLE_ENDIAN **\n")
#else
#pragma message(" ** " __FILE__ " hasn't been tested in BIG_ENDIAN **\n")
#endif
static int cns_debug = 0;
#define POG_RDLEN_MAX 0x7FF8
#define POG_RDCNT_MAX 0x3FFF
#define POG_BASEWIDTH 3
#define POG_BASEMAX 0x3F
#define POG_HEAD_NODE 0
#define POG_TAIL_NODE 1
#define POG_DP_BT_M 0
#define POG_DP_BT_I 1
#define POG_DP_BT_D 2
#define POG_ALNMODE_OVERLAP 0
#define POG_ALNMODE_GLOBAL 1
#define POG_VST_MAX MAX_U2
#define POG_SCORE_MIN (-(MAX_B2 >> 1))
typedef struct {
u2i rid, pos, base;
u2i rbeg, rend, rmax;
u2i nin, vst;
u4i edge, erev;
u4i aligned;
union {
u4i aux;
u4i bpos; // backbone pos
};
u4i coff;
union {
u8i flag;
struct { u4i roff, voff; };
};
} pog_node_t;
define_list(pognodev, pog_node_t);
typedef struct {
u4i node;
u4i cov:31, is_aux:1;
u4i next;
} pog_edge_t;
define_list(pogedgev, pog_edge_t);
typedef struct {
int cnsmode; // 0: gen_cns_pog, 1: dp_call_cns_pog
int refmode;
int alnmode;
int tribase;
int sse;
int near_dialog;
int W_score;
int W, rW, cW; // 64, 16, 8
int Wmax; // 1024
float W_mat_rate; // 0.92
int M, X, I, D, E;
float H; // homopolymer merge
int T; // bonus for end
u4i msa_min_cnt;
float msa_min_freq;
} POGPar;
static const POGPar DEFAULT_POG_PAR = (POGPar){0, 0, POG_ALNMODE_OVERLAP, 0, 1, 0, 0, 64, 16, 8, 1024, 0.92, 2, -5, -2, -4, -1, -3, 20, 3, 0.5};
typedef struct {
u4i coff:29, bt:3;
float max;
} pog_cns_t;
typedef struct {
SeqBank *seqs;
u2v *sbegs, *sends; // suggested [beg, end) on ref(1st seq) in reference-based mode
pognodev *nodes;
pogedgev *edges;
POGPar *par;
b2v *qprof;
b2v *rows;
u4v *rowr;
b2v *btds;
u2v *btvs;
u4v *btxs;
u4v *stack;
u2i backbone;
u4i msa_len;
u1v *msa;
u2v *bcnts[7];
u2v *hcovs;
u1v *cbts;
BaseBank *cns;
u8i ncall;
} POG;
static inline POG* init_pog(POGPar par){
POG *g;
g = malloc(sizeof(POG));
g->seqs = init_seqbank();
g->sbegs = init_u2v(32);
g->sends = init_u2v(32);
g->nodes = init_pognodev(16 * 1024);
g->edges = init_pogedgev(16 * 1024);
g->par = malloc(sizeof(POGPar));
memcpy(g->par, &par, sizeof(POGPar));
g->qprof = init_b2v(4 * 1024);
g->rows = init_b2v(16 * 1024);
g->rowr = init_u4v(64);
g->btds = init_b2v(16 * 1024);
g->btvs = init_u2v(8 * 1024 * 1024);
g->btxs = init_u4v(1024);
g->stack = init_u4v(32);
g->backbone = 0;
g->msa_len = 0;
g->msa = init_u1v(16 * 1024);
g->bcnts[0] = init_u2v(4 * 1024);
g->bcnts[1] = init_u2v(4 * 1024);
g->bcnts[2] = init_u2v(4 * 1024);
g->bcnts[3] = init_u2v(4 * 1024);
g->bcnts[4] = init_u2v(4 * 1024);
g->bcnts[5] = init_u2v(4 * 1024);
g->bcnts[6] = init_u2v(4 * 1024);
g->hcovs = init_u2v(4 * 1024);
g->cbts = init_u1v(5 * 2 * 1024);
g->cns = init_basebank();
g->ncall = 0;
return g;
}
static inline void renew_pog(POG *g){
free_seqbank(g->seqs); g->seqs = init_seqbank();
renew_u2v(g->sbegs, 32);
renew_u2v(g->sends, 32);
renew_pognodev(g->nodes, 16 * 1024);
renew_pogedgev(g->edges, 16 * 1024);
renew_b2v(g->qprof, 4 * 1024);
renew_b2v(g->rows, 16 * 1024);
renew_u4v(g->rowr, 64);
renew_b2v(g->btds, 16 * 1024);
renew_u2v(g->btvs, 8 * 1024 * 1024);
renew_u4v(g->btxs, 16 * 1024);
renew_u4v(g->stack, 32);
renew_u1v(g->msa, 16 * 1024);
renew_u2v(g->bcnts[0], 4 * 1024);
renew_u2v(g->bcnts[1], 4 * 1024);
renew_u2v(g->bcnts[2], 4 * 1024);
renew_u2v(g->bcnts[3], 4 * 1024);
renew_u2v(g->bcnts[4], 4 * 1024);
renew_u2v(g->bcnts[5], 4 * 1024);
renew_u2v(g->bcnts[6], 4 * 1024);
renew_u2v(g->hcovs, 4 * 1024);
renew_u1v(g->cbts, 5 * 2 * 1024);
free_basebank(g->cns); g->cns = init_basebank();
}
static inline void free_pog(POG *g){
free_seqbank(g->seqs);
free_u2v(g->sbegs);
free_u2v(g->sends);
free_pognodev(g->nodes);
free_pogedgev(g->edges);
free_b2v(g->qprof);
free_b2v(g->rows);
free_u4v(g->rowr);
free_b2v(g->btds);
free_u2v(g->btvs);
free_u4v(g->btxs);
free_u4v(g->stack);
free_u1v(g->msa);
free_u2v(g->bcnts[0]);
free_u2v(g->bcnts[1]);
free_u2v(g->bcnts[2]);
free_u2v(g->bcnts[3]);
free_u2v(g->bcnts[4]);
free_u2v(g->bcnts[5]);
free_u2v(g->bcnts[6]);
free_u2v(g->hcovs);
free_u1v(g->cbts);
free_basebank(g->cns);
free(g->par);
free(g);
}
static inline void push_pog_core(POG *g, char *seq, u4i len, u2i refbeg, u2i refend){
if(g->seqs->nseq < POG_RDCNT_MAX){
len = num_min(len, POG_RDLEN_MAX);
push_seqbank(g->seqs, NULL, 0, seq, len);
push_u2v(g->sbegs, refbeg);
push_u2v(g->sends, refend);
}
}
static inline void push_pog(POG *g, char *seq, u1i len){ push_pog_core(g, seq, len, 0, 0); }
static inline void fwdbitpush_pog_core(POG *g, u8i *bits, u8i off, u4i len, u2i refbeg, u2i refend){
if(g->seqs->nseq < POG_RDCNT_MAX){
len = num_min(len, POG_RDLEN_MAX);
fwdbitpush_seqbank(g->seqs, NULL, 0, bits, off, len);
push_u2v(g->sbegs, refbeg);
push_u2v(g->sends, refend);
}
}
static inline void fwdbitpush_pog(POG *g, u8i *bits, u8i off, u4i len){ fwdbitpush_pog_core(g, bits, off, len, 0, 0); }
static inline void revbitpush_pog_core(POG *g, u8i *bits, u8i off, u4i len, u2i refbeg, u2i refend){
if(g->seqs->nseq < POG_RDCNT_MAX){
len = num_min(len, POG_RDLEN_MAX);
revbitpush_seqbank(g->seqs, NULL, 0, bits, off, len);
push_u2v(g->sbegs, refbeg);
push_u2v(g->sends, refend);
}
}
static inline void revbitpush_pog(POG *g, u8i *bits, u8i off, u4i len){ revbitpush_pog_core(g, bits, off, len, 0, 0); }
static inline void print_dot_pog(POG *g, FILE *out){
pog_node_t *n;
pog_edge_t *e;
u4i nidx, eidx;
fprintf(out, "digraph {\n");
fprintf(out, "rankdir=LR\n");
fprintf(out, "N0 [label=\"BEG\"]\n");
fprintf(out, "N1 [label=\"END\"]\n");
for(nidx=POG_TAIL_NODE+1;nidx<g->nodes->size;nidx++){
n = ref_pognodev(g->nodes, nidx);
fprintf(out, "N%u [label=R%u_%u_%c]\n", nidx, n->rid, n->pos, bit_base_table[(n->base) & 0x03]);
}
for(nidx=0;nidx<g->nodes->size;nidx++){
n = ref_pognodev(g->nodes, nidx);
if(n->aligned != nidx){
fprintf(out, "N%u -> N%u [color=magenta style=dashed]\n", nidx, n->aligned);
}
eidx = n->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
#if DEBUG
if(e->next == eidx){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
eidx = e->next;
if(e->is_aux) continue;
fprintf(out, "N%u -> N%u [label=%u]\n", nidx, e->node, e->cov);
}
}
fprintf(out, "}\n");
}
static inline void fprint_dot_pog(POG *g, char *prefix, char *suffix){
FILE *out;
out = open_file_for_write(prefix, suffix, 1);
print_dot_pog(g, out);
fclose(out);
}
static inline void print_vstdot_pog(POG *g, char *fname){
FILE *out;
pog_node_t *n;
pog_edge_t *e;
u4i nidx, eidx;
out = open_file_for_write(fname, NULL, 1);
fprintf(out, "digraph {\n");
for(nidx=0;nidx<g->nodes->size;nidx++){
n = ref_pognodev(g->nodes, nidx);
if(nidx && n->vst == 0) continue;
fprintf(out, "N%u [label=\"N%u:%u:%u:%d\"]\n", nidx, nidx, n->aux, n->nin, n->vst);
eidx = n->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
if(e->is_aux) continue;
fprintf(out, "N%u -> N%u [label=%u]\n", nidx, e->node, e->cov);
}
}
fprintf(out, "}\n");
fclose(out);
}
static inline void print_seqs_pog(POG *g, char *prefix, char *suffix){
FILE *out;
u4i i;
out = open_file_for_write(prefix, suffix, 1);
for(i=0;i<g->seqs->nseq;i++){
fprintf(out, ">S%u len=%u\n", i, g->seqs->rdlens->buffer[i]);
println_fwdseq_basebank(g->seqs->rdseqs, g->seqs->rdoffs->buffer[i], g->seqs->rdlens->buffer[i], out);
}
fclose(out);
}
static inline void print_msa_pog(POG *g, FILE *out){
char *str;
u1i *cns;
u4i i, j, b, e, c, n;
str = malloc(g->msa_len + 1);
fprintf(out, "[POS] ");
for(i=j=0;i<g->msa_len;i++){
if((i % 10) == 0){
fprintf(out, "|%04u", i + 1);
j += 5;
} else if(i >= j){
putc(' ', out);
j ++;
}
}
fputc('\n', out);
for(i=0;i<g->seqs->nseq+1;i++){
if(i == g->seqs->nseq){
fprintf(out, "[CNS] ");
} else {
fprintf(out, "[%03u] ", i);
}
b = i * g->msa_len;
e = b + g->msa_len;
n = 0;
for(j=b;j<e;j++){
c = g->msa->buffer[j];
if(c < 4) n ++;
str[j-b] = "ACGT-acgt*"[c];
//fputc("ACGT-"[c], out);
}
str[e-b] = 0;
fputs(str, out);
if(i == g->seqs->nseq){
fprintf(out, "\t%u\t%u\n", (u4i)g->cns->size, n);
} else {
fprintf(out, "\t%u\t%u\n", g->seqs->rdlens->buffer[i], n);
}
}
fprintf(out, "[POS] ");
cns = ref_u1v(g->msa, g->msa_len * g->seqs->nseq);
for(i=j=b=0;i<g->msa_len;i++){
if(cns[i] < 4){
j ++;
if((j % 10) == 1){
while(b < i){
fputc(' ', out);
b ++;
}
fprintf(out, "|%04u", j);
b += 5;
}
}
}
fprintf(out, "\n");
if(1){
u4i divs[5];
u2i *hcovs;
hcovs = g->hcovs->buffer;
divs[0] = 10000;
divs[1] = 1000;
divs[2] = 100;
divs[3] = 10;
divs[4] = 1;
for(i=0;i<4;i++){
for(j=0;j<g->msa_len;j++){
if(hcovs[j] < divs[i + 1]){
str[j] = ' ';
} else {
str[j] = '0' + ((hcovs[j] % divs[i]) / divs[i + 1]);
}
}
str[j] = 0;
fprintf(out, "[%c ] %s\n", "HCOV"[i], str);
}
}
free(str);
}
static inline pog_node_t* add_node_pog(POG *g, u2i rid, u2i pos, u2i base){
pog_node_t *u;
u = next_ref_pognodev(g->nodes);
ZEROS(u);
u->rid = rid;
u->pos = pos;
u->base = base;
u->aligned = offset_pognodev(g->nodes, u);
return u;
}
static inline pog_edge_t* add_edge_pog(POG *g, pog_node_t *u, pog_node_t *v, int cov, int is_aux){
pog_edge_t *e, *f, *p;
u4i eidx;
#if DEBUG
if(u == v){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
if(u->edge == 0){
e = next_ref_pogedgev(g->edges);
u->edge = offset_pogedgev(g->edges, e);
e->node = offset_pognodev(g->nodes, v);
e->cov = cov;
e->is_aux = is_aux;
e->next = 0;
v->nin ++;
return e;
} else {
e = f = p = NULL;
encap_pogedgev(g->edges, 1);
eidx = u->edge;
while(eidx){
f = ref_pogedgev(g->edges, eidx);
eidx = f->next;
if(f->node == offset_pognodev(g->nodes, v)){
e = f;
break;
}
p = f;
}
if(e == NULL){
e = next_ref_pogedgev(g->edges);
e->node = offset_pognodev(g->nodes, v);
e->cov = cov;
e->is_aux = is_aux;
e->next = 0;
v->nin ++;
} else {
e->is_aux &= is_aux;
e->cov += cov;
if(cov == 0){
return e;
} else if(p == NULL){
return e;
}
// detach e from u->edge
p->next = e->next;
e->next = 0;
}
f = ref_pogedgev(g->edges, u->edge);
if(e->cov > f->cov){
e->next = u->edge;
u->edge = offset_pogedgev(g->edges, e);
} else {
while(f->next){
if(ref_pogedgev(g->edges, f->next)->cov < e->cov){
break;
}
f = ref_pogedgev(g->edges, f->next);
}
e->next = f->next;
f->next = offset_pogedgev(g->edges, e);
}
return e;
}
}
static inline pog_node_t* merge_node_pog(POG *g, pog_node_t *x, pog_node_t *u){
pog_node_t *v, *w;
pog_edge_t *e;
u4i xidx, eidx;
xidx = offset_pognodev(g->nodes, x);
do {
v = ref_pognodev(g->nodes, xidx);
if(v->nin && v->base == u->base){
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
w = ref_pognodev(g->nodes, e->node);
add_edge_pog(g, v, w, e->cov, e->is_aux);
w->nin --; // will delete e
}
u->edge = 0;
break;
}
v = NULL;
} while(xidx != offset_pognodev(g->nodes, x));
u->aligned = x->aligned;
x->aligned = offset_pognodev(g->nodes, u);
return v? v : u;
}
static inline void beg_pog_core(POG *g, BaseBank *cns, u8i off, u2i len, int clear_all){
pog_node_t *head, *tail, *u, *v;
pog_edge_t *e;
u4i i, bb, bk;
g->ncall ++;
if((g->ncall % 16) == 0){
renew_pog(g);
}
clear_and_encap_pognodev(g->nodes, 2 + len);
clear_pogedgev(g->edges);
ZEROS(next_ref_pogedgev(g->edges));
head = next_ref_pognodev(g->nodes);
ZEROS(head);
if(g->par->tribase){
head->base = POG_BASEMAX + 1;
} else {
head->base = 4;
}
head->aligned = POG_HEAD_NODE;
tail = next_ref_pognodev(g->nodes);
ZEROS(tail);
if(g->par->tribase){
tail->base = POG_BASEMAX + 1;
} else {
tail->base = 4;
}
tail->aligned = POG_TAIL_NODE;
u = head;
bk = 0;
// add backbone nodes
g->backbone = len;
for(i=0;i<len;i++){
bb = get_basebank(cns, off + i);
if(g->par->tribase){
bk = ((bk << 2) | bb) & POG_BASEMAX;
} else {
bk = bb;
}
v = add_node_pog(g, g->seqs->nseq, i, bk);
v->bpos = i;
e = add_edge_pog(g, u, v, 0, 1);
u = v;
}
e = add_edge_pog(g, u, tail, 0, 1);
if(clear_all){
clear_seqbank(g->seqs);
clear_u2v(g->sbegs);
clear_u2v(g->sends);
clear_basebank(g->cns);
}
}
static inline void beg_pog(POG *g){
beg_pog_core(g, NULL, 0, 0, 1);
}
static inline void prepare_rd_align_pog(POG *g, u2i rid){
pog_node_t *u, *v;
pog_edge_t *e;
b2i *row, *btds;
u8i rmax;
u4i seqoff, seqlen, seqlex, slen, seqinc;
u4i nidx, eidx, i, j, bb;
seqoff = g->seqs->rdoffs->buffer[rid];
seqlen = g->seqs->rdlens->buffer[rid];
seqlex = roundup_times(seqlen, 8); // 128 = 8 * sizeof(b2i)
slen = seqlex >> 3;
seqinc = seqlex + 8; // seqlex, max_idx, beg, end, and paddings
// init graph
encap_pognodev(g->nodes, seqlen + 2);
encap_pogedgev(g->edges, seqlen + 2);
// estimate cap of g->rows
for(i=0;i<g->nodes->size;i++){
v = ref_pognodev(g->nodes, i);
v->roff = 0;
v->voff = 0;
v->coff = 0;
v->vst = 0;
v->aux = 0;
}
clear_u4v(g->stack);
push_u4v(g->stack, POG_HEAD_NODE);
rmax = 0;
bb = 1;
while(pop_u4v(g->stack, &nidx)){
u = ref_pognodev(g->nodes, nidx);
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
#if DEBUG
if(eidx == e->next){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
eidx = e->next;
v = ref_pognodev(g->nodes, e->node);
if(v->vst == 0){
bb ++;
if(bb > rmax){
rmax = bb;
}
}
v->vst ++;
if(v->vst == v->nin){
push_u4v(g->stack, e->node);
}
}
bb --;
}
#if DEBUG
if(bb){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
rmax += 2;
head_sl_b2v(g->rows, g->rows->n_head);
clear_and_encap_b2v(g->rows, rmax * seqinc + 8);
head_sr_b2v(g->rows, (16 - (((u8i)g->rows->buffer) & 0xF)) >> 1);
head_sl_b2v(g->btds, g->btds->n_head);
clear_and_encap_b2v(g->btds, rmax * seqinc + 8);
head_sr_b2v(g->btds, (16 - (((u8i)g->btds->buffer) & 0xF)) >> 1);
inc_b2v(g->rows, seqinc);
inc_b2v(g->btds, seqinc);
clear_u4v(g->rowr);
bb = 0;
for(i=0;i<g->nodes->size;i++){
v = ref_pognodev(g->nodes, i);
v->vst = 0;
v->erev = bb;
bb += v->nin;
if(i < 2) bb ++; // in case of add mnode
}
clear_u2v(g->btvs);
inc_u2v(g->btvs, 8);
clear_and_inc_u4v(g->btxs, bb + 1);
//g->btxs->buffer[0] = 0;
memset(g->btxs->buffer, 0, (bb + 1) * sizeof(u4i));
u = ref_pognodev(g->nodes, POG_HEAD_NODE);
if(g->rowr->size){
u->coff = u->roff = g->rowr->buffer[--g->rowr->size];
} else {
u->roff = g->rows->size;
inc_b2v(g->rows, seqinc);
u->coff = g->btds->size;
inc_b2v(g->btds, seqinc);
}
row = ref_b2v(g->rows, u->roff);
btds = ref_b2v(g->btds, u->coff);
if(g->par->alnmode == POG_ALNMODE_OVERLAP){ // overlap alignment
memset(row, 0, seqinc * sizeof(b2i));
//row[0] = g->par->T;
} else { // global
__m128i MIN;
MIN = _mm_set1_epi16(POG_SCORE_MIN);
for(j=0;j<slen;j++){
_mm_store_si128(((__m128i*)row) + j, MIN);
}
//row[0] = g->par->T;
}
memset(btds, 0, seqinc * sizeof(b2i));
if(g->par->near_dialog && g->par->W_score <= 0){
row[seqlex] = g->par->W;
row[seqlex + 1] = 0;
row[seqlex + 2] = num_min(Int(seqlex), 2 * g->par->W + 1);
} else {
row[seqlex] = 0;
row[seqlex + 1] = 0;
row[seqlex + 2] = seqlex;
}
}
// SSE
// BANDED, Auto fit the previous-row's max score in center
// OVERLAP
static inline void sse_band_row_rdaln_pog(POG *g, u4i nidx1, u4i nidx2, u4i seqlen, u2i vst, u4i coff1, u4i coff2, u4i roff1, u4i roff2, b2i *qp, int W, int center){
__m128i I, D, H, E, F, S, MAX, MIN, CMP;
__m128i BT, BT1, BT2, BT1_MASK, BT2_MASK, BTX_MASK;
b2i *row1, *row2, *btds;
u4i i, slen, seqlex, beg, end;
int lsth, lstf, msk, mi;
UNUSED(coff1);
slen = (seqlen + 7) / 8;
seqlex = slen * 8;
I = _mm_set1_epi16(g->par->I);
D = _mm_set1_epi16(g->par->D);
MIN = _mm_set1_epi16(POG_SCORE_MIN);
BT1_MASK = _mm_set1_epi16(0b10);
BT2_MASK = _mm_set1_epi16(0b01);
BTX_MASK = _mm_set1_epi16(vst << 2);
row1 = ref_b2v(g->rows, roff1);
row2 = ref_b2v(g->rows, roff2);
btds = ref_b2v(g->btds, coff2);
if(W){
if(center < 0){
if(row1[row1[seqlex]] >= g->par->W_score){
if(g->par->near_dialog){
if(row1[seqlex + 1] > 0 || row1[seqlex + 2] < Int(seqlex)){
if(row1[seqlex] == (row1[seqlex + 1] + row1[seqlex + 2]) / 2){
center = (row1[seqlex + 1] + row1[seqlex + 2]) / 2 + 1;
} else if(row1[seqlex] > (row1[seqlex + 1] + row1[seqlex + 2]) / 2){
center = (row1[seqlex + 1] + row1[seqlex + 2]) / 2 + 2;
} else {
center = (row1[seqlex + 1] + row1[seqlex + 2]) / 2;
}
} else { // first set center
center = row1[seqlex];
}
} else {
center = row1[seqlex];
}
beg = num_max(center - W, row1[seqlex + 1]);
end = num_min(center + 1 + W, row1[seqlex + 2] + 1);
if(end > seqlex) end = seqlex;
} else {
beg = row1[seqlex + 1];
end = num_min(row1[seqlex + 2] + 1, Int(seqlex));
}
} else {
beg = num_max(center - W, row1[seqlex + 1]);
end = num_min(center + 1 + W, row1[seqlex + 2] + 1);
if(end > seqlex) end = seqlex;
}
} else {
beg = 0;
end = seqlex;
}
if(beg & 0x7U) beg = beg & (~0x7U);
if(end & 0x7U) end = (end + 8) & (~0x7U);
beg = beg / 8;
end = (end + 7) / 8;
if(row1[seqlex + 1] >= row1[seqlex + 2]){ // happening in realign mode
for(i=beg*8;i<end*8;i+=8){
_mm_store_si128(((__m128i*)(row1 + i)), MIN);
}
} else {
if(Int(beg * 8) < row1[seqlex + 1]){
for(i=beg*8;Int(i)<row1[seqlex + 1];i+=8){
_mm_store_si128(((__m128i*)(row1 + i)), MIN);
}
}
if(Int(end * 8) > row1[seqlex + 2]){
for(i=row1[seqlex+2];i<end*8;i+=8){
_mm_store_si128(((__m128i*)(row1 + i)), MIN);
}
}
}
if(nidx2 == POG_TAIL_NODE){
while(end * 8 < seqlex){
_mm_store_si128(((__m128i*)(row1)) + end, MIN);
end ++;
}
}
MAX = _mm_set1_epi16(POG_SCORE_MIN);
if(beg){
lstf = lsth = POG_SCORE_MIN;
} else {
if(row1[seqlex + 1] >= row1[seqlex + 2]){
lstf = POG_SCORE_MIN;
lsth = 0; // auto global alignment
} else {
lstf = (g->par->alnmode == POG_ALNMODE_OVERLAP)? 0 : POG_SCORE_MIN;
if(nidx1){
lsth = (g->par->alnmode == POG_ALNMODE_OVERLAP)? 0 : POG_SCORE_MIN;;
} else {
lsth = g->par->T;
}
}
}
for(i=beg;i<end;i++){
E = _mm_load_si128(((__m128i*)(row1)) + i);
H = _mm_slli_si128(E, 2);
H = _mm_insert_epi16(H, lsth, 0);
H = _mm_adds_epi16(H, _mm_load_si128(((__m128i*)qp) + i));
lsth = _mm_extract_epi16(E, 7);
E = _mm_adds_epi16(E, D);
S = _mm_max_epi16(H, E);
BT1 = _mm_cmpgt_epi16(S, H);
BT1 = _mm_and_si128(BT1, BT1_MASK);
F = _mm_slli_si128(S, 2);
F = _mm_insert_epi16(F, lstf, 0);
F = _mm_adds_epi16(F, I);
F = _mm_max_epi16(S, F);
while(1){
H = _mm_slli_si128(F, 2);
H = _mm_insert_epi16(H, lstf, 0);
H = _mm_adds_epi16(H, I);
CMP = _mm_cmpgt_epi16(H, F);
msk = _mm_movemask_epi8(CMP);
if(msk == 0){
break;
} else {
F = _mm_max_epi16(H, F);
}
}
MAX = _mm_max_epi16(MAX, F);
lstf = _mm_extract_epi16(F, 7);
BT2 = _mm_cmpgt_epi16(F, S);
BT2 = _mm_and_si128(BT2, BT2_MASK);
BT = _mm_xor_si128(BT1, BT2); // (0, (1, 3), 2)
BT = _mm_xor_si128(BT, BTX_MASK); // (0, (1, 3), 2) | (vst << 2)
_mm_store_si128(((__m128i*)row2) + i, F);
_mm_stream_si128(((__m128i*)btds) + i, BT);
}
if(1){
b2i ary[8];
u4i j, k;
_mm_store_si128((__m128i*)ary, MAX);
k = 0;
for(j=1;j<8;j++){
if(ary[j] > ary[k]){
k = j;
}
}
mi = beg * 8 + k;
for(i=beg+1;i<end;i++){
if(row2[i * 8 + k] > row2[mi]){
mi = i * 8 + k;
}
}
} else {
mi = beg * 8;
for(i=mi+1;i<end*8;i++){
if(row2[i] > row2[mi]){
mi = i;
}
}
}
row2[seqlex] = mi;
row2[seqlex + 1] = beg * 8;
row2[seqlex + 2] = end * 8;
}
static inline void merge_row_rdaln_pog(POG *g, u4i seqlen, u4i coff1, u4i coff2, u4i roff1, u4i roff2){
b2i *row1, *row2, *btd1, *btd2;
u4i i, seqlex, beg[3], end[3], sz, b;
row1 = ref_b2v(g->rows, roff1);
row2 = ref_b2v(g->rows, roff2);
btd1 = ref_b2v(g->btds, coff1);
btd2 = ref_b2v(g->btds, coff2);
seqlex = roundup_times(seqlen, 8);
beg[0] = (row1[seqlex + 1]);
beg[1] = (row2[seqlex + 1]);
end[0] = (row1[seqlex + 2]);
end[1] = (row2[seqlex + 2]);
beg[2] = num_max(beg[0], beg[1]);
end[2] = num_min(end[0], end[1]);
if(1){
__m128i R1, R2, D1, D2, MK;
for(i=beg[2];i&0x7;i++){
if(row2[i] < row1[i]){
row2[i] = row1[i];
btd2[i] = btd1[i];
}
}
for(;i+8<=end[2];i+=8){
R1 = _mm_load_si128((__m128i*)(row1 + i));
R2 = _mm_load_si128((__m128i*)(row2 + i));
D1 = _mm_load_si128((__m128i*)(btd1 + i));
D2 = _mm_load_si128((__m128i*)(btd2 + i));
MK = _mm_cmpgt_epi16(R1, R2);
R2 = _mm_max_epi16(R1, R2);
_mm_store_si128((__m128i*)(row2 + i), R2);
D2 = _mm_or_si128(_mm_and_si128(MK, D1), _mm_andnot_si128(MK, D2));
_mm_store_si128((__m128i*)(btd2 + i), D2);
}
for(;i<end[2];i++){
if(row2[i] < row1[i]){
row2[i] = row1[i];
btd2[i] = btd1[i];
}
}
} else {
for(i=beg[2];i<end[2];i++){
if(row2[i] < row1[i]){
row2[i] = row1[i];
btd2[i] = btd1[i];
}
}
}
if(0){
if(beg[0] < beg[2]){
sz = num_min(beg[2], end[0]) - beg[0];
memcpy(row2 + beg[0], row1 + beg[0], sz * sizeof(b2i));
memcpy(btd2 + beg[0], btd1 + beg[0], sz * sizeof(b2i));
}
for(i=end[2];i<beg[2];i++){ // in case of two independent regions
row2[i] = POG_SCORE_MIN;
}
if(end[0] > end[2]){
sz = num_min(end[0], seqlex) - end[2];
memcpy(row2 + end[2], row1 + end[2], sz * sizeof(b2i));
memcpy(btd2 + end[2], btd1 + end[2], sz * sizeof(b2i));
}
} else {
if(beg[0] < beg[1]){
b = beg[0];
if(end[0] >= beg[1]){
sz = beg[1] - beg[0];
memcpy(row2 + b, row1 + b, sz * sizeof(b2i));
memcpy(btd2 + b, btd1 + b, sz * sizeof(b2i));
} else {
__m128i F;
F = _mm_set1_epi16(POG_SCORE_MIN);
sz = end[0] - beg[0];
memcpy(row2 + b, row1 + b, sz * sizeof(b2i));
memcpy(btd2 + b, btd1 + b, sz * sizeof(b2i));
for(i=end[0];i<beg[1];i+=8){
_mm_store_si128(((__m128i*)(row2 + i)), F);
}
}
}
if(end[0] > end[1]){
if(beg[0] <= end[1]){
b = end[1];
sz = end[0] - b;
memcpy(row2 + b, row1 + b, sz * sizeof(b2i));
memcpy(btd2 + b, btd1 + b, sz * sizeof(b2i));
} else {
__m128i F;
F = _mm_set1_epi16(POG_SCORE_MIN);
b = beg[0];
sz = end[0] - b;
memcpy(row2 + b, row1 + b, sz * sizeof(b2i));
memcpy(btd2 + b, btd1 + b, sz * sizeof(b2i));
for(i=end[1];i<beg[0];i+=8){
_mm_store_si128(((__m128i*)(row2 + i)), F);
}
}
}
}
row2[seqlex + 1] = num_min(beg[0], beg[1]);
row2[seqlex + 2] = num_max(end[0], end[1]);
if(row1[(row1[seqlex])] > row2[(row2[seqlex])]){
row2[seqlex] = row1[seqlex];
}
}
static inline void set_rd_query_prof(POG *g, u4i rid){
b2i *qp;
u4i seqoff, seqlen, seqlex, slen, seqinc;
u4i i, j, bb, bk;
seqoff = g->seqs->rdoffs->buffer[rid];
seqlen = g->seqs->rdlens->buffer[rid];
seqlex = roundup_times(seqlen, 8);
slen = seqlex >> 3;
seqinc = seqlex + 8;
head_sl_b2v(g->qprof, g->qprof->n_head);
if(g->par->tribase){
clear_and_encap_b2v(g->qprof, seqlex * (POG_BASEMAX + 2) + 8);
head_sr_b2v(g->qprof, (16 - (((u8i)g->qprof->buffer) & 0xF)) >> 1);
for(i=0;i<=POG_BASEMAX;i++){
qp = g->qprof->buffer + i * seqlex;
bk = 0;
for(j=0;j<seqlen;j++){
bb = get_basebank(g->seqs->rdseqs, seqoff + j);
bk = ((bk << 2) | bb) & POG_BASEMAX;
if(bb == (i & 0x03)){
if(bk == i){
qp[j] = g->par->M + g->par->tribase;
} else {
qp[j] = g->par->M;
}
} else {
qp[j] = g->par->X;
}
}
for(;j<seqlex;j++){
qp[j] = POG_SCORE_MIN;
}
}
} else {
clear_and_encap_b2v(g->qprof, seqlex * (4 + 1) + 8);
head_sr_b2v(g->qprof, (16 - (((u8i)g->qprof->buffer) & 0xF)) >> 1);
for(i=0;i<4;i++){
qp = g->qprof->buffer + i * seqlex;
for(j=0;j<seqlen;j++){
bb = get_basebank(g->seqs->rdseqs, seqoff + j);
if(bb == (i & 0x03)){
qp[j] = g->par->M;
} else {
qp[j] = g->par->X;
}
}
for(;j<seqlex;j++){
qp[j] = POG_SCORE_MIN;
}
}
}
{
__m128i X;
qp = g->qprof->buffer + i * seqlex;
X = _mm_set1_epi16(g->par->X);
for(j=0;j<slen;j++){
_mm_stream_si128(((__m128i*)qp) + j, X);
}
}
}
static inline u2i get_rdbase_pog(POG *g, u4i rid, u4i pos){
u4i seqoff;
seqoff = g->seqs->rdoffs->buffer[rid];
if(g->par->tribase == 0){
return get_basebank(g->seqs->rdseqs, seqoff + pos);
}
if(pos >= POG_BASEWIDTH){
return sub32seqbits(g->seqs->rdseqs->bits, seqoff + pos + 1 - POG_BASEWIDTH) >> (64 - (POG_BASEWIDTH << 1));
} else {
return (sub32seqbits(g->seqs->rdseqs->bits, seqoff)) >> (64 - ((pos + 1) << 1));
}
}
static inline int _cal_matches_alignment_pog(POG *g, u4i rid, int *xb, int xe, int *badtail){
pog_node_t *u, *v;
u4i nidx, seqoff, btx, bt, vst;
int x, seqlen, mat, score, x0, x1, flag;
seqlen = g->seqs->rdlens->buffer[rid];
seqoff = g->seqs->rdoffs->buffer[rid];
x = xe;
score = 0;
x0 = x1 = x;
flag = 0;
nidx = POG_TAIL_NODE;
v = ref_pognodev(g->nodes, nidx);
btx = 1;
mat = 0;
while(x >= 0){
u = ref_pognodev(g->nodes, nidx);
bt = g->btvs->buffer[u->voff + x - u->rbeg];
vst = bt >> 2;
bt = bt & 0x03;
if(bt == POG_DP_BT_M){
if(flag) score += ((u->base & 0x03) == get_basebank(g->seqs->rdseqs, seqoff + x))? g->par->M : g->par->X;
else { flag = 1; x1 = x; }
mat ++;
x --;
} else if(bt & POG_DP_BT_I){
if(flag) score += g->par->I;
x --;
} else {
if(flag) score += g->par->D;
else { flag = 1; x1 = x; }
}
if(score == 10 * g->par->M){
x0 = x + 1;
}
if(bt & POG_DP_BT_I){
} else {
nidx = g->btxs->buffer[g->nodes->buffer[nidx].erev + vst];
}
if(x < 0){
break;
}
u = ref_pognodev(g->nodes, nidx);
if(x < u->rbeg || x >= u->rend){
break;
}
}
*xb = x;
*badtail = x1 > x0? x1 - x0 : 0;
return mat;
}
static inline int _alignment2graph_pog(POG *g, u4i rid, int xb, int xe){
pog_node_t *u, *v;
pog_edge_t *e;
u4i nidx, i, seqoff, btx, bt, vst;
int x, seqlen;
seqlen = g->seqs->rdlens->buffer[rid];
seqoff = g->seqs->rdoffs->buffer[rid];
x = xe;
nidx = POG_TAIL_NODE;
v = ref_pognodev(g->nodes, nidx);
int my_print = (cns_debug > 3);
if(x + 1 < (int)seqlen){
for(i=seqlen-1;(int)i>x;i--){
if(my_print){
fprintf(stderr, "BT[%u] y=N%u_R%u_%u_%c x=%d:%c z=%d\n", rid, nidx, v->rid, v->pos, bit_base_table[(v->base) & 0x03], i, bit_base_table[get_basebank(g->seqs->rdseqs, seqoff + i)], -1);
fflush(stderr);
}
u = add_node_pog(g, rid, i, get_rdbase_pog(g, rid, i));
add_edge_pog(g, u, v, 1, 0);
v = u;
}
v->coff = ref_pognodev(g->nodes, POG_TAIL_NODE)->coff;
}
btx = 1;
while(1){
if(x >= 0){
u = ref_pognodev(g->nodes, nidx);
bt = g->btvs->buffer[u->voff + x - u->rbeg];
} else {
bt = 0;
}
vst = bt >> 2;
bt = bt & 0x03;
if(my_print){
pog_node_t *w;
w = ref_pognodev(g->nodes, nidx);
fprintf(stderr, "BT[%u] y=N%u_R%u_%u_%c x=%d:%c z=%d\n", rid, nidx, w->rid, w->pos, bit_base_table[(w->base) & 0x03], x, x>=0? bit_base_table[get_basebank(g->seqs->rdseqs, seqoff + x)] : '*', bt);
fflush(stderr);
}
if(bt == POG_DP_BT_M){
if(btx){
u = add_node_pog(g, rid, x, get_rdbase_pog(g, rid, x));
e = add_edge_pog(g, u, v, 1, 0);
x --;
if(nidx > POG_TAIL_NODE){
u = merge_node_pog(g, ref_pognodev(g->nodes, nidx), u);
}
v = u;
} else {
xb = x;
if(nidx == POG_HEAD_NODE || nidx + 1 >= g->nodes->size || g->nodes->buffer[nidx].rid != g->nodes->buffer[nidx + 1].rid){
nidx = POG_HEAD_NODE;
} else {
nidx ++;
}
u = ref_pognodev(g->nodes, nidx);
e = add_edge_pog(g, u, v, 1, 0);
if(nidx != POG_HEAD_NODE){
v = u;
u = ref_pognodev(g->nodes, POG_HEAD_NODE);
//e = add_edge_pog(g, u, v, 1, 0);
e = add_edge_pog(g, u, v, 0, 1);
}
break;
}
} else if(bt & POG_DP_BT_I){
u = add_node_pog(g, rid, x, get_rdbase_pog(g, rid, x));
e = add_edge_pog(g, u, v, 1, 0);
x --;
v = u;
} else {
}
if(x < 0){ // && nidx
if(bt == POG_DP_BT_I){
} else {
nidx = 0;
}
btx = 0;
continue;
}
if(bt & POG_DP_BT_I){
} else {
nidx = g->btxs->buffer[g->nodes->buffer[nidx].erev + vst];
}
#if DEBUG
if(nidx >= g->nodes->size){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
u = ref_pognodev(g->nodes, nidx);
if(x < u->rbeg || x >= u->rend){
xb = x;
while(x >= 0){
u = add_node_pog(g, rid, x, get_rdbase_pog(g, rid, x));
e = add_edge_pog(g, u, v, 1, 0);
x --;
v = u;
}
{
u = ref_pognodev(g->nodes, POG_HEAD_NODE);
e = add_edge_pog(g, u, v, 1, 0);
v = u;
}
break;
}
}
return xb;
}
static inline int align_rd_pog_core(POG *g, u2i rid, int W, int *xe){
pog_node_t *u, *v;
pog_edge_t *e;
u4i seqoff, seqlen, seqlex, slen, seqinc;
b4i score, x;
b2i *qp, *row;
__m128i SMASK;
u4i nidx, eidx, coff, roff, mnode;
u4i i, j;
seqoff = g->seqs->rdoffs->buffer[rid];
seqlen = g->seqs->rdlens->buffer[rid];
seqlex = roundup_times(seqlen, 8); // 128 = 8 * sizeof(b2i)
slen = seqlex >> 3;
seqinc = seqlex + 8; // seqlex, max_idx, beg, end, and paddings
#if __BYTE_ORDER == 1234
SMASK = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
#else
// TODO: need to test in BIG_ENDIAN
SMASK = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
#endif
// set query prof
// fit buffer to 16 bytes aligned
set_rd_query_prof(g, rid);
prepare_rd_align_pog(g, rid);
clear_u4v(g->stack);
push_u4v(g->stack, POG_HEAD_NODE);
score = POG_SCORE_MIN;
mnode = POG_TAIL_NODE; // point to POG_TAIL
x = seqlen - 1;
int my_print = (cns_debug > 3);
while(pop_u4v(g->stack, &nidx)){
u = ref_pognodev(g->nodes, nidx);
u->rmax = g->rows->buffer[u->roff + seqlex];
u->rbeg = g->rows->buffer[u->roff + seqlex + 1];
u->rend = g->rows->buffer[u->roff + seqlex + 2];
if(my_print){
fprintf(stderr, "NODEALN[%u:R%u:%u] %d-%d:%d:%d %d %d\n", nidx, u->rid, u->pos, u->rbeg, u->rend, u->rmax,
g->rows->buffer[u->roff + u->rmax], g->btds->buffer[u->coff + u->rmax] >> 2, g->btds->buffer[u->coff + u->rmax] & 0x03);
}
if(g->par->alnmode == POG_ALNMODE_OVERLAP){
if(u->rend >= seqlen && g->rows->buffer[u->roff + x] > score){ // overlap alignment
score = g->rows->buffer[u->roff + x];
mnode = nidx;
}
}
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
v = ref_pognodev(g->nodes, e->node);
qp = g->qprof->buffer + v->base * seqlex;
if(g->rowr->size){
coff = roff = g->rowr->buffer[-- g->rowr->size];
} else {
#if 0
if(g->rows->size + seqinc + g->rows->n_head > g->rows->cap){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
roff = g->rows->size;
inc_b2v(g->rows, seqinc);
coff = g->btds->size;
inc_b2v(g->btds, seqinc);
}
g->btxs->buffer[v->erev + v->vst] = nidx; // save backtrace nidx
sse_band_row_rdaln_pog(g, nidx, e->node, seqlen, v->vst, u->coff, coff, u->roff, roff, qp, W, -1);
if(v->vst){
merge_row_rdaln_pog(g, seqlen, coff, v->coff, roff, v->roff);
push_u4v(g->rowr, roff);
} else {
v->coff = coff;
v->roff = roff;
}
v->vst ++;
if(v->vst == v->nin){
push_u4v(g->stack, e->node);
#if DEBUG
} else if(v->vst > v->nin){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
#endif
}
}
push_u4v(g->rowr, u->roff);
{ // compress-copy btds into btvs
u->voff = g->btvs->size;
inc_u2v(g->btvs, u->rend - u->rbeg);
if(0){
for(i=u->rbeg;i<u->rend;i++){
g->btvs->buffer[u->voff + i - u->rbeg] = (u1i)g->btds->buffer[u->coff + i];
}
} else {
memcpy(g->btvs->buffer + u->voff, g->btds->buffer + u->coff + u->rbeg, (u->rend - u->rbeg) * sizeof(u2i));
}
}
}
v = ref_pognodev(g->nodes, POG_TAIL_NODE);
#if DEBUG
if(v->roff == 0){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(v->voff == 0){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
if(W == 0){
row = ref_b2v(g->rows, v->roff);
j = 0;
for(i=1;i<seqlen;i++){
if(row[i] > row[j]){
j = i;
}
}
v->rmax = row[seqlex] = j;
v->rbeg = row[seqlex + 1] = 0;
v->rend = row[seqlex + 2] = seqlex;
}
if(g->par->alnmode == POG_ALNMODE_OVERLAP){
if(g->rows->buffer[v->roff + v->rmax] + g->par->T > score){
score = g->rows->buffer[v->roff + (g->rows->buffer[v->roff + seqlex])];
mnode = POG_TAIL_NODE;
x = (g->rows->buffer[v->roff + seqlex]);
}
} else {
score = g->rows->buffer[v->roff + seqlen - 1];
mnode = POG_TAIL_NODE;
x = seqlen - 1;
}
if(x == Int(seqlen - 1) && mnode != POG_TAIL_NODE){
v = ref_pognodev(g->nodes, POG_TAIL_NODE);
g->btvs->buffer[v->voff + x - v->rbeg] = (2 | (v->vst << 2));
g->btxs->buffer[v->erev + v->vst] = mnode;
v->vst ++;
}
*xe = x;
return score;
}
static inline int align_rd_pog(POG *g, u2i rid){
int xb, xe, rlen, bad, xlen, score, W, mat;
rlen = g->seqs->rdlens->buffer[rid];
xlen = rlen / 8 * 8;
W = g->par->W? g->par->W : xlen;
xlen = num_min(xlen, g->par->Wmax);
mat = 0;
// try increase W when align score is low
while(1){
score = align_rd_pog_core(g, rid, W, &xe);
mat = _cal_matches_alignment_pog(g, rid, &xb, xe, &bad);
if(cns_debug > 1){
fprintf(stderr, "ALIGN[%03d] len=%u ref=%d,%d band=%d aligned=%d,%d tail=%d mat=%d,%0.3f score=%d\n", rid, g->seqs->rdlens->buffer[rid], g->sbegs->buffer[rid], g->sends->buffer[rid], W, xb + 1, xe + 1, bad, mat, 1.0 * mat / rlen, score);
}
if(rid == 0){
break;
}
if(mat >= rlen * g->par->W_mat_rate && bad < 50){
break;
}
if(W >= xlen) break;
W = num_min(W * 2, xlen);
}
xb = 0;
xb = _alignment2graph_pog(g, rid, xb, xe);
return score;
}
static inline u4i realign_msa_pog_core(POG *g, u4i ridx, int W){
u2i *bcnts[7], *hcovs, *bts, *bs, *bs1;
u1i *r, *s;
int wins[20], winl, winy, winx;
u4i i, *dps, f, h, e, *row1, *row2, max, bt;
int j, beg, end, off, x, y;
hcovs = g->hcovs->buffer;
for(i=0;i<7;i++){
clear_and_encap_u2v(g->bcnts[i], g->msa_len);
bcnts[i] = g->bcnts[i]->buffer;
memset(bcnts[i], 0, g->msa_len * sizeof(u2i));
}
for(i=0;i<g->seqs->nseq;i++){
if(i == ridx) continue;
r = ref_u1v(g->msa, g->msa_len * i);
for(j=0;j<(int)g->msa_len;j++){
bcnts[r[j]][j] ++;
}
}
for(i=0;i<g->msa_len;i++){
max = 0;
for(j=1;j<4;j++){
if(bcnts[j][i] > bcnts[max][i]){
max = j;
}
}
bcnts[5][i] = max;
bcnts[6][i] = bcnts[0][i] + bcnts[1][i] + bcnts[2][i] + bcnts[3][i];
}
s = ref_u1v(g->msa, g->msa_len * ridx);
winl = 20;
winx = 0;
winy = 0;
beg = -1;
memset(wins, 0, winl * sizeof(int));
if(0){
for(i=0;i<g->msa_len;i++){
max = bcnts[5][i];
winy -= wins[winx];
if(s[i] < 4 && ((hcovs[i] >= 4 && bcnts[s[i]][i] == 0))){
wins[winx] = 1;
} else {
wins[winx] = 0;
}
winy += wins[winx];
//fprintf(stderr, " -- hcovs = %d s[i] = %c:%d i = %u winy = %d in %s -- %s:%d --\n", hcovs[i], "ACGT-"[s[i]], bcnts[s[i]][i], i, winy, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
if(Int(i) > winl){
if(winy >= Int(0.9 * winl)){
if(beg < 0){
beg = i - winl;
}
} else {
if(beg >= 0){
end = i;
if(cns_debug > 1){
fprintf(stderr, " -- remove low quality fragment %d - %d: ", beg, end);
for(j=beg;j<end;j++){
fputc("ACGT-"[s[j]], stderr);
}
fputc('\n', stderr);
fflush(stderr);
}
for(j=beg;j<end;j++){
s[j] = 4;
}
}
beg = -1;
}
}
winx = (winx + 1) % winl;
}
}
clear_and_encap_b2v(g->btds, (g->msa_len + 1) * W * 2);
bts = (u2i*)g->btds->buffer;
clear_and_encap_u4v(g->btxs, (g->msa_len + 1) * W * 2);
dps = g->btxs->buffer;
memset(bts, 0, 2 * W * sizeof(u2i));
memset(dps, 0, 2 * W * sizeof(u4i));
bts += W * 2;
dps += W * 2;
for(i=0;i<g->msa_len;i++){
bs1 = bts + 2 * W * (Int(i) - 1);
bs = bts + 2 * W * i;
row1 = dps + 2 * W * (Int(i) - 1);
row2 = dps + 2 * W * i;
f = 0;
off = Int(i) - W;
if(Int(i) <= W){
beg = W - i;
} else {
beg = 0;
}
if(i + W - 1 >= g->msa_len){
end = W + g->msa_len - i;
} else {
end = W * 2 - 1;
}
if(s[i] == 4){
for(j=beg;j<end;j++){
e = row1[j + 1];
//h = row1[j] + 0 + 1;
h = row1[j];
if(h >= e){
bs[j] = 0;
} else {
h = e;
bs[j] = 1;
}
if(h >= f){
f = h;
} else {
h = f;
bs[j] = 2;
}
row2[j] = h;
}
} else {
for(j=beg;j<end;j++){
e = row1[j + 1];
//h = row1[j] + (s[i] == bcnts[5][off + j]? bcnts[s[i]][off + j] : 0);
if(Int(bcnts[s[i]][off + j] + 1) >= Int(bcnts[6][off + j] - bcnts[s[i]][off + j])){
h = row1[j] + 2 * bcnts[s[i]][off + j] - bcnts[6][off + j] + 1;
} else {
h = row1[j] + 1;
}
// bonus for putting homo together
if(i && bs1[j] == 0 && s[i] == s[i - 1]){
h += 1;
}
if(h >= e){
bs[j] = 0;
} else {
h = e;
bs[j] = 1;
}
if(h >= f){
f = h;
} else {
h = f;
bs[j] = 2;
}
row2[j] = h;
}
}
if(beg){
row2[beg - 1] = 0;
bs[beg - 1] = 0;
}
row2[end] = 0;
bs[end] = 0;
if(cns_debug > 3){
fprintf(stderr, "ROW[%u] '%c' %03d-%03d:", i, "ACGT-"[s[i]], beg, end);
for(j=beg;j<end;j++){
fprintf(stderr, " %5u[%d]", row2[j], bs[j]);
}
fprintf(stderr, "\n");
}
}
row2 = dps + 2 * W * (g->msa_len - 1);
max = row2[W];
r = ref_u1v(g->msa, g->msa_len * g->seqs->nseq);
x = y = g->msa_len - 1;
while(y >= 0 && x >= 0){
off = x - W;
bs = bts + 2 * W * x;
bt = bs[y - off];
switch(bt){
case 0: r[y] = s[x]; x --; y --; break;
case 1: x --; break;
case 2: r[y] = 4; y --; break;
}
if(cns_debug > 3 && bt){
fprintf(stderr, " -- x = %d y = %d bt = %d in %s -- %s:%d --\n", x, y, bt, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
}
}
if(cns_debug > 1){
fprintf(stderr, " -- max = %d x = %d y = %d in %s -- %s:%d --\n", max, x, y, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
}
while(y >= 0){
r[y] = 4;
y --;
}
return max;
}
static inline void realign_msa_pog(POG *g){
u4i ridx;
if(g->par->rW <= 0 || g->seqs->nseq < 3) return;
if(cns_debug > 1){
fprintf(stderr, "RAW MSA\n");
print_msa_pog(g, stderr);
}
for(ridx=0;ridx<g->seqs->nseq;ridx++){
if(cns_debug > 1){
fprintf(stderr, "REALIGN[%u]\n", ridx);
}
realign_msa_pog_core(g, ridx, g->par->rW);
if(cns_debug > 1){
print_msa_pog(g, stderr);
}
// update ridx alignment
memcpy(g->msa->buffer + g->msa_len * ridx, g->msa->buffer + g->msa_len * g->seqs->nseq, g->msa_len);
}
}
static const float homo_merging_cmp_norm[20] = {
0.95, 0.90, 0.80, 0.75, 0.70,
0.70, 0.65, 0.60, 0.60, 0.55,
0.50, 0.45, 0.40, 0.30, 0.20,
0.20, 0.20, 0.20, 0.20, 0.20
};
/*
static inline void dp_gen_cns_pog(POG *g){
pog_cns_t dps[2 * 5], *dp1, *dp2;
b2i *row1, *row2, *row3;
u1i *s, *r;
u4i mcnt;
int bcnts[5];
u2i *hcovs, *rexs;
u4i rid, beg, end, x, val, fidx, col, dep;
u4i mx, mi, i, j;
float DP_MIN, score0, score1;
DP_MIN = - (MAX_B4 >> 1);
mcnt = num_min(g->par->msa_min_cnt, g->seqs->nseq);
clear_and_encap_u1v(g->cbts, g->msa_len * 5);
// scan read end
clear_u4v(g->btxs);
clear_u2v(g->btvs); // whether read cov current column
hcovs = g->hcovs->buffer;
memset(hcovs, 0, g->msa_len * sizeof(u2i));
for(rid=0;rid<g->seqs->nseq;rid++){
push_u2v(g->btvs, MAX_U2);
r = ref_u1v(g->msa, g->msa_len * rid);
beg = 0;
while(beg < g->msa_len && r[beg] == 4) beg ++;
push_u4v(g->btxs, (0U << 31) | (rid << 16) | beg);
end = g->msa_len;
while(end > beg && r[end - 1] == 4) end --;
push_u4v(g->btxs, (1U << 31) | (rid << 16) | end);
for(i=beg;i<end;i++) hcovs[i] ++;
}
sort_array(g->btxs->buffer, g->btxs->size, u4i, num_cmpgt(a & 0xFFFF, b & 0xFFFF));
mx = 0;
rexs = g->btvs->buffer;
memset(dps, 0, 2 * 5 * sizeof(pog_cns_t));
fidx = 0;
clear_b2v(g->rows);
clear_u4v(g->rowr);
dp1 = dps;
inc_b2v(g->rows, g->par->cW * g->seqs->nseq);
for(i=0;i<5;i++){
dp1[i].coff = g->rows->size;
inc_b2v(g->rows, g->par->cW * g->seqs->nseq);
row1 = g->rows->buffer + dp1[i].coff;
memset(row1, 0, g->par->cW * g->seqs->nseq * sizeof(b2i));
}
dep = 0;
for(col=0;col<g->msa_len;col++){
fidx = !fidx;
dp1 = dps + fidx * 5;
dp2 = dps + (!fidx) * 5;
// collect read coverage
while(mx < g->btxs->size){
val = g->btxs->buffer[mx];
if((val & 0xFFFF) <= col){
rid = (val << 1) >> 17;
if(val & (1U << 31)){
#if DEBUG
if(rexs[rid] == MAX_U2){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
rexs[rid] = MAX_U2;
dep --;
} else {
#if DEBUG
if(rexs[rid] < MAX_U2){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
rexs[rid] = 0;
dep ++;
}
mx ++;
} else {
break;
}
}
for(i=0;i<5;i++){
dp2[i].max = DP_MIN;
dp2[i].bt = 4;
if(g->rowr->size){
dp2[i].coff = pop_u4v(g->rowr);
} else {
dp2.coff = g->rows->size;
inc_b2v(g->rows, g->par->cW * g->seqs->nseq);
}
}
row2 = g->rows->buffer; // temprary
if(dep < mcnt && !(g->par->refmode && rexs[0] != MAX_U2)){
//TODO: coding from here
mi = 4;
for(i=0;i<4;i++){
if(dp1[i] > dp1[mi]){
mi = i;
}
}
dp2[4] = dp1[mi];
bts[4] = mi;
} else {
for(i=0;i<=4;i++){
score0 = dp1[i];
for(j=0;j<=4;j++){
score1 = score0;
for(x=0;x<=4;x++){
if(j == 4){
if(x != 4){
score1 += bcnts[x] * g->par->I;
}
} else {
if(x == 4){
if(i == j){
score1 += bcnts[x] * g->par->H;
} else {
score1 += bcnts[x] * g->par->D;
}
} else if(x == j){
score1 += bcnts[x] * g->par->M;
} else {
score1 += bcnts[x] * g->par->X;
}
}
}
if(score1 > dp2[j]){
dp2[j] = score1;
bts[j] = i;
}
}
}
// Prior to the first read
for(rid=0;rid<g->seqs->nseq;rid++){
if(rexs[i] == 0){
continue;
}
r = ref_u1v(g->msa, g->msa_len * rid);
dp2[r[col]] += 0.1;
break;
}
}
for(i=0;i<=4;i++){
push_u1v(g->cbts, bts[i]);
}
if(cns_debug > 2){
fprintf(stderr, "[DPCNS%03u]\t[%2d, %2d, %2d, %2d, %2d]", col, bcnts[0], bcnts[1], bcnts[2], bcnts[3], bcnts[4]);
//fprintf(stderr, "\t[%0.1f, %0.1f, %0.1f, %0.1f, %0.1f]", dp1[0], dp1[1], dp1[2], dp1[3], dp1[4]);
fprintf(stderr, "\t[%0.1f:%d, %0.1f:%d, %0.1f:%d, %0.1f:%d, %0.1f:%d]\n", dp2[0], bts[0], dp2[1], bts[1], dp2[2], bts[2], dp2[3], bts[3], dp2[4], bts[4]);
}
}
// Backtrace consensus sequence
dp2 = dps + (!fidx) * 5;
mx = 4;
for(i=0;i<4;i++){
if(dp2[i] > dp2[mx]){
mx = i;
}
}
x = g->msa_len;
s = ref_u1v(g->msa, g->msa_len * g->seqs->nseq);
while(x){
x --;
s[x] = mx;
mx = g->cbts->buffer[x * 5 + mx];
}
for(i=0;i<g->msa_len;i++){
if(s[i] < 4){
bit2basebank(g->cns, s[i]);
}
}
}
*/
static inline void dp_call_cns_pog(POG *g){
u1i *s, *r, bts[5];
u4i mcnt;
int bcnts[5];
u2i *hcovs, *rexs;
u4i rid, beg, end, x, val, fidx, col, dep;
u4i mx, mi, i, j;
float dps[2 * 5], *dp1, *dp2, DP_MIN, score0, score1;
DP_MIN = - (MAX_B4 >> 1);
mcnt = num_min(g->par->msa_min_cnt, g->seqs->nseq);
clear_and_encap_u1v(g->cbts, g->msa_len * 5);
// scan read end
clear_u4v(g->btxs);
clear_u2v(g->btvs); // whether read cov current column
hcovs = g->hcovs->buffer;
memset(hcovs, 0, g->msa_len * sizeof(u2i));
for(rid=0;rid<g->seqs->nseq;rid++){
push_u2v(g->btvs, 0);
r = ref_u1v(g->msa, g->msa_len * rid);
beg = 0;
while(beg < g->msa_len && r[beg] == 4) beg ++;
push_u4v(g->btxs, (0U << 31) | (rid << 16) | beg);
end = g->msa_len;
while(end > beg && r[end - 1] == 4) end --;
push_u4v(g->btxs, (1U << 31) | (rid << 16) | end);
for(i=beg;i<end;i++) hcovs[i] ++;
}
sort_array(g->btxs->buffer, g->btxs->size, u4i, num_cmpgt(a & 0xFFFF, b & 0xFFFF));
memset(dps, 0, 2 * 5 * sizeof(float));
fidx = 0;
dep = 0;
mx = 0;
rexs = g->btvs->buffer;
for(col=0;col<g->msa_len;col++){
fidx = !fidx;
dp1 = dps + fidx * 5;
dp2 = dps + (!fidx) * 5;
// collect read coverage
while(mx < g->btxs->size){
val = g->btxs->buffer[mx];
if((val & 0xFFFF) <= col){
rid = (val << 1) >> 17;
if(val & (1U << 31)){
#if DEBUG
if(rexs[rid] == 0){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
rexs[rid] = 0;
dep --;
} else {
#if DEBUG
if(rexs[rid] == 1){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
#endif
rexs[rid] = 1;
dep ++;
}
mx ++;
} else {
break;
}
}
bts[0] = bts[1] = bts[2] = bts[3] = bts[4] = 4;
dp2[0] = DP_MIN;
dp2[1] = DP_MIN;
dp2[2] = DP_MIN;
dp2[3] = DP_MIN;
dp2[4] = DP_MIN;
memset(bcnts, 0, sizeof(int) * 5);
for(rid=0;rid<g->seqs->nseq;rid++){
if(rexs[rid] == 0){
continue;
}
r = ref_u1v(g->msa, g->msa_len * rid);
bcnts[r[col]] ++;
}
if(dep < mcnt && !(g->par->refmode && rexs[0])){
mi = 4;
for(i=0;i<4;i++){
if(dp1[i] > dp1[mi]){
mi = i;
}
}
dp2[4] = dp1[mi];
bts[4] = mi;
} else {
for(i=0;i<=4;i++){
score0 = dp1[i];
for(j=0;j<=4;j++){
score1 = score0;
for(x=0;x<=4;x++){
if(j == 4){
if(x != 4){
score1 += bcnts[x] * g->par->I;
}
} else {
if(x == 4){
if(i == j){
score1 += bcnts[x] * g->par->H;
} else {
score1 += bcnts[x] * g->par->D;
}
} else if(x == j){
score1 += bcnts[x] * g->par->M;
} else {
score1 += bcnts[x] * g->par->X;
}
}
}
if(score1 > dp2[j]){
dp2[j] = score1;
bts[j] = i;
}
}
}
// Prior to the first read
for(rid=0;rid<g->seqs->nseq;rid++){
if(rexs[i] == 0){
continue;
}
r = ref_u1v(g->msa, g->msa_len * rid);
dp2[r[col]] += 0.1;
break;
}
}
for(i=0;i<=4;i++){
push_u1v(g->cbts, bts[i]);
}
if(cns_debug > 2){
fprintf(stderr, "[DPCNS%03u]\t[%2d, %2d, %2d, %2d, %2d]", col, bcnts[0], bcnts[1], bcnts[2], bcnts[3], bcnts[4]);
//fprintf(stderr, "\t[%0.1f, %0.1f, %0.1f, %0.1f, %0.1f]", dp1[0], dp1[1], dp1[2], dp1[3], dp1[4]);
fprintf(stderr, "\t[%0.1f:%d, %0.1f:%d, %0.1f:%d, %0.1f:%d, %0.1f:%d]\n", dp2[0], bts[0], dp2[1], bts[1], dp2[2], bts[2], dp2[3], bts[3], dp2[4], bts[4]);
}
}
// Backtrace consensus sequence
dp2 = dps + (!fidx) * 5;
mx = 4;
for(i=0;i<4;i++){
if(dp2[i] > dp2[mx]){
mx = i;
}
}
x = g->msa_len;
s = ref_u1v(g->msa, g->msa_len * g->seqs->nseq);
while(x){
x --;
s[x] = mx;
mx = g->cbts->buffer[x * 5 + mx];
}
for(i=0;i<g->msa_len;i++){
if(s[i] < 4){
bit2basebank(g->cns, s[i]);
}
}
}
static inline void gen_cns_pog(POG *g){
u1i *s, *r;
u4i idx, lst, lsv, lsx, rid, max, i, mcnt, min_cnt, max_cnt, runlen, cnl, beg, end, cbeg, cend;
u4i freqs[5][11], fmax1, fmax2, tot, cut, *hcnts;
u2i cnts[5], *bls, cl, bx, bl, bm, dl, dm, *hcovs, corr;
//u2i *vsts;
float fl, fx1, fx2, norm, flush;
mcnt = num_min(g->par->msa_min_cnt, g->seqs->nseq);
min_cnt = num_max(mcnt, UInt(g->par->msa_min_freq * g->seqs->nseq));
max_cnt = g->seqs->nseq - min_cnt;
s = ref_u1v(g->msa, g->msa_len * g->seqs->nseq);
memset(s, 4, g->msa_len);
clear_basebank(g->cns);
hcovs = g->hcovs->buffer;
bls = hcovs + 8 + g->msa_len; // bls is all zeros, see end_pog
if(g->par->rW){
memset(hcovs, 0, g->msa_len * sizeof(u2i));
for(rid=0;rid<g->seqs->nseq;rid++){
r = ref_u1v(g->msa, g->msa_len * rid);
beg = 0;
while(beg < g->msa_len && r[beg] == 4) beg ++;
end = g->msa_len;
while(end > beg && r[end - 1] == 4) end --;
for(i=beg;i<end;i++) hcovs[i] ++;
}
}
// revise mcnt
if(g->par->refmode){
} else if(0){
tot = 0;
clear_and_encap_u4v(g->btxs, mcnt + 1);
hcnts = g->btxs->buffer;
//hcnts = alloca((mcnt + 1) * sizeof(u4i));
memset(hcnts, 0, (mcnt + 1) * sizeof(u4i));
for(i=0;i<g->msa_len;i++){
tot += (hcovs[i] + 1) * (hcovs[i]) / 2;
hcnts[num_min(hcovs[i], mcnt)] += (hcovs[i] + 1) * (hcovs[i]) / 2;
}
cut = tot * 0.8;
tot = 0;
for(i=mcnt;i>1;i--){
tot += hcnts[i];
if(tot >= cut){
break;
}
}
if(i < mcnt){
if(cns_debug > 1){
fprintf(stderr, " Revise mcnt %u -> %u\n", mcnt, i); fflush(stderr);
}
mcnt = i;
min_cnt = i;
max_cnt = g->seqs->nseq - min_cnt;
}
}
// gen cns
cbeg = 0;
cend = g->msa_len;
if(g->par->refmode){
rid = 0;
{
r = ref_u1v(g->msa, g->msa_len * rid);
cbeg = 0;
while(cbeg < g->msa_len && r[cbeg] == 4){ hcovs[cbeg] = 0; cbeg ++; }
end = g->msa_len;
while(cend > cbeg && r[cend - 1] == 4){ cend --; hcovs[cend] = 0; }
}
} else {
cbeg = 0;
while(cbeg < g->msa_len && hcovs[cbeg] < min_cnt) cbeg ++;
cend = g->msa_len;
while(cend > cbeg && hcovs[cend - 1] < min_cnt) cend --;
}
fmax1 = 5;
fmax2 = 10;
if(cns_debug > 1){
for(i=0;i<fmax1;i++){
memset(freqs[i], 0, (fmax2 + 1) * sizeof(u4i));
}
}
lst = lsv = MAX_U4;
lsx = 0;
runlen = 0;
cl = 0;
cnl = 0;
end = 0;
flush = 0;
for(idx=cbeg;idx<=cend;idx++){
if(idx < cend){
flush = 0;
memset(cnts, 0, 5 * 2);
if(g->par->refmode){
if((max = g->msa->buffer[g->msa_len * 0 + idx]) == 4){
max = 0;
} else {
if(hcovs[idx] < min_cnt){
flush = 1;
}
}
} else {
max = 0;
}
for(rid=0;rid<g->seqs->nseq;rid++){
cnts[g->msa->buffer[g->msa_len * rid + idx]] ++;
}
for(i=0;i<4;i++){
if(cnts[i] > cnts[max]){
max = i;
}
}
if(flush){
s[idx] = max;
end = idx;
} else if(hcovs[idx] >= min_cnt && cnts[max] >= mcnt && cnts[4] - (g->seqs->nseq - hcovs[idx]) < (g->seqs->nseq - cnts[4])){
s[idx] = max;
end = idx;
flush = 1;
}
} else {
if(end + 1 == idx){
end = idx;
}
max = 4;
flush = 1;
}
if(flush){
if(lst == MAX_U4){ lst = idx; lsv = idx; }
if(idx == cend || s[lst] != max){
runlen = 0;
for(rid=0;rid<g->seqs->nseq;rid++){
r = ref_u1v(g->msa, g->msa_len * rid);
bl = 0;
for(i=lsv;i<end;i++){
if(r[i] == s[lst]) bl ++;
}
bls[rid] = bl;
runlen += bl;
}
sort_array(bls, g->seqs->nseq, u2i, num_cmpgt(b, a));
bx = 1;
bl = bls[0];
bm = 1;
dl = 0;
dm = 0;
for(rid=1;rid<=hcovs[lst];rid++){
if(rid < hcovs[lst] && bls[rid] == bls[rid - 1]){
bx ++;
} else {
if(cns_debug > 1 && cl <= fmax1 && bls[rid - 1] <= fmax2){
freqs[cl - 1][bls[rid - 1]] += bx;
}
if(bx > bm){
if(dm < bm){
dm = bm;
dl = bl;
}
bm = bx;
bl = bls[rid - 1];
} else if(bx > dm){
dm = bx;
dl = bls[rid - 1];
}
bx = 1;
}
}
fl = runlen * 1.0 / hcovs[lst];
if(cns_debug > 1){
fprintf(stderr, "HOMO[%3u] %4u\t%c\t%u\t%u:%u\t%u:%u\t%u\t%u\t%0.2f", lst, cnl, bit_base_table[s[lst]], cl, bl, bm, dl, dm, runlen, hcovs[lst], fl);
if(cl != bl){
fprintf(stderr, " *\n");
} else {
fprintf(stderr, "\n");
}
}
if(cl != bl){
if(bl > cl + 1 && bl > Int(fl) + 1) bl = Int(fl) + 1; // Don't correct too much
if(bl + 1 < cl && (float)bl < fl) bl = cl - 1;
if(bl < cl && cl == dl && dm >= Int(0.8 * bm)){ // prefer increase runlen
norm = 1.0 / homo_merging_cmp_norm[num_min(cl, 19)];
} else {
norm = homo_merging_cmp_norm[num_min(bl, 19)];
}
fx1 = num_abs(cl - fl);
fx2 = num_abs(bl - fl);
if(norm * fx2 <= fx1){
corr = bl;
if(cns_debug > 1){
fprintf(stderr, "CORR[%03u] %4u\t%c\t%u->%u\n", lst, cnl, bit_base_table[s[lst]], cl, bl);
}
} else {
corr = cl;
}
} else {
corr = cl;
}
cnl += cl;
for(i=0;i<corr;i++){
bit2basebank(g->cns, s[lst]);
if(i){
s[lst + i] = s[lst];
}
}
for(i+=lst;i<end;i++){
s[i] = 4;
}
runlen = 0;
lst = idx;
lsv = lsx;
lsx = idx;
cl = 1;
} else {
cl ++;
lsx = idx;
}
}
}
if(cns_debug > 1){
for(cl=0;cl<fmax1;cl++){
fprintf(stderr, "RUNLEN[%u]", cl + 1);
for(bl=0;bl<=fmax2;bl++){
fprintf(stderr, "%10u,", freqs[cl][bl]);
}
fprintf(stderr, "\n");
}
}
}
static inline void check_dup_edges_pog(POG *g){
pog_node_t *u;
pog_edge_t *e;
u32hash *hash;
u4i nidx, eidx, *t;
int exists;
hash = init_u32hash(13);
for(nidx=0;nidx<g->nodes->size;nidx++){
u = ref_pognodev(g->nodes, nidx);
clear_u32hash(hash);
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
t = prepare_u32hash(hash, e->node, &exists);
if(exists){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
} else {
*t = e->node;
}
}
}
free_u32hash(hash);
}
static inline void end_pog(POG *g){
pog_node_t *u, *v, *x;
pog_edge_t *e;
u1i *r;
u4i i, ridx, nidx, eidx, xidx, moff, ready, beg, end, reflen, margin;
int score;
clear_basebank(g->cns);
if(g->seqs->nseq == 0) return;
score = align_rd_pog(g, 0);
if(g->par->refmode){
// add edges to refbeg and refend, to make sure reads can be aligned quickly and correctly within small bandwidth
u = ref_pognodev(g->nodes, POG_HEAD_NODE);
reflen = g->seqs->rdlens->buffer[0];
margin = 5;
for(i=0;i<g->sbegs->size;i++){
if(g->sbegs->buffer[i] < margin || g->sbegs->buffer[i] + margin >= reflen) continue;
nidx = POG_TAIL_NODE + 1 + (reflen - 1 - g->sbegs->buffer[i]);
v = ref_pognodev(g->nodes, nidx);
e = add_edge_pog(g, u, v, 0, 1);
}
v = ref_pognodev(g->nodes, POG_TAIL_NODE);
for(i=0;i<g->sends->size;i++){
if(g->sends->buffer[i] < margin || g->sends->buffer[i] + margin >= reflen) continue;
nidx = POG_TAIL_NODE + 1 + (reflen - g->sends->buffer[i]);
u = ref_pognodev(g->nodes, nidx);
e = add_edge_pog(g, u, v, 0, 1);
}
}
for(ridx=1;ridx<g->seqs->nseq;ridx++){
score = align_rd_pog(g, ridx);
}
for(nidx=0;nidx<g->nodes->size;nidx++){
u = ref_pognodev(g->nodes, nidx);
u->vst = 0;
u->aux = 0;
u->coff = 0;
u->erev = 0;
}
// calcuate msa_len
clear_u4v(g->stack);
push_u4v(g->stack, POG_HEAD_NODE);
nidx = POG_HEAD_NODE;
while(pop_u4v(g->stack, &nidx)){
u = ref_pognodev(g->nodes, nidx);
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
v = ref_pognodev(g->nodes, e->node);
if(u->coff + 1 > v->coff){
v->coff = u->coff + 1;
}
v->vst ++;
if(v->vst > v->nin){
print_vstdot_pog(g, "1.dot");
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
check_dup_edges_pog(g);
abort();
}
}
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
v = ref_pognodev(g->nodes, e->node);
if(v->aux) continue; // already pushed
if(v->vst > v->nin){
print_vstdot_pog(g, "1.dot");
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
check_dup_edges_pog(g);
abort();
}
if(v->vst == v->nin){
ready = 1;
{
xidx = v->aligned;
moff = v->coff;
while(xidx != e->node){
x = ref_pognodev(g->nodes, xidx);
if(x->nin > x->vst){
ready = 0;
break;
}
if(x->coff > moff){
moff = x->coff;
}
xidx = x->aligned;
}
}
if(ready){
v->coff = moff;
v->aux = 1;
push_u4v(g->stack, e->node);
xidx = v->aligned;
while(xidx != e->node){
x = ref_pognodev(g->nodes, xidx);
x->coff = moff;
if(x->edge){
push_u4v(g->stack, xidx);
x->aux = 1;
}
xidx = x->aligned;
}
}
}
}
}
if(nidx != POG_TAIL_NODE){
fprint_dot_pog(g, "1.dot", NULL);
print_seqs_pog(g, "1.seqs.fa", NULL);
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
// generate MSA
u = ref_pognodev(g->nodes, POG_TAIL_NODE);
g->msa_len = u->coff;
for(nidx=0;nidx<g->nodes->size;nidx++){
u = ref_pognodev(g->nodes, nidx);
u->vst = 0;
}
clear_and_encap_u1v(g->msa, g->msa_len * (g->seqs->nseq + 2));
g->msa->size = g->msa_len * (g->seqs->nseq + 1);
memset(g->msa->buffer, 4, g->msa->size);
clear_u4v(g->stack);
push_u4v(g->stack, POG_HEAD_NODE);
while(pop_u4v(g->stack, &nidx)){
u = ref_pognodev(g->nodes, nidx);
eidx = u->edge;
while(eidx){
e = ref_pogedgev(g->edges, eidx);
eidx = e->next;
v = ref_pognodev(g->nodes, e->node);
v->vst ++;
if(v->vst >= v->nin){
ready = 1;
xidx = v->aligned;
while(xidx != e->node){
x = ref_pognodev(g->nodes, xidx);
if(x->nin > x->vst){
ready = 0;
break;
}
xidx = x->aligned;
}
if(ready){
xidx = e->node;
do {
x = ref_pognodev(g->nodes, xidx);
if(xidx != POG_TAIL_NODE){
set_u1v(g->msa, x->rid * g->msa_len + x->coff, (x->base) & 0x03);
}
if(x->edge){
push_u4v(g->stack, xidx);
}
xidx = x->aligned;
} while(xidx != e->node);
}
}
}
}
clear_and_encap_u2v(g->hcovs, g->msa_len + 8 + g->seqs->nseq);
memset(g->hcovs->buffer, 0, (g->msa_len + 8 + g->seqs->nseq) * sizeof(u2i));
for(ridx=0;ridx<g->seqs->nseq;ridx++){
r = ref_u1v(g->msa, g->msa_len * ridx);
beg = 0;
while(beg < g->msa_len && r[beg] == 4) beg ++;
end = g->msa_len;
while(end && r[end - 1] == 4) end --;
for(i=beg;i<end;i++) g->hcovs->buffer[i] ++;
}
// realignment
realign_msa_pog(g);
//// realignment again
//realign_msa_pog(g);
// gen consensus line
if(g->par->cnsmode == 1){
dp_call_cns_pog(g);
} else {
gen_cns_pog(g);
}
if(cns_debug > 1){
print_msa_pog(g, stderr);
}
if(0){
fprintf(stderr, " -- seqs\t%llu --\n", (u8i)g->seqs->rdseqs->cap); fflush(stderr);
fprintf(stderr, " -- nodes\t%llu --\n", (u8i)g->nodes->cap); fflush(stderr);
fprintf(stderr, " -- edges\t%llu --\n", (u8i)g->edges->cap); fflush(stderr);
fprintf(stderr, " -- rows\t%llu/%llu --\n", (u8i)g->rows->size, (u8i)g->rows->cap); fflush(stderr);
fprintf(stderr, " -- btds\t%llu/%llu --\n", (u8i)g->rows->size, (u8i)g->btds->cap); fflush(stderr);
fprintf(stderr, " -- btxs\t%llu/%llu --\n", (u8i)g->btxs->size, (u8i)g->btxs->cap); fflush(stderr);
fprintf(stderr, " -- cns\t%llu --\n", (u8i)g->cns->cap); fflush(stderr);
}
}
#endif
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/Velcon-Zheng/wtdbg2.git
git@gitee.com:Velcon-Zheng/wtdbg2.git
Velcon-Zheng
wtdbg2
wtdbg2
master

搜索帮助