1 Star 0 Fork 0

Velcon-Zheng/bowtie2

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
aligner_sw.cpp 102.12 KB
一键复制 编辑 原始数据 按行查看 历史
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243
/*
* Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
#include <limits>
// -- BTL remove --
//#include <stdlib.h>
//#include <sys/time.h>
// -- --
#include "aligner_sw.h"
#include "aligner_result.h"
#include "search_globals.h"
#include "scoring.h"
#include "mask.h"
/**
* Initialize with a new read.
*/
void SwAligner::initRead(
const BTDnaString& rdfw, // forward read sequence
const BTDnaString& rdrc, // revcomp read sequence
const BTString& qufw, // forward read qualities
const BTString& qurc, // reverse read qualities
size_t rdi, // offset of first read char to align
size_t rdf, // offset of last read char to align
const Scoring& sc) // scoring scheme
{
assert_gt(rdf, rdi);
int nceil = sc.nCeil.f<int>((double)rdfw.length());
rdfw_ = &rdfw; // read sequence
rdrc_ = &rdrc; // read sequence
qufw_ = &qufw; // read qualities
qurc_ = &qurc; // read qualities
rdi_ = rdi; // offset of first read char to align
rdf_ = rdf; // offset of last read char to align
sc_ = &sc; // scoring scheme
nceil_ = nceil; // max # Ns allowed in ref portion of aln
readSse16_ = false; // true -> sse16 from now on for this read
initedRead_ = true;
#ifndef NO_SSE
sseU8fwBuilt_ = false; // built fw query profile, 8-bit score
sseU8rcBuilt_ = false; // built rc query profile, 8-bit score
sseI16fwBuilt_ = false; // built fw query profile, 16-bit score
sseI16rcBuilt_ = false; // built rc query profile, 16-bit score
#endif
if(dpLog_ != NULL) {
if(!firstRead_) {
(*dpLog_) << '\n';
}
(*dpLog_) << rdfw.toZBuf() << '\t' << qufw.toZBuf();
}
firstRead_ = false;
}
/**
* Initialize with a new alignment problem.
*/
void SwAligner::initRef(
bool fw, // whether to forward or revcomp read is aligning
TRefId refidx, // id of reference aligned against
const DPRect& rect, // DP rectangle
char *rf, // reference sequence
size_t rfi, // offset of first reference char to align to
size_t rff, // offset of last reference char to align to
TRefOff reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
TAlScore minsc, // minimum score
bool enable8, // use 8-bit SSE if possible?
size_t cminlen, // minimum length for using checkpointing scheme
size_t cpow2, // interval b/t checkpointed diags; 1 << this
bool doTri, // triangular mini-fills?
bool extend) // is this a seed extension?
{
size_t readGaps = sc.maxReadGaps(minsc, rdfw_->length());
size_t refGaps = sc.maxRefGaps(minsc, rdfw_->length());
assert_geq(readGaps, 0);
assert_geq(refGaps, 0);
assert_gt(rff, rfi);
rdgap_ = readGaps; // max # gaps in read
rfgap_ = refGaps; // max # gaps in reference
state_ = STATE_INITED;
fw_ = fw; // orientation
rd_ = fw ? rdfw_ : rdrc_; // read sequence
qu_ = fw ? qufw_ : qurc_; // quality sequence
refidx_ = refidx; // id of reference aligned against
rf_ = rf; // reference sequence
rfi_ = rfi; // offset of first reference char to align to
rff_ = rff; // offset of last reference char to align to
reflen_ = reflen; // length of entire reference sequence
rect_ = &rect; // DP rectangle
minsc_ = minsc; // minimum score
cural_ = 0; // idx of next alignment to give out
initedRef_ = true; // indicate we've initialized the ref portion
enable8_ = enable8; // use 8-bit SSE if possible?
extend_ = extend; // true iff this is a seed extension
cperMinlen_ = cminlen; // reads shorter than this won't use checkpointer
cperPerPow2_ = cpow2; // interval b/t checkpointed diags; 1 << this
cperEf_ = true; // whether to checkpoint H, E, and F
cperTri_ = doTri; // triangular mini-fills?
bter_.initRef(
fw_ ? rdfw_->buf() : // in: read sequence
rdrc_->buf(),
fw_ ? qufw_->buf() : // in: quality sequence
qurc_->buf(),
rd_->length(), // in: read sequence length
rf_ + rfi_, // in: reference sequence
rff_ - rfi_, // in: in-rectangle reference sequence length
reflen, // in: total reference sequence length
refidx_, // in: reference id
rfi_, // in: reference offset
fw_, // in: orientation
rect_, // in: DP rectangle
&cper_, // in: checkpointer
*sc_, // in: scoring scheme
nceil_); // in: N ceiling
// Record the reference sequence in the log
if(dpLog_ != NULL) {
(*dpLog_) << '\t';
(*dpLog_) << refidx_ << ',';
(*dpLog_) << reflen_ << ',';
(*dpLog_) << minsc_ << ',';
(*dpLog_) << (fw ? '+' : '-') << ',';
rect_->write(*dpLog_);
(*dpLog_) << ',';
for(TRefOff i = rfi_; i < rff_; i++) {
(*dpLog_) << mask2dna[(int)rf[i]];
}
}
}
/**
* Given a read, an alignment orientation, a range of characters in a referece
* sequence, and a bit-encoded version of the reference, set up and execute the
* corresponding dynamic programming problem.
*
* The caller has already narrowed down the relevant portion of the reference
* using, e.g., the location of a seed hit, or the range of possible fragment
* lengths if we're searching for the opposite mate in a pair.
*/
void SwAligner::initRef(
bool fw, // whether to forward or revcomp read is aligning
TRefId refidx, // reference aligned against
const DPRect& rect, // DP rectangle
const BitPairReference& refs, // Reference strings
TRefOff reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
TAlScore minsc, // minimum score
bool enable8, // use 8-bit SSE if possible?
size_t cminlen, // minimum length for using checkpointing scheme
size_t cpow2, // interval b/t checkpointed diags; 1 << this
bool doTri, // triangular mini-fills?
bool extend, // true iff this is a seed extension
size_t upto, // count the number of Ns up to this offset
size_t& nsUpto) // output: the number of Ns up to 'upto'
{
TRefOff rfi = rect.refl;
TRefOff rff = rect.refr + 1;
assert_gt(rff, rfi);
// Capture an extra reference character outside the rectangle so that we
// can check matches in the next column over to the right
rff++;
// rflen = full length of the reference substring to consider, including
// overhang off the boundaries of the reference sequence
const size_t rflen = (size_t)(rff - rfi);
// Figure the number of Ns we're going to add to either side
size_t leftNs =
(rfi >= 0 ? 0 : (size_t)std::abs(static_cast<long>(rfi)));
leftNs = min(leftNs, rflen);
size_t rightNs =
(rff <= (TRefOff)reflen ? 0 : (size_t)std::abs(static_cast<long>(rff - reflen)));
rightNs = min(rightNs, rflen);
// rflenInner = length of just the portion that doesn't overhang ref ends
assert_geq(rflen, leftNs + rightNs);
const size_t rflenInner = rflen - (leftNs + rightNs);
#ifndef NDEBUG
bool haveRfbuf2 = false;
EList<char> rfbuf2(rflen);
// This is really slow, so only do it some of the time
if((rand() % 10) == 0) {
TRefOff rfii = rfi;
for(size_t i = 0; i < rflen; i++) {
if(rfii < 0 || (TRefOff)rfii >= reflen) {
rfbuf2.push_back(4);
} else {
rfbuf2.push_back(refs.getBase(refidx, (size_t)rfii));
}
rfii++;
}
haveRfbuf2 = true;
}
#endif
// rfbuf_ = uint32_t list large enough to accommodate both the reference
// sequence and any Ns we might add to either side.
rfwbuf_.resize((rflen + 16) / 4);
int offset = refs.getStretch(
rfwbuf_.ptr(), // buffer to store words in
refidx, // which reference
(rfi < 0) ? 0 : (size_t)rfi, // starting offset (can't be < 0)
rflenInner // length to grab (exclude overhang)
ASSERT_ONLY(, tmp_destU32_));// for BitPairReference::getStretch()
assert_leq(offset, 16);
rf_ = (char*)rfwbuf_.ptr() + offset;
// Shift ref chars away from 0 so we can stick Ns at the beginning
if(leftNs > 0) {
// Slide everyone down
for(size_t i = rflenInner; i > 0; i--) {
rf_[i+leftNs-1] = rf_[i-1];
}
// Add Ns
for(size_t i = 0; i < leftNs; i++) {
rf_[i] = 4;
}
}
if(rightNs > 0) {
// Add Ns to the end
for(size_t i = 0; i < rightNs; i++) {
rf_[i + leftNs + rflenInner] = 4;
}
}
#ifndef NDEBUG
// Sanity check reference characters
for(size_t i = 0; i < rflen; i++) {
assert(!haveRfbuf2 || rf_[i] == rfbuf2[i]);
assert_range(0, 4, (int)rf_[i]);
}
#endif
// Count Ns and convert reference characters into A/C/G/T masks. Ambiguous
// nucleotides (IUPAC codes) have more than one mask bit set. If a
// reference scanner was provided, use it to opportunistically resolve seed
// hits.
nsUpto = 0;
for(size_t i = 0; i < rflen; i++) {
// rf_[i] gets mask version of refence char, with N=16
if(i < upto && rf_[i] > 3) {
nsUpto++;
}
rf_[i] = (1 << rf_[i]);
}
// Correct for having captured an extra reference character
rff--;
initRef(
fw, // whether to forward or revcomp read is aligning
refidx, // id of reference aligned against
rect, // DP rectangle
rf_, // reference sequence, wrapped up in BTString object
0, // use the whole thing
(size_t)(rff - rfi), // ditto
reflen, // reference length
sc, // scoring scheme
minsc, // minimum score
enable8, // use 8-bit SSE if possible?
cminlen, // minimum length for using checkpointing scheme
cpow2, // interval b/t checkpointed diags; 1 << this
doTri, // triangular mini-fills?
extend); // true iff this is a seed extension
}
/**
* Given a read, an alignment orientation, a range of characters in a
* referece sequence, and a bit-encoded version of the reference, set up
* and execute the corresponding ungapped alignment problem. There can
* only be one solution.
*
* The caller has already narrowed down the relevant portion of the
* reference.
*
* Does not handle the case where we'd like to scan a large section of the
* reference for an ungapped alignment, e.g., if we're searching for the
* opposite mate after finding an alignment for the anchor mate.
*/
int SwAligner::ungappedAlign(
const BTDnaString& rd, // read sequence (could be RC)
const BTString& qu, // qual sequence (could be rev)
const Coord& coord, // coordinate aligned to
const BitPairReference& refs, // Reference strings
size_t reflen, // length of reference sequence
const Scoring& sc, // scoring scheme
bool ohang, // allow overhang?
TAlScore minsc, // minimum score
SwResult& res) // put alignment result here
{
const size_t len = rd.length();
int nceil = sc.nCeil.f<int>((double)len);
int ns = 0;
TRefOff rfi = coord.off();
TRefOff rff = rfi + (TRefOff)len;
TRefId refidx = coord.ref();
assert_gt(rff, rfi);
// Figure the number of Ns we're going to add to either side
size_t leftNs = 0;
if(rfi < 0) {
if(ohang) {
leftNs = (size_t)(-rfi);
} else {
return 0;
}
}
size_t rightNs = 0;
if(rff > (TRefOff)reflen) {
if(ohang) {
rightNs = (size_t)(rff - (TRefOff)reflen);
} else {
return 0;
}
}
if((leftNs + rightNs) > (size_t)nceil) {
return 0;
}
// rflenInner = length of just the portion that doesn't overhang ref ends
assert_geq(len, leftNs + rightNs);
const size_t rflenInner = len - (leftNs + rightNs);
#ifndef NDEBUG
bool haveRfbuf2 = false;
EList<char> rfbuf2(len);
// This is really slow, so only do it some of the time
if((rand() % 10) == 0) {
TRefOff rfii = rfi;
for(size_t i = 0; i < len; i++) {
if(rfii < 0 || (size_t)rfii >= reflen) {
rfbuf2.push_back(4);
} else {
rfbuf2.push_back(refs.getBase(refidx, (size_t)rfii));
}
rfii++;
}
haveRfbuf2 = true;
}
#endif
// rfbuf_ = uint32_t list large enough to accommodate both the reference
// sequence and any Ns we might add to either side.
rfwbuf_.resize((len + 16) / 4);
int offset = refs.getStretch(
rfwbuf_.ptr(), // buffer to store words in
refidx, // which reference
(rfi < 0) ? 0 : (size_t)rfi, // starting offset (can't be < 0)
rflenInner // length to grab (exclude overhang)
ASSERT_ONLY(, tmp_destU32_));// for BitPairReference::getStretch()
assert_leq(offset, 16);
rf_ = (char*)rfwbuf_.ptr() + offset;
// Shift ref chars away from 0 so we can stick Ns at the beginning
if(leftNs > 0) {
// Slide everyone down
for(size_t i = rflenInner; i > 0; i--) {
rf_[i+leftNs-1] = rf_[i-1];
}
// Add Ns
for(size_t i = 0; i < leftNs; i++) {
rf_[i] = 4;
}
}
if(rightNs > 0) {
// Add Ns to the end
for(size_t i = 0; i < rightNs; i++) {
rf_[i + leftNs + rflenInner] = 4;
}
}
#ifndef NDEBUG
// Sanity check reference characters
for(size_t i = 0; i < len; i++) {
assert(!haveRfbuf2 || rf_[i] == rfbuf2[i]);
assert_range(0, 4, (int)rf_[i]);
}
#endif
// Count Ns and convert reference characters into A/C/G/T masks. Ambiguous
// nucleotides (IUPAC codes) have more than one mask bit set. If a
// reference scanner was provided, use it to opportunistically resolve seed
// hits.
TAlScore score = 0;
res.alres.reset();
size_t rowi = 0;
size_t rowf = len-1;
if(sc.monotone) {
for(size_t i = 0; i < len; i++) {
// rf_[i] gets mask version of refence char, with N=16
assert_geq(qu[i], 33);
score += sc.score(rd[i], (int)(1 << rf_[i]), qu[i] - 33, ns);
assert_leq(score, 0);
if(score < minsc || ns > nceil) {
// Fell below threshold
return 0;
}
}
// Got a result! Fill in the rest of the result object.
} else {
// Definitely ways to short-circuit this. E.g. if diff between cur
// score and minsc can't be met by matches.
TAlScore floorsc = 0;
TAlScore scoreMax = floorsc;
size_t lastfloor = 0;
rowi = MAX_SIZE_T;
size_t sols = 0;
for(size_t i = 0; i < len; i++) {
score += sc.score(rd[i], (int)(1 << rf_[i]), qu[i] - 33, ns);
if(score >= minsc && score >= scoreMax) {
scoreMax = score;
rowf = i;
if(rowi != lastfloor) {
rowi = lastfloor;
sols++;
}
}
if(score <= floorsc) {
score = floorsc;
lastfloor = i+1;
}
}
if(ns > nceil || scoreMax < minsc) {
// Too many Ns
return 0;
}
if(sols > 1) {
// >1 distinct solution in this diag; defer to DP aligner
return -1;
}
score = scoreMax;
// Got a result! Fill in the rest of the result object.
}
// Now fill in the edits
assert_geq(rowf, rowi);
EList<Edit>& ned = res.alres.ned();
size_t refns = 0;
ASSERT_ONLY(BTDnaString refstr);
for(size_t i = rowi; i <= rowf; i++) {
ASSERT_ONLY(refstr.append((int)rf_[i]));
if(rf_[i] > 3 || rd[i] != rf_[i]) {
// Add edit
Edit e((int)i,
mask2dna[1 << (int)rf_[i]],
"ACGTN"[(int)rd[i]],
EDIT_TYPE_MM);
ned.push_back(e);
if(rf_[i] > 3) {
refns++;
}
}
}
res.alres.setScore(AlnScore(score,
(int)(rd.length() - ned.size()),
(int)ned.size(), ns, 0));
assert(Edit::repOk(ned, rd));
bool fw = coord.fw();
assert_leq(rowf, len-1);
size_t trimEnd = (len-1) - rowf;
res.alres.setShape(
coord.ref(), // ref id
coord.off()+rowi, // 0-based ref offset
reflen, // length of reference sequence aligned to
fw, // aligned to Watson?
len, // read length
true, // pretrim soft?
0, // pretrim 5' end
0, // pretrim 3' end
true, // alignment trim soft?
fw ? rowi : trimEnd, // alignment trim 5' end
fw ? trimEnd : rowi); // alignment trim 3' end
res.alres.setRefNs(refns);
assert(res.repOk());
#ifndef NDEBUG
BTDnaString editstr;
Edit::toRef(rd, ned, editstr, true, rowi, trimEnd);
if(refstr != editstr) {
cerr << "Decoded nucleotides and edits don't match reference:" << endl;
cerr << " score: " << res.alres.score().score() << endl;
cerr << " edits: ";
Edit::print(cerr, ned);
cerr << endl;
cerr << " decoded nucs: " << rd << endl;
cerr << " edited nucs: " << editstr << endl;
cerr << " reference nucs: " << refstr << endl;
assert(0);
}
#endif
if(!fw) {
// All edits are currently w/r/t upstream end; if read aligned to Crick
// strand, invert them to be w/r/t 5' end instead.
res.alres.invertEdits();
}
return 1;
}
/**
* Align read 'rd' to reference using read & reference information given
* last time init() was called.
*/
bool SwAligner::align(
TAlScore& best) // best alignment score observed in DP matrix
{
assert(initedRef() && initedRead());
assert_eq(STATE_INITED, state_);
state_ = STATE_ALIGNED;
// Reset solutions lists
btncand_.clear();
btncanddone_.clear();
btncanddoneSucc_ = btncanddoneFail_ = 0;
best = std::numeric_limits<TAlScore>::min();
sse8succ_ = sse16succ_ = false;
int flag = 0;
size_t rdlen = rdf_ - rdi_;
bool checkpointed = rdlen >= cperMinlen_;
bool gathered = false; // Did gathering happen along with alignment?
if(sc_->monotone) {
// End-to-end
if(enable8_ && !readSse16_ && minsc_ >= -254) {
// 8-bit end-to-end
if(checkpointed) {
best = alignGatherEE8(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesEnd2EndSseU8(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherEE8(flagtmp, true); // debug
assert_eq(flagtmp, flag);
assert_eq(besttmp, best);
#endif
}
sse8succ_ = (flag == 0);
#ifndef NDEBUG
{
int flag2 = 0;
TAlScore best2 = alignNucleotidesEnd2EndSseI16(flag2, true);
{
int flagtmp = 0;
TAlScore besttmp = alignGatherEE16(flagtmp, true);
assert_eq(flagtmp, flag2);
assert(flag2 != 0 || best2 == besttmp);
}
assert(flag < 0 || best == best2);
sse16succ_ = (flag2 == 0);
}
#endif /*ndef NDEBUG*/
} else {
// 16-bit end-to-end
if(checkpointed) {
best = alignGatherEE16(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesEnd2EndSseI16(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherEE16(flagtmp, true);
assert_eq(flagtmp, flag);
assert_eq(besttmp, best);
#endif
}
sse16succ_ = (flag == 0);
}
} else {
// Local
flag = -2;
if(enable8_ && !readSse16_) {
// 8-bit local
if(checkpointed) {
best = alignGatherLoc8(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesLocalSseU8(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc8(flagtmp, true);
assert_eq(flag, flagtmp);
assert_eq(best, besttmp);
#endif
}
}
if(flag == -2) {
// 16-bit local
flag = 0;
if(checkpointed) {
best = alignNucleotidesLocalSseI16(flag, false);
best = alignGatherLoc16(flag, false);
if(flag == 0) {
gathered = true;
}
} else {
best = alignNucleotidesLocalSseI16(flag, false);
#ifndef NDEBUG
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc16(flagtmp, true);
assert_eq(flag, flagtmp);
assert_eq(best, besttmp);
#endif
}
sse16succ_ = (flag == 0);
} else {
sse8succ_ = (flag == 0);
#ifndef NDEBUG
int flag2 = 0;
TAlScore best2 = alignNucleotidesLocalSseI16(flag2, true);
{
int flagtmp = 0;
TAlScore besttmp = alignGatherLoc16(flagtmp, true);
assert_eq(flag2, flagtmp);
assert(flag2 != 0 || best2 == besttmp);
}
assert(flag2 < 0 || best == best2);
sse16succ_ = (flag2 == 0);
#endif /*ndef NDEBUG*/
}
}
#ifndef NDEBUG
if(!checkpointed && (rand() & 15) == 0 && sse8succ_ && sse16succ_) {
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
assert_eq(d8.mat_.nrow(), d16.mat_.nrow());
assert_eq(d8.mat_.ncol(), d16.mat_.ncol());
for(size_t i = 0; i < d8.mat_.nrow(); i++) {
for(size_t j = 0; j < colstop_; j++) {
int h8 = d8.mat_.helt(i, j);
int h16 = d16.mat_.helt(i, j);
int e8 = d8.mat_.eelt(i, j);
int e16 = d16.mat_.eelt(i, j);
int f8 = d8.mat_.felt(i, j);
int f16 = d16.mat_.felt(i, j);
TAlScore h8s =
(sc_->monotone ? (h8 - 0xff ) : h8);
TAlScore h16s =
(sc_->monotone ? (h16 - 0x7fff) : (h16 + 0x8000));
TAlScore e8s =
(sc_->monotone ? (e8 - 0xff ) : e8);
TAlScore e16s =
(sc_->monotone ? (e16 - 0x7fff) : (e16 + 0x8000));
TAlScore f8s =
(sc_->monotone ? (f8 - 0xff ) : f8);
TAlScore f16s =
(sc_->monotone ? (f16 - 0x7fff) : (f16 + 0x8000));
if(h8s < minsc_) {
h8s = minsc_ - 1;
}
if(h16s < minsc_) {
h16s = minsc_ - 1;
}
if(e8s < minsc_) {
e8s = minsc_ - 1;
}
if(e16s < minsc_) {
e16s = minsc_ - 1;
}
if(f8s < minsc_) {
f8s = minsc_ - 1;
}
if(f16s < minsc_) {
f16s = minsc_ - 1;
}
if((h8 != 0 || (int16_t)h16 != (int16_t)0x8000) && h8 > 0) {
assert_eq(h8s, h16s);
}
if((e8 != 0 || (int16_t)e16 != (int16_t)0x8000) && e8 > 0) {
assert_eq(e8s, e16s);
}
if((f8 != 0 || (int16_t)f16 != (int16_t)0x8000) && f8 > 0) {
assert_eq(f8s, f16s);
}
}
}
}
#endif
assert(repOk());
cural_ = 0;
if(best == MIN_I64 || best < minsc_) {
if(dpLog_ != NULL) {
(*dpLog_) << ",0,0";
}
return false;
}
if(!gathered) {
// Look for solutions using SSE matrix
assert(sse8succ_ || sse16succ_);
if(sc_->monotone) {
if(sse8succ_) {
gatherCellsNucleotidesEnd2EndSseU8(best);
#ifndef NDEBUG
if(sse16succ_) {
cand_tmp_ = btncand_;
gatherCellsNucleotidesEnd2EndSseI16(best);
cand_tmp_.sort();
btncand_.sort();
assert(cand_tmp_ == btncand_);
}
#endif /*ndef NDEBUG*/
} else {
gatherCellsNucleotidesEnd2EndSseI16(best);
}
} else {
if(sse8succ_) {
gatherCellsNucleotidesLocalSseU8(best);
#ifndef NDEBUG
if(sse16succ_) {
cand_tmp_ = btncand_;
gatherCellsNucleotidesLocalSseI16(best);
cand_tmp_.sort();
btncand_.sort();
assert(cand_tmp_ == btncand_);
}
#endif /*ndef NDEBUG*/
} else {
gatherCellsNucleotidesLocalSseI16(best);
}
}
}
if(!btncand_.empty()) {
btncand_.sort();
}
if(dpLog_ != NULL) {
(*dpLog_) << ",1," << best;
}
return !btncand_.empty();
}
/**
* Populate the given SwResult with information about the "next best"
* alignment if there is one. If there isn't one, false is returned. Note
* that false might be returned even though a call to done() would have
* returned false.
*/
bool SwAligner::nextAlignment(
SwResult& res,
TAlScore minsc,
RandomSource& rnd)
{
assert(initedRead() && initedRef());
assert_eq(STATE_ALIGNED, state_);
assert(repOk());
if(done()) {
res.reset();
return false;
}
assert(!done());
size_t off = 0, nbts = 0;
assert_lt(cural_, btncand_.size());
assert(res.repOk());
// For each candidate cell that we should try to backtrack from...
const size_t candsz = btncand_.size();
size_t SQ = dpRows() >> 4;
if(SQ == 0) SQ = 1;
size_t rdlen = rdf_ - rdi_;
bool checkpointed = rdlen >= cperMinlen_;
while(cural_ < candsz) {
// Doing 'continue' anywhere in here simply causes us to move on to the
// next candidate
if(btncand_[cural_].score < minsc) {
btncand_[cural_].fate = BT_CAND_FATE_FILT_SCORE;
nbtfiltsc_++; cural_++; continue;
}
nbts = 0;
assert(sse8succ_ || sse16succ_);
size_t row = btncand_[cural_].row;
size_t col = btncand_[cural_].col;
assert_lt(row, dpRows());
assert_lt((TRefOff)col, rff_-rfi_);
if(sse16succ_) {
SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
// Skipping this candidate because a previous candidate already
// moved through this cell
btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
//cerr << " skipped becuase starting cell was covered" << endl;
nbtfiltst_++; cural_++; continue;
}
} else if(sse8succ_) {
SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
// Skipping this candidate because a previous candidate already
// moved through this cell
btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
//cerr << " skipped becuase starting cell was covered" << endl;
nbtfiltst_++; cural_++; continue;
}
}
if(sc_->monotone) {
bool ret = false;
if(sse8succ_) {
uint32_t reseed = rnd.nextU32() + 1;
rnd.init(reseed);
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesEnd2EndSseU8(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
if(sse16succ_ && !checkpointed) {
SwResult res2;
size_t off2, nbts2 = 0;
rnd.init(reseed);
bool ret2 = backtraceNucleotidesEnd2EndSseI16(
btncand_[cural_].score, // in: expected score
res2, // out: store results (edits and scores) here
off2, // out: store diagonal projection of origin
nbts2, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
assert_eq(ret, ret2);
assert_eq(nbts, nbts2);
assert(!ret || res2.alres.score() == res.alres.score());
#if 0
if(!checkpointed && (rand() & 15) == 0) {
// Check that same cells are reported through
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
for(size_t i = d8.mat_.nrow(); i > 0; i--) {
for(size_t j = 0; j < d8.mat_.ncol(); j++) {
assert_eq(d8.mat_.reportedThrough(i-1, j),
d16.mat_.reportedThrough(i-1, j));
}
}
}
#endif
}
#endif
rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
} else if(sse16succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesEnd2EndSseI16(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
#endif
rnd.init(reseed); // debug/release pseudo-randoms in lock step
}
if(ret) {
btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
break;
} else {
btncand_[cural_].fate = BT_CAND_FATE_FAILED;
}
} else {
// Local alignment
// Check if this solution is "dominated" by a prior one.
// Domination is a heuristic designed to eliminate the vast
// majority of valid-but-redundant candidates lying in the
// "penumbra" of a high-scoring alignment.
bool dom = false;
{
size_t donesz = btncanddone_.size();
const size_t col = btncand_[cural_].col;
const size_t row = btncand_[cural_].row;
for(size_t i = 0; i < donesz; i++) {
assert_gt(btncanddone_[i].fate, 0);
size_t colhi = col, rowhi = row;
size_t rowlo = btncanddone_[i].row;
size_t collo = btncanddone_[i].col;
if(colhi < collo) swap(colhi, collo);
if(rowhi < rowlo) swap(rowhi, rowlo);
if(colhi - collo <= SQ && rowhi - rowlo <= SQ) {
// Skipping this candidate because it's "dominated" by
// a previous candidate
dom = true;
break;
}
}
}
if(dom) {
btncand_[cural_].fate = BT_CAND_FATE_FILT_DOMINATED;
nbtfiltdo_++;
cural_++;
continue;
}
bool ret = false;
if(sse8succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
rnd.init(reseed);
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesLocalSseU8(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
// TODO: I find that sometimes there is disagreement here
// where the alignments are in the same place with
// identical scores, but one is more soft-trimmed than the other
//assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
if(!checkpointed && sse16succ_) {
SwResult res2;
size_t off2, nbts2 = 0;
rnd.init(reseed); // same b/t backtrace calls
bool ret2 = backtraceNucleotidesLocalSseI16(
btncand_[cural_].score, // in: expected score
res2, // out: store results (edits and scores) here
off2, // out: store diagonal projection of origin
nbts2, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
assert_eq(ret, ret2);
assert_eq(nbts, nbts2);
assert(!ret || res2.alres.score() == res.alres.score());
#if 0
if(!checkpointed && (rand() & 15) == 0) {
// Check that same cells are reported through
SSEData& d8 = fw_ ? sseU8fw_ : sseU8rc_;
SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
for(size_t i = d8.mat_.nrow(); i > 0; i--) {
for(size_t j = 0; j < d8.mat_.ncol(); j++) {
assert_eq(d8.mat_.reportedThrough(i-1, j),
d16.mat_.reportedThrough(i-1, j));
}
}
}
#endif
}
#endif
rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
} else if(sse16succ_) {
uint32_t reseed = rnd.nextU32() + 1;
res.reset();
if(checkpointed) {
size_t maxiter = MAX_SIZE_T;
size_t niter = 0;
ret = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter, // max # extensions to try
niter, // # extensions tried
rnd); // random gen, to choose among equal paths
} else {
ret = backtraceNucleotidesLocalSseI16(
btncand_[cural_].score, // in: expected score
res, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
nbts, // out: # backtracks
row, // start in this rectangle row
col, // start in this rectangle column
rnd); // random gen, to choose among equal paths
}
#ifndef NDEBUG
// if(...) statement here should check not whether the primary
// alignment was checkpointed, but whether a checkpointed
// alignment was done at all.
if(!checkpointed) {
SwResult res2;
size_t maxiter2 = MAX_SIZE_T;
size_t niter2 = 0;
bool ret2 = backtrace(
btncand_[cural_].score, // in: expected score
true, // in: use mini-fill?
true, // in: use checkpoints?
res2, // out: store results (edits and scores) here
off, // out: store diagonal projection of origin
row, // start in this rectangle row
col, // start in this rectangle column
maxiter2, // max # extensions to try
niter2, // # extensions tried
rnd); // random gen, to choose among equal paths
// After the first alignment, there's no guarantee we'll
// get the same answer from both backtrackers because of
// differences in how they handle marking cells as
// reported-through.
assert(cural_ > 0 || !ret || ret == ret2);
assert(cural_ > 0 || !ret || res.alres == res2.alres);
}
#endif
rnd.init(reseed); // same b/t backtrace calls
}
if(ret) {
btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
btncanddone_.push_back(btncand_[cural_]);
btncanddoneSucc_++;
assert(res.repOk());
break;
} else {
btncand_[cural_].fate = BT_CAND_FATE_FAILED;
btncanddone_.push_back(btncand_[cural_]);
btncanddoneFail_++;
}
}
cural_++;
} // while(cural_ < btncand_.size())
if(cural_ == btncand_.size()) {
assert(res.repOk());
return false;
}
assert(!res.alres.empty());
assert(res.repOk());
if(!fw_) {
// All edits are currently w/r/t upstream end; if read aligned
// to Crick strand, we need to invert them so that they're
// w/r/t the read's 5' end instead.
res.alres.invertEdits();
}
cural_++;
assert(res.repOk());
return true;
}
#ifdef MAIN_ALIGNER_SW
#include <sstream>
#include <utility>
#include <getopt.h>
#include "scoring.h"
#include "aligner_seed_policy.h"
int gGapBarrier;
int gSnpPhred;
static int bonusMatchType; // how to reward matches
static int bonusMatch; // constant if match bonus is a constant
static int penMmcType; // how to penalize mismatches
static int penMmc; // constant if mm pelanty is a constant
static int penNType; // how to penalize Ns in the read
static int penN; // constant if N pelanty is a constant
static bool nPairCat; // true -> concatenate mates before N filter
static int penRdExConst; // constant coeff for cost of gap in read
static int penRfExConst; // constant coeff for cost of gap in ref
static int penRdExLinear; // linear coeff for cost of gap in read
static int penRfExLinear; // linear coeff for cost of gap in ref
static float costMinConst; // constant coeff for min score w/r/t read len
static float costMinLinear; // linear coeff for min score w/r/t read len
static float costFloorConst; // constant coeff for score floor w/r/t read len
static float costFloorLinear;// linear coeff for score floor w/r/t read len
static float nCeilConst; // constant coeff for N ceiling w/r/t read len
static float nCeilLinear; // linear coeff for N ceiling w/r/t read len
static bool nCatPair; // concat mates before applying N filter?
static int multiseedMms; // mismatches permitted in a multiseed seed
static int multiseedLen; // length of multiseed seeds
static int multiseedIvalType;
static float multiseedIvalA;
static float multiseedIvalB;
static float posmin;
static float posfrac;
static float rowmult;
enum {
ARG_TESTS = 256
};
static const char *short_opts = "s:m:r:d:i:";
static struct option long_opts[] = {
{(char*)"snppen", required_argument, 0, 's'},
{(char*)"misspen", required_argument, 0, 'm'},
{(char*)"seed", required_argument, 0, 'r'},
{(char*)"align-policy", no_argument, 0, 'A'},
{(char*)"test", no_argument, 0, ARG_TESTS},
};
static void printUsage(ostream& os) {
os << "Usage: aligner_sw <read-seq> <ref-nuc-seq> [options]*" << endl;
os << "Options:" << endl;
os << " -s/--snppen <int> penalty incurred by SNP; used for decoding"
<< endl;
os << " -m/--misspen <int> quality to use for read chars" << endl;
os << " -r/-seed <int> seed for pseudo-random generator" << endl;
}
/**
* Parse a T from a string 's'
*/
template<typename T>
T parse(const char *s) {
T tmp;
stringstream ss(s);
ss >> tmp;
return tmp;
}
static EList<bool> stbuf, enbuf;
static BTDnaString btread;
static BTString btqual;
static BTString btref;
static BTString btref2;
static BTDnaString readrc;
static BTString qualrc;
/**
* Helper function for running a case consisting of a read (sequence
* and quality), a reference string, and an offset that anchors the 0th
* character of the read to a reference position.
*/
static void doTestCase(
SwAligner& al,
const BTDnaString& read,
const BTString& qual,
const BTString& refin,
TRefOff off,
EList<bool> *en,
const Scoring& sc,
TAlScore minsc,
SwResult& res,
bool nsInclusive,
bool filterns,
uint32_t seed)
{
RandomSource rnd(seed);
btref2 = refin;
assert_eq(read.length(), qual.length());
size_t nrow = read.length();
TRefOff rfi, rff;
// Calculate the largest possible number of read and reference gaps given
// 'minsc' and 'pens'
size_t maxgaps;
size_t padi, padf;
{
int readGaps = sc.maxReadGaps(minsc, read.length());
int refGaps = sc.maxRefGaps(minsc, read.length());
assert_geq(readGaps, 0);
assert_geq(refGaps, 0);
int maxGaps = max(readGaps, refGaps);
padi = 2 * maxGaps;
padf = maxGaps;
maxgaps = (size_t)maxGaps;
}
size_t nceil = (size_t)sc.nCeil.f((double)read.length());
size_t width = 1 + padi + padf;
rfi = off;
off = 0;
// Pad the beginning of the reference with Ns if necessary
if(rfi < padi) {
size_t beginpad = (size_t)(padi - rfi);
for(size_t i = 0; i < beginpad; i++) {
btref2.insert('N', 0);
off--;
}
rfi = 0;
} else {
rfi -= padi;
}
assert_geq(rfi, 0);
// Pad the end of the reference with Ns if necessary
while(rfi + nrow + padi + padf > btref2.length()) {
btref2.append('N');
}
rff = rfi + nrow + padi + padf;
// Convert reference string to masks
for(size_t i = 0; i < btref2.length(); i++) {
if(toupper(btref2[i]) == 'N' && !nsInclusive) {
btref2.set(16, i);
} else {
int num = 0;
int alts[] = {4, 4, 4, 4};
decodeNuc(toupper(btref2[i]), num, alts);
assert_leq(num, 4);
assert_gt(num, 0);
btref2.set(0, i);
for(int j = 0; j < num; j++) {
btref2.set(btref2[i] | (1 << alts[j]), i);
}
}
}
bool fw = true;
uint32_t refidx = 0;
size_t solwidth = width;
if(maxgaps >= solwidth) {
solwidth = 0;
} else {
solwidth -= maxgaps;
}
if(en == NULL) {
enbuf.resize(solwidth);
enbuf.fill(true);
en = &enbuf;
}
assert_geq(rfi, 0);
assert_gt(rff, rfi);
readrc = read;
qualrc = qual;
al.initRead(
read, // read sequence
readrc,
qual, // read qualities
qualrc,
0, // offset of first character within 'read' to consider
read.length(), // offset of last char (exclusive) in 'read' to consider
floorsc); // local-alignment score floor
al.initRef(
fw, // 'read' is forward version of read?
refidx, // id of reference aligned to
off, // offset of upstream ref char aligned against
btref2.wbuf(), // reference sequence (masks)
rfi, // offset of first char in 'ref' to consider
rff, // offset of last char (exclusive) in 'ref' to consider
width, // # bands to do (width of parallelogram)
solwidth, // # rightmost cols where solns can end
sc, // scoring scheme
minsc, // minimum score for valid alignment
maxgaps, // max of max # read gaps, ref gaps
0, // amount to truncate on left-hand side
en); // mask indicating which columns we can end in
if(filterns) {
al.filter((int)nceil);
}
al.align(rnd);
}
/**
* Another interface for running a case.
*/
static void doTestCase2(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
const Scoring& sc,
float costMinConst,
float costMinLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
doTestCase(
al,
btread,
btqual,
btref,
off,
NULL,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Another interface for running a case.
*/
static void doTestCase3(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
Scoring& sc,
float costMinConst,
float costMinLinear,
float nCeilConst,
float nCeilLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
// Calculate the penalty ceiling for the read
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
sc.nCeil.setConst(costMinConst);
sc.nCeil.setCoeff(costMinLinear);
doTestCase(
al,
btread,
btqual,
btref,
off,
NULL,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Another interface for running a case. Like doTestCase3 but caller specifies
* st_ and en_ lists.
*/
static void doTestCase4(
SwAligner& al,
const char *read,
const char *qual,
const char *refin,
TRefOff off,
EList<bool>& en,
Scoring& sc,
float costMinConst,
float costMinLinear,
float nCeilConst,
float nCeilLinear,
SwResult& res,
bool nsInclusive = false,
bool filterns = false,
uint32_t seed = 0)
{
btread.install(read, true);
// Calculate the penalty ceiling for the read
TAlScore minsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costMinConst,
costMinLinear));
TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
btread.length(),
costFloorConst,
costFloorLinear));
btqual.install(qual);
btref.install(refin);
sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
sc.nCeil.setConst(costMinConst);
sc.nCeil.setCoeff(costMinLinear);
doTestCase(
al,
btread,
btqual,
btref,
off,
&en,
sc,
minsc,
floorsc,
res,
nsInclusive,
filterns,
seed
);
}
/**
* Do a set of unit tests.
*/
static void doTests() {
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
nPairCat = DEFAULT_N_CAT_PAIR;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST;
costMinLinear = DEFAULT_MIN_LINEAR;
costFloorConst = DEFAULT_FLOOR_CONST;
costFloorLinear = DEFAULT_FLOOR_LINEAR;
nCeilConst = 1.0f; // constant factor in N ceil w/r/t read len
nCeilLinear = 0.1f; // coeff of linear term in N ceil w/r/t read len
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
// Set up penalities
Scoring sc(
bonusMatch,
penMmcType, // how to penalize mismatches
30, // constant if mm pelanty is a constant
30, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
nCeilConst, // constant factor in N ceiling w/r/t read length
nCeilLinear, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
// Set up alternative penalities
Scoring sc2(
bonusMatch,
COST_MODEL_QUAL, // how to penalize mismatches
30, // constant if mm pelanty is a constant
30, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
1.0f, // constant factor in N ceiling w/r/t read length
1.0f, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
SwResult res;
//
// Basic nucleotide-space tests
//
cerr << "Running tests..." << endl;
int tests = 1;
bool nIncl = false;
bool nfilter = false;
SwAligner al;
RandomSource rnd(73);
for(int i = 0; i < 3; i++) {
cerr << " Test " << tests++ << " (nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
sc.rdGapLinear = 15;
sc.rfGapLinear = 15;
// A C G T A C G T
// H E F H E F H E F H E F H E F H E F H E F H E F
// A 0 lo lo -30 lo lo -30 lo lo -30 lo lo 0 lo lo -30 lo lo-30 lo lo-30 lo lo
// C -30 lo -55 0 -85 -85 -55 -55 -85
// G -30 lo -70 -55 -85 -55 0 -100-100
// T -30 lo -85 -60 -85 -70 -55-100 -55
// A 0 lo -85 -55 -55 -85 -70 -70 -70
// C -30 lo -55 0 -85-100 -55 -55 -85
// G -30 lo -70 -55 -85 -55 0 -100-100
// T -30 lo -85 -60 -85 -70 -55-100 -55
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc)...";
sc.setMmPen(COST_MODEL_CONSTANT, 30);
//sc.setMatchBonus(10);
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -30);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual 1)...";
doTestCase2(
al,
"ACGTTCGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
size_t lo, hi;
if(i == 0) {
lo = 0; hi = 1;
} else if(i == 1) {
lo = 1; hi = 2;
} else {
lo = 2; hi = 3;
}
for(size_t j = lo; j < hi; j++) {
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(j*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -36);
assert_eq(res.alres.score().ns(), 0);
res.reset();
}
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual 2)...";
doTestCase2(
al,
"ACGAACGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -35);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc, check qual )...";
assert(res.empty());
doTestCase2(
al,
"TCGTACGT", // read
"ABCDEFGH", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc2, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -32);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm at the beginning, allowed by minsc)...";
doTestCase2(
al,
"CCGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -30);
assert_eq(res.alres.score().ns(), 0);
assert_eq(1, res.alres.ned().size());
assert_eq(0, res.alres.aed().size());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 n in read, allowed)...";
doTestCase3(
al,
"ACGTNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // allow 1 N
0.0f, // allow 1 N
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -1);
assert_eq(res.alres.score().ns(), 1);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 n in read, allowed)...";
doTestCase3(
al,
"ACGNNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
2.0f, // const coeff for N ceiling
0.0f, // linear coeff for N ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -2);
assert_eq(res.alres.score().ns(), 2);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 n in read, 1 at beginning, allowed)...";
doTestCase2(
al,
"NCGTNCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -2);
assert_eq(res.alres.score().ns(), 2);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 n in ref, allowed)...";
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTNCGTACGTANGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), -1);
assert_eq(res.alres.score().ns(), 1);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm disallowed by minsc)...";
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-10.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
// Read gap with equal read and ref gap penalties
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap allowed by minsc)...";
assert(res.empty());
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap disallowed by minsc)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", read gap disallowed by gap barrier)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 15;
sc.rdGapLinear = 15;
sc.gapbar = 4;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc, gapbar=3)...";
sc.gapbar = 3;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
// Ref gap with equal read and ref gap penalties
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap allowed by minsc, gapbar=4)...";
sc.gapbar = 4;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", ref gap disallowed by gap barrier)...";
sc.gapbar = 5;
doTestCase2(
al,
"ACGTAACGT", // read
"IIIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
sc.gapbar = 1;
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with one read gap and zero ref gaps allowed
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 read gap, ref gaps disallowed by minsc)...";
sc.rfGapConst = 35;
sc.rdGapConst = 25;
sc.rfGapLinear = 20;
sc.rdGapLinear = 10;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -35);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
doTestCase2(
al,
"ACGTCGT", // read
"IIIIIII", // qual
"ACGTACGTACGTACGT", // ref
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(res.empty());
cerr << "PASSED" << endl;
// Ref gap with one ref gap and zero read gaps allowed
sc.rfGapConst = 25;
sc.rdGapConst = 35;
sc.rfGapLinear = 12;
sc.rdGapLinear = 22;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, read gaps disallowed by minsc)...";
assert(res.empty());
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -37);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with one read gap and two ref gaps allowed
sc.rfGapConst = 20;
sc.rdGapConst = 25;
sc.rfGapLinear = 10;
sc.rdGapLinear = 15;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 read gap, 2 ref gaps allowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
// Ref gap with one ref gap and two read gaps allowed
sc.rfGapConst = 25;
sc.rdGapConst = 11; // if this were 10, we'd have ties
sc.rfGapLinear = 15;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 2 read gaps allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4) << ", gaps disallowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(res.empty());
res.reset();
assert(al.done());
cerr << "PASSED" << endl;
// Read gap with two read gaps and two ref gaps allowed
sc.rfGapConst = 15;
sc.rdGapConst = 15;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 ref gaps, 2 read gaps allowed by minsc)...";
doTestCase3(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-40.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0, // const coeff for N ceiling
0.0, // linear coeff for N ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
true); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
if(!res.empty()) {
//al.printResultStacked(res, cerr); cerr << endl;
}
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -25);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
// The following alignment is possible when i == 2:
// ACGTACGTACGTACGTN
// A x
// C x
// G x
// T x
// C x
// G x
// T x
assert(i == 2 || res.empty());
res.reset();
cerr << "PASSED" << endl;
sc.rfGapConst = 10;
sc.rdGapConst = 10;
sc.rfGapLinear = 10;
sc.rdGapLinear = 10;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 1 read gap allowed by minsc)...";
doTestCase2(
al,
"ACGTCGT",
"IIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -20);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
// Ref gap with two ref gaps and zero read gaps allowed
sc.rfGapConst = 15;
sc.rdGapConst = 15;
sc.rfGapLinear = 5;
sc.rdGapLinear = 5;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 2 ref gaps, 2 read gaps allowed by minsc)...";
// Careful: it might be possible for the read to align with overhang
// instead of with a gap
doTestCase3(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-35.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // needed to avoid overhang alignments
0.0f, // needed to avoid overhang alignments
res, // result
nIncl, // Ns inclusive (not mismatches)
true); // filter Ns
if(i == 0) {
lo = 0; hi = 1;
} else if(i == 1) {
lo = 1; hi = 2;
} else {
lo = 2; hi = 3;
}
for(size_t j = lo; j < hi; j++) {
al.nextAlignment(res, rnd);
assert(!res.empty());
//al.printResultStacked(res, cerr); cerr << endl;
assert(res.alres.refoff() == 0 ||
res.alres.refoff() == 4 ||
res.alres.refoff() == 8);
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -20);
assert_eq(res.alres.score().ns(), 0);
res.reset();
}
al.nextAlignment(res, rnd);
//assert(res.empty());
//res.reset();
cerr << "PASSED" << endl;
sc.rfGapConst = 25;
sc.rdGapConst = 25;
sc.rfGapLinear = 4;
sc.rdGapLinear = 4;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1 ref gap, 1 read gap allowed by minsc)...";
doTestCase2(
al,
"ACGTAACGT",
"IIIIIIIII",
"ACGTACGTACGTACGT",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 1);
assert_eq(res.alres.score().score(), -29);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", short read)...";
doTestCase2(
al,
"A",
"I",
"AAAAAAAAAAAA",
i*4, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++
<< " (nuc space, offset 0, short read & ref)...";
doTestCase2(
al,
"A",
"I",
"A",
0, // off
sc, // scoring scheme
-30.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
}
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", short read, many allowed gaps)...";
doTestCase2(
al,
"A",
"I",
"AAAAAAAAAAAA",
i*4, // off
sc, // scoring scheme
-150.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++
<< " (nuc space, offset 0, short read & ref, "
<< "many allowed gaps)...";
doTestCase2(
al,
"A",
"I",
"A",
0, // off
sc, // scoring scheme
-150.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 0);
assert_eq(res.alres.score().ns(), 0);
res.reset();
cerr << "PASSED" << endl;
}
}
// A test case where a valid alignment with a worse score should be
// accepted over a valid alignment with a better score but too many
// Ns
cerr << " Test " << tests++ << " (N ceiling 1)...";
sc.mmcostType = COST_MODEL_CONSTANT;
sc.mmcost = 10;
sc.snp = 30;
sc.nCeilConst = 0.0f;
sc.nCeilLinear = 0.0f;
sc.rfGapConst = 10;
sc.rdGapLinear = 10;
sc.rfGapConst = 10;
sc.rfGapLinear = 10;
sc.setNPen(COST_MODEL_CONSTANT, 2);
sc.gapbar = 1;
// No Ns allowed, so this hit should be filtered
doTestCase2(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter
0);
al.nextAlignment(res, rnd);
assert(res.empty());
cerr << "PASSED" << endl;
res.reset();
// 1 N allowed, so this hit should stand
cerr << " Test " << tests++ << " (N ceiling 2)...";
doTestCase3(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // constant coefficient for # Ns allowed
0.0f, // linear coefficient for # Ns allowed
res, // result
false, // ns are in inclusive
false, // nfilter - NOTE: FILTER OFF
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(0, res.alres.score().gaps());
assert_eq(-2, res.alres.score().score());
assert_eq(1, res.alres.score().ns());
cerr << "PASSED" << endl;
res.reset();
// 1 N allowed, but we set st_ such that this hit should not stand
for(size_t i = 0; i < 2; i++) {
cerr << " Test " << tests++ << " (N ceiling 2 with st_ override)...";
EList<bool> en;
en.resize(3); en.fill(true);
if(i == 1) {
en[1] = false;
}
sc.rfGapConst = 10;
sc.rdGapLinear = 10;
sc.rfGapConst = 10;
sc.rfGapLinear = 10;
doTestCase4(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
en, // rectangle columns where solution can end
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
1.0f, // constant coefficient for # Ns allowed
0.0f, // linear coefficient for # Ns allowed
res, // result
false, // ns are in inclusive
false, // nfilter - NOTE: FILTER OFF
0);
al.nextAlignment(res, rnd);
if(i > 0) {
assert(res.empty());
} else {
assert(!res.empty());
}
cerr << "PASSED" << endl;
res.reset();
}
// No Ns allowed, so this hit should be filtered
cerr << " Test " << tests++ << " (N ceiling 3)...";
sc.nCeilConst = 1.0f;
sc.nCeilLinear = 0.0f;
doTestCase2(
al,
"ACGTACGT", // read seq
"IIIIIIII", // read quals
"NCGTACGT", // ref seq
0, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter - NOTE: FILTER ON
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(0, res.alres.score().gaps());
assert_eq(-2, res.alres.score().score());
assert_eq(1, res.alres.score().ns());
cerr << "PASSED" << endl;
res.reset();
// No Ns allowed, so this hit should be filtered
cerr << " Test " << tests++ << " (redundant alignment elimination 1)...";
sc.nCeilConst = 1.0f;
sc.nCeilLinear = 0.0f;
sc.rfGapConst = 25;
sc.rdGapLinear = 15;
sc.rfGapConst = 25;
sc.rfGapLinear = 15;
doTestCase2(
al,
// 1 2 3 4
// 01234567890123456789012345678901234567890123456
"AGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCG",
"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII",
"TTTTTTTTAGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCGTTTTTTT",
// 01234567890123456789012345678901234567890123456789012345678901
// 1 2 3 4 5 6
8, // offset
sc, // scoring scheme
-25.0f, // const coeff for cost ceiling
-5.0f, // linear coeff for cost ceiling
res, // result
false, // ns are in inclusive
true, // nfilter - NOTE: FILTER ON
0);
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(8, res.alres.refoff());
assert_eq(47, res.alres.refExtent());
assert_eq(0, res.alres.score().gaps());
assert_eq(0, res.alres.score().score());
assert_eq(0, res.alres.score().ns());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
cerr << "PASSED" << endl;
res.reset();
}
/**
* Do a set of unit tests for local alignment.
*/
static void doLocalTests() {
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS_LOCAL;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
nPairCat = DEFAULT_N_CAT_PAIR;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST_LOCAL;
costMinLinear = DEFAULT_MIN_LINEAR_LOCAL;
costFloorConst = DEFAULT_FLOOR_CONST_LOCAL;
costFloorLinear = DEFAULT_FLOOR_LINEAR_LOCAL;
nCeilConst = 1.0f; // constant factor in N ceil w/r/t read len
nCeilLinear = 0.1f; // coeff of linear term in N ceil w/r/t read len
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
// Set up penalities
Scoring sc(
10,
penMmcType, // how to penalize mismatches
30, // constant if mm pelanty is a constant
penSnp, // penalty for decoded SNP
costMinConst, // constant factor in N ceiling w/r/t read length
costMinLinear, // coeff of linear term in N ceiling w/r/t read length
costFloorConst, // constant factor in N ceiling w/r/t read length
costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
nCeilConst, // constant factor in N ceiling w/r/t read length
nCeilLinear, // coeff of linear term in N ceiling w/r/t read length
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nPairCat, // true -> concatenate mates before N filtering
25, // constant coeff for cost of gap in read
25, // constant coeff for cost of gap in ref
15, // linear coeff for cost of gap in read
15, // linear coeff for cost of gap in ref
1, // # rows at top/bot can only be entered diagonally
-1, // min row idx to backtrace from; -1 = no limit
false // sort results first by row then by score?
);
SwResult res;
//
// Basic nucleotide-space tests
//
cerr << "Running local tests..." << endl;
int tests = 1;
bool nIncl = false;
bool nfilter = false;
SwAligner al;
RandomSource rnd(73);
for(int i = 0; i < 3; i++) {
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGT", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(4, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 40);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
// A C G T A C G T A C G T A C G T
// 0 C
// 1 C x
// 2 G x
// 3 T x
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"CCGT", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
7.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4+1, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 30);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGA", // read
"IIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
7.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 30);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
if(i == 0) {
cerr << " Test " << tests++ << " (short nuc space, offset "
<< (i*4) << ", 1mm, match bonus=20)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
sc.setMatchBonus(20);
doTestCase2(
al,
"TTGT", // read
"IIII", // qual
"TTGA", // ref in
i*4, // off
sc, // scoring scheme
25.0f, // const coeff for cost ceiling
0.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(3, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 60);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
sc.setMatchBonus(10);
cerr << "PASSED" << endl;
}
cerr << " Test " << tests++ << " (nuc space, offset "
<< (i*4) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGTACGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 80);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
assert(res.empty());
assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (long nuc space, offset "
<< (i*8) << ", exact)...";
sc.rdGapConst = 40;
sc.rfGapConst = 40;
doTestCase2(
al,
"ACGTACGTACGTACGTACGTA", // read
"IIIIIIIIIIIIIIIIIIIII", // qual
"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
i*8, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
8.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*8, res.alres.refoff());
assert_eq(21, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 210);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
//assert(res.empty());
//assert(al.done());
res.reset();
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (nuc space, offset " << (i*4)
<< ", 1mm allowed by minsc)...";
doTestCase2(
al,
"ACGTTCGT", // read
"IIIIIIII", // qual
"ACGTACGTACGTACGT", // ref in
i*4, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
5.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*4, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 40);
assert_eq(res.alres.score().ns(), 0);
res.reset();
al.nextAlignment(res, rnd);
//assert(res.empty());
//assert(al.done());
cerr << "PASSED" << endl;
cerr << " Test " << tests++ << " (long nuc space, offset "
<< (i*8) << ", 6mm allowed by minsc)...";
sc.rdGapConst = 50;
sc.rfGapConst = 50;
sc.rdGapLinear = 45;
sc.rfGapLinear = 45;
doTestCase2(
al,
"ACGTACGATGCATCGTACGTA", // read
"IIIIIIIIIIIIIIIIIIIII", // qual
"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
// ACGTACGTACGTACGTACGT
i*8, // off
sc, // scoring scheme
0.0f, // const coeff for cost ceiling
1.0f, // linear coeff for cost ceiling
res, // result
nIncl, // Ns inclusive (not mismatches)
nfilter); // filter Ns
assert(!al.done());
al.nextAlignment(res, rnd);
assert(!res.empty());
assert_eq(i*8 + 13, res.alres.refoff());
assert_eq(8, res.alres.refExtent());
assert_eq(res.alres.score().gaps(), 0);
assert_eq(res.alres.score().score(), 80);
assert_eq(res.alres.score().ns(), 0);
assert(res.alres.ned().empty());
assert(res.alres.aed().empty());
res.reset();
al.nextAlignment(res, rnd);
res.reset();
cerr << "PASSED" << endl;
}
}
int main(int argc, char **argv) {
int option_index = 0;
int next_option;
unsigned seed = 0;
gGapBarrier = 1;
gSnpPhred = 30;
bool nsInclusive = false;
bool nfilter = false;
bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
bonusMatch = DEFAULT_MATCH_BONUS;
penMmcType = DEFAULT_MM_PENALTY_TYPE;
penMmc = DEFAULT_MM_PENALTY;
penSnp = DEFAULT_SNP_PENALTY;
penNType = DEFAULT_N_PENALTY_TYPE;
penN = DEFAULT_N_PENALTY;
penRdExConst = DEFAULT_READ_GAP_CONST;
penRfExConst = DEFAULT_REF_GAP_CONST;
penRdExLinear = DEFAULT_READ_GAP_LINEAR;
penRfExLinear = DEFAULT_REF_GAP_LINEAR;
costMinConst = DEFAULT_MIN_CONST;
costMinLinear = DEFAULT_MIN_LINEAR;
costFloorConst = DEFAULT_FLOOR_CONST;
costFloorLinear = DEFAULT_FLOOR_LINEAR;
nCeilConst = 1.0f; // constant factor in N ceiling w/r/t read length
nCeilLinear = 1.0f; // coeff of linear term in N ceiling w/r/t read length
nCatPair = false;
multiseedMms = DEFAULT_SEEDMMS;
multiseedLen = DEFAULT_SEEDLEN;
multiseedIvalType = DEFAULT_IVAL;
multiseedIvalA = DEFAULT_IVAL_A;
multiseedIvalB = DEFAULT_IVAL_B;
mhits = 1;
do {
next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index);
switch (next_option) {
case 's': gSnpPhred = parse<int>(optarg); break;
case 'r': seed = parse<unsigned>(optarg); break;
case ARG_TESTS: {
doTests();
cout << "PASSED end-to-ends" << endl;
doLocalTests();
cout << "PASSED locals" << endl;
return 0;
}
case 'A': {
bool localAlign = false;
bool noisyHpolymer = false;
bool ignoreQuals = false;
SeedAlignmentPolicy::parseString(
optarg,
localAlign,
noisyHpolymer,
ignoreQuals,
bonusMatchType,
bonusMatch,
penMmcType,
penMmc,
penNType,
penN,
penRdExConst,
penRfExConst,
penRdExLinear,
penRfExLinear,
costMinConst,
costMinLinear,
costFloorConst,
costFloorLinear,
nCeilConst,
nCeilLinear,
nCatPair,
multiseedMms,
multiseedLen,
multiseedIvalType,
multiseedIvalA,
multiseedIvalB,
posmin);
break;
}
case -1: break;
default: {
cerr << "Unknown option: " << (char)next_option << endl;
printUsage(cerr);
exit(1);
}
}
} while(next_option != -1);
srand(seed);
if(argc - optind < 4) {
cerr << "Need at least 4 arguments" << endl;
printUsage(cerr);
exit(1);
}
BTDnaString read;
BTString ref, qual;
// Get read
read.installChars(argv[optind]);
// Get qualities
qual.install(argv[optind+1]);
assert_eq(read.length(), qual.length());
// Get reference
ref.install(argv[optind+2]);
// Get reference offset
size_t off = parse<size_t>(argv[optind+3]);
// Set up penalities
Scoring sc(
false, // local alignment?
false, // bad homopolymer?
bonusMatchType,
bonusMatch,
penMmcType, // how to penalize mismatches
penMmc, // constant if mm pelanty is a constant
costMinConst,
costMinLinear,
costFloorConst,
costFloorLinear,
nCeilConst, // N ceiling constant coefficient
nCeilLinear, // N ceiling linear coefficient
penNType, // how to penalize Ns in the read
penN, // constant if N pelanty is a constant
nCatPair, // true -> concatenate mates before N filtering
penRdExConst, // constant cost of extending a gap in the read
penRfExConst, // constant cost of extending a gap in the reference
penRdExLinear, // coeff of linear term for cost of gap extension in read
penRfExLinear // coeff of linear term for cost of gap extension in ref
);
// Calculate the penalty ceiling for the read
TAlScore minsc = Scoring::linearFunc(
read.length(),
costMinConst,
costMinLinear);
TAlScore floorsc = Scoring::linearFunc(
read.length(),
costFloorConst,
costFloorLinear);
SwResult res;
SwAligner al;
doTestCase(
al,
read,
qual,
ref,
off,
NULL,
sc,
minsc,
res,
nsInclusive,
nfilter,
seed);
}
#endif /*MAIN_ALIGNER_SW*/
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/Velcon-Zheng/bowtie2.git
git@gitee.com:Velcon-Zheng/bowtie2.git
Velcon-Zheng
bowtie2
bowtie2
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385