#ifndef EBWT_SEARCH_BACKTRACK_H_
#define EBWT_SEARCH_BACKTRACK_H_
#include <stdexcept>
#include <seqan/sequence.h>
#include "pat.h"
#include "qual.h"
#include "ebwt_search_util.h"
#include "range.h"
#include "range_source.h"
#include "aligner_metrics.h"
#include "search_globals.h"
/**
* Class that coordinates quality- and quantity-aware backtracking over
* some range of a read sequence.
*
* The creator can configure the GreedyDFSRangeSource to treat different
* stretches of the read differently.
*/
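// A minimal usage sketch (illustrative only; the offset values and the
// ebwt/params/refs/read objects below are hypothetical, not taken from
// any particular call site):
//
//   GreedyDFSRangeSource bt(&ebwt, params, refs, qualThresh, maxBts);
//   bt.setQuery(read);             // install the read to align
//   bt.setOffs(0, 0,               // no half-and-half boundaries
//              14, 21, 28, 28);    // unrev/1-rev/2-rev/3-rev offsets
//   bool done = bt.backtrack();    // true iff sink says read is done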
class GreedyDFSRangeSource {
typedef std::pair<int, int> TIntPair;
typedef seqan::String<seqan::Dna> DnaString;
public:
GreedyDFSRangeSource(
const Ebwt<DnaString>* ebwt,
const EbwtSearchParams<DnaString>& params,
const BitPairReference* refs,
uint32_t qualThresh, /// max acceptable q-distance
const int maxBts, /// maximum # backtracks allowed
uint32_t reportPartials = 0,
bool reportExacts = true,
bool reportRanges = false,
PartialAlignmentManager* partials = NULL,
String<QueryMutation>* muts = NULL,
bool verbose = true,
vector<String<Dna5> >* os = NULL,
bool considerQuals = true, // whether to consider quality values when making backtracking decisions
bool halfAndHalf = false, // hacky way of supporting separate revisitable regions
bool maqPenalty = true) :
_refs(refs),
_qry(NULL),
_qlen(0),
_qual(NULL),
_name(NULL),
_ebwt(ebwt),
_params(params),
_unrevOff(0),
_1revOff(0),
_2revOff(0),
_3revOff(0),
_maqPenalty(maqPenalty),
_qualThresh(qualThresh),
_pairs(NULL),
_elims(NULL),
_mms(),
_refcs(),
_chars(NULL),
_reportPartials(reportPartials),
_reportExacts(reportExacts),
_reportRanges(reportRanges),
_partials(partials),
_muts(muts),
_os(os),
_sanity(_os != NULL && _os->size() > 0),
_considerQuals(considerQuals),
_halfAndHalf(halfAndHalf),
_5depth(0),
_3depth(0),
_numBts(0),
_totNumBts(0),
_maxBts(maxBts),
_precalcedSideLocus(false),
_preLtop(),
_preLbot(),
_verbose(verbose),
_ihits(0llu)
{ }
~GreedyDFSRangeSource() {
if(_pairs != NULL) delete[] _pairs;
if(_elims != NULL) delete[] _elims;
if(_chars != NULL) delete[] _chars;
}
/**
* Set a new query read.
*/
void setQuery(Read& r) {
const bool fw = _params.fw();
const bool ebwtFw = _ebwt->fw();
if(ebwtFw) {
_qry = fw ? &r.patFw : &r.patRc;
_qual = fw ? &r.qual : &r.qualRev;
} else {
_qry = fw ? &r.patFwRev : &r.patRcRev;
_qual = fw ? &r.qualRev : &r.qual;
}
_name = &r.name;
// Reset _qlen
if(length(*_qry) > _qlen) {
try {
_qlen = length(*_qry);
// Resize _pairs
if(_pairs != NULL) { delete[] _pairs; }
_pairs = new TIndexOffU[_qlen*_qlen*8];
// Resize _elims
if(_elims != NULL) { delete[] _elims; }
_elims = new uint8_t[_qlen*_qlen];
memset(_elims, 0, _qlen*_qlen);
// Resize _chars
if(_chars != NULL) { delete[] _chars; }
_chars = new char[_qlen];
assert(_pairs != NULL && _elims != NULL && _chars != NULL);
} catch(std::bad_alloc& e) {
ThreadSafe _ts(&gLock);
cerr << "Unable to allocate memory for depth-first "
<< "backtracking search; new length = " << length(*_qry)
<< endl;
throw 1;
}
} else {
// New length is less than old length, so there's no need
// to resize any data structures.
assert(_pairs != NULL && _elims != NULL && _chars != NULL);
_qlen = length(*_qry);
}
_mms.clear();
_refcs.clear();
assert_geq(length(*_qual), _qlen);
if(_verbose) {
cout << "setQuery(_qry=" << (*_qry) << ", _qual=" << (*_qual) << ")" << endl;
}
// Initialize the random source using the new read as part of the
// seed.
_color = r.color;
_seed = r.seed;
_patid = r.patid;
_primer = r.primer;
_trimc = r.trimc;
_rand.init(r.seed);
}
/**
* Apply a batch of mutations to this read, possibly displacing a
* previous batch of mutations.
*/
void setMuts(String<QueryMutation>* muts) {
if(_muts != NULL) {
// Undo previous mutations
assert_gt(length(*_muts), 0);
undoPartialMutations();
}
_muts = muts;
if(_muts != NULL) {
assert_gt(length(*_muts), 0);
applyPartialMutations();
}
}
/**
* Set backtracking constraints.
*/
void setOffs(uint32_t depth5, // depth of far edge of hi-half
uint32_t depth3, // depth of far edge of lo-half
uint32_t unrevOff, // depth above which we cannot backtrack
uint32_t revOff1, // depth above which we may backtrack just once
uint32_t revOff2, // depth above which we may backtrack just twice
uint32_t revOff3) // depth above which we may backtrack just three times
{
_5depth = depth5;
_3depth = depth3;
assert_geq(depth3, depth5);
_unrevOff = unrevOff;
_1revOff = revOff1;
_2revOff = revOff2;
_3revOff = revOff3;
}
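// Example (hypothetical values): setOffs(0, 0, 14, 28, 28, 28) forbids
// backtracking at depths [0,14), allows at most one backtrack to land
// in [14,28), and, because revOff1 == revOff2 == revOff3, never allows
// a second mismatch within the first 28 positions.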
/**
* Reset number of backtracks to 0.
*/
void resetNumBacktracks() {
_totNumBts = 0;
}
/**
* Return number of backtracks since the last time the count was
* reset.
*/
uint32_t numBacktracks() {
return _totNumBts;
}
/**
* Set whether to report exact hits.
*/
void setReportExacts(int stratum) {
_reportExacts = stratum;
}
/**
* Set the Bowtie index to search against.
*/
void setEbwt(const Ebwt<String<Dna> >* ebwt) {
_ebwt = ebwt;
}
/**
* Return the current range
*/
Range& range() {
return _curRange;
}
/**
* Set _qlen. Don't let it exceed length of query.
*/
void setQlen(uint32_t qlen) {
assert(_qry != NULL);
_qlen = min<uint32_t>((uint32_t)length(*_qry), qlen);
}
/// Return the maximum number of allowed backtracks in a given call
/// to backtrack()
uint32_t maxBacktracks() {
return _maxBts;
}
/**
* Initiate the recursive backtracking routine starting at the
* extreme right-hand side of the pattern. Use the ftab to match
* the first several characters in one chomp, as long as doing so
* does not "jump over" any legal backtracking targets.
*
* Return true iff the HitSink has indicated that we're done with
* this read.
*/
bool backtrack(uint32_t ham = 0) {
assert_gt(length(*_qry), 0);
assert_leq(_qlen, length(*_qry));
assert_geq(length(*_qual), length(*_qry));
const Ebwt<String<Dna> >& ebwt = *_ebwt;
int ftabChars = ebwt._eh._ftabChars;
int nsInSeed = 0; int nsInFtab = 0;
if(!tallyNs(nsInSeed, nsInFtab)) {
// No alignments are possible because of the distribution
// of Ns in the read in combination with the backtracking
// constraints.
return false;
}
bool ret;
// m = depth beyond which ftab must not extend or else we might
// miss some legitimate paths
uint32_t m = min<uint32_t>(_unrevOff, (uint32_t)_qlen);
if(nsInFtab == 0 && m >= (uint32_t)ftabChars) {
uint32_t ftabOff = calcFtabOff();
TIndexOffU top = ebwt.ftabHi(ftabOff);
TIndexOffU bot = ebwt.ftabLo(ftabOff+1);
if(_qlen == (TIndexOffU)ftabChars && bot > top) {
// We have a match!
if(_reportPartials > 0) {
// Oops - we're trying to find seedlings, so we've
// gone too far; start again
ret = backtrack(0, // depth
0, // top
0, // bot
ham,
nsInFtab > 0);
} else {
// We have a match!
ret = reportAlignment(0, top, bot, ham);
}
} else if (bot > top) {
// We have an arrow pair from which we can backtrack
ret = backtrack(ftabChars, // depth
top, // top
bot, // bot
ham,
nsInFtab > 0);
} else {
// The arrows are already closed; give up
ret = false;
}
} else {
// The ftab *does* extend past the unrevisitable portion;
// we can't use it in this case, because we might jump past
// a legitimate mismatch
ret = backtrack(0, // depth
0, // top
0, // bot
ham,
// disable ftab jumping if there are
// any Ns in it
nsInFtab > 0);
}
if(finalize()) ret = true;
return ret;
}
/**
* If there are any buffered results that have yet to be committed,
* commit them. This happens when looking for partial alignments.
*/
bool finalize() {
bool ret = false;
if(_reportPartials > 0) {
// We're in partial alignment mode; take elements of the
// _partialBuf and install them in the _partials database
assert(_partials != NULL);
if(_partialsBuf.size() > 0) {
#ifndef NDEBUG
for(size_t i = 0; i < _partialsBuf.size(); i++) {
assert(_partialsBuf[i].repOk(_qualThresh, (uint32_t)_qlen, (*_qual), _maqPenalty));
}
#endif
_partials->addPartials(_params.patId(), _partialsBuf);
_partialsBuf.clear();
ret = true;
} else {
assert(!ret);
}
}
assert_eq(0, _partialsBuf.size());
return ret;
}
/**
* Starting at the given "depth" relative to the 5' end, and the
* given top and bot indexes (where top=0 and bot=0 means it's up
* to us to calculate the initial range), and initial weighted
* hamming distance iham, find a hit using randomized, quality-
* aware backtracking.
*/
bool backtrack(uint32_t depth,
TIndexOffU top,
TIndexOffU bot,
uint32_t iham = 0,
bool disableFtab = false)
{
HitSinkPerThread& sink = _params.sink();
_ihits = sink.retainedHits().size();
// Initiate the recursive, randomized quality-aware backtracker
// with a stack depth of 0 (no backtracks so far)
_bailedOnBacktracks = false;
bool done = backtrack(0, depth, _unrevOff, _1revOff, _2revOff, _3revOff,
top, bot, iham, iham, _pairs, _elims, disableFtab);
_totNumBts += _numBts;
_numBts = 0;
_precalcedSideLocus = false;
_bailedOnBacktracks = false;
return done;
}
/**
* Recursive routine for progressing to the next backtracking
* decision given some initial conditions. If a hit is found, it
* is recorded and true is returned. Otherwise, if there are more
* backtracking opportunities, the function will call itself
* recursively and return the result. As soon as there is a
* mismatch and no backtracking opportunities, false is returned.
*/
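// Note on memory layout: each recursion frame owns a slice of _qlen*8
// entries in the pairs array (4 tops then 4 bots per depth) and _qlen
// entries in the elims array; before recursing, a frame hands its
// callee the next slice (see newPairs/newElims below), so descendants
// never clobber an ancestor's state.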
bool backtrack(uint32_t stackDepth, // depth of the recursion stack; = # mismatches so far
uint32_t depth, // next depth where a post-pair needs to be calculated
uint32_t unrevOff, // depths < unrevOff are unrevisitable
uint32_t oneRevOff,// depths < oneRevOff are 1-revisitable
uint32_t twoRevOff,// depths < twoRevOff are 2-revisitable
uint32_t threeRevOff,// depths < threeRevOff are 3-revisitable
TIndexOffU top, // top arrow in pair prior to 'depth'
TIndexOffU bot, // bottom arrow in pair prior to 'depth'
uint32_t ham, // weighted hamming distance so far
uint32_t iham, // initial weighted hamming distance
TIndexOffU* pairs, // portion of pairs array to be used for this backtrack frame
uint8_t* elims, // portion of elims array to be used for this backtrack frame
bool disableFtab = false)
{
// Can't have already exceeded weighted hamming distance threshold
assert_leq(stackDepth, depth);
assert_gt(length(*_qry), 0);
assert_leq(_qlen, length(*_qry));
assert_geq(length(*_qual), length(*_qry));
assert(_qry != NULL);
assert(_qual != NULL);
assert(_name != NULL);
assert(_qlen != 0);
assert_leq(ham, _qualThresh);
assert_lt(depth, _qlen); // can't have run off the end of qry
assert_geq(bot, top); // could be that both are 0
assert(pairs != NULL);
assert(elims != NULL);
assert_leq(stackDepth, _qlen);
const Ebwt<String<Dna> >& ebwt = *_ebwt;
HitSinkPerThread& sink = _params.sink();
uint64_t prehits = sink.numValidHits();
if(_halfAndHalf) {
assert_eq(0, _reportPartials);
assert_gt(_3depth, _5depth);
}
if(_reportPartials) {
assert(!_halfAndHalf);
}
if(_verbose) {
cout << " backtrack(stackDepth=" << stackDepth << ", "
<< "depth=" << depth << ", "
<< "top=" << top << ", "
<< "bot=" << bot << ", "
<< "ham=" << ham << ", "
<< "iham=" << iham << ", "
<< "pairs=" << pairs << ", "
<< "elims=" << (void*)elims << "): \"";
for(int i = (int)depth - 1; i >= 0; i--) {
cout << _chars[i];
}
cout << "\"" << endl;
}
// Do this early on so that we can clear _precalcedSideLocus
// before we have too many opportunities to bail and leave it
// 'true'
SideLocus ltop, lbot;
if(_precalcedSideLocus) {
ltop = _preLtop;
lbot = _preLbot;
_precalcedSideLocus = false;
} else if(top != 0 || bot != 0) {
SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
}
// Check whether we've exceeded any backtracking limit
if(_halfAndHalf) {
if(_maxBts > 0 && _numBts == _maxBts) {
_bailedOnBacktracks = true;
return false;
}
_numBts++;
}
// # positions with at least one legal outgoing path
uint32_t altNum = 0;
// # positions tied for "best" outgoing qual
uint32_t eligibleNum = 0;
// total range-size for all eligibles
TIndexOffU eligibleSz = 0;
// If there is just one eligible slot at the moment (a common
// case), these are its parameters
uint32_t eli = 0;
bool elignore = true; // ignore the el values because they didn't come from a recent override
TIndexOffU eltop = 0;
TIndexOffU elbot = 0;
uint32_t elham = ham;
char elchar = 0;
int elcint = 0;
// The lowest quality value associated with any alternative
// ranges; all alternative ranges with this quality are
// eligible
uint8_t lowAltQual = 0xff;
uint32_t d = depth;
uint32_t cur = (uint32_t)_qlen - d - 1; // current offset into _qry
while(cur < _qlen) {
// Try to advance one more position given the current top/bot range
if(_verbose) {
cout << " cur=" << cur << " \"";
for(int i = (int)d - 1; i >= 0; i--) {
cout << _chars[i];
}
cout << "\"";
}
// If we're searching for a half-and-half solution, then
// enforce the boundary-crossing constraints here.
if(_halfAndHalf && !hhCheckTop(stackDepth, d, iham, _mms, prehits)) {
return false;
}
bool curIsEligible = false;
// Reset eligibleNum and eligibleSz if there are any
// eligible pairs discovered at this spot
bool curOverridesEligible = false;
// Determine whether ranges at this location are
// candidates for backtracking
int c = (int)(*_qry)[cur];
assert_leq(c, 4);
uint8_t q = qualAt(cur);
// The current query position is a legit alternative if (a) it is
// not in the unrevisitable region, and (b) backtracking here would
// not exceed the quality ceiling (if one is in force)
bool curIsAlternative =
(d >= unrevOff) &&
(!_considerQuals ||
(ham + mmPenalty(_maqPenalty, q) <= _qualThresh));
if(curIsAlternative) {
if(_considerQuals) {
// Is it the best alternative?
if(q < lowAltQual) {
// Ranges at this depth in this backtracking frame are
// eligible, unless we learn otherwise. Ranges previously
// thought to be eligible are not any longer.
curIsEligible = true;
curOverridesEligible = true;
} else if(q == lowAltQual) {
// Ranges at this depth in this backtracking frame
// are eligible, unless we learn otherwise
curIsEligible = true;
}
} else {
// When quality values are not considered, all positions
// are eligible
curIsEligible = true;
}
}
if(curIsEligible) assert(curIsAlternative);
if(curOverridesEligible) assert(curIsEligible);
if(curIsAlternative && !curIsEligible) {
assert_gt(eligibleSz, 0);
assert_gt(eligibleNum, 0);
}
if(_verbose) {
cout << " alternative: " << curIsAlternative;
cout << ", eligible: " << curIsEligible;
if(curOverridesEligible) cout << "(overrides)";
cout << endl;
}
// If c is 'N', then it's guaranteed to be a mismatch
if(c == 4 && d > 0) {
// Force the 'else if(curIsAlternative)' branch below
top = bot = 1;
} else if(c == 4) {
// We'll take the 'if(top == 0 && bot == 0)' branch below
assert_eq(0, top);
assert_eq(0, bot);
}
// Calculate the ranges for this position
if(top == 0 && bot == 0) {
// Calculate first quartet of ranges using the _fchr[]
// array
pairs[0 + 0] = ebwt._fchr[0];
pairs[0 + 4] = pairs[1 + 0] = ebwt._fchr[1];
pairs[1 + 4] = pairs[2 + 0] = ebwt._fchr[2];
pairs[2 + 4] = pairs[3 + 0] = ebwt._fchr[3];
pairs[3 + 4] = ebwt._fchr[4];
// Update top and bot
if(c < 4) {
top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
assert_geq(bot, top);
}
} else if(curIsAlternative) {
// Clear pairs
memset(&pairs[d*8], 0, 8 * OFF_SIZE);
// Calculate next quartet of ranges
ebwt.mapLFEx(ltop, lbot, &pairs[d*8], &pairs[(d*8)+4]);
// Update top and bot
if(c < 4) {
top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c);
assert_geq(bot, top);
}
} else {
// This query character is not even a legitimate
// alternative (because backtracking here would blow
// our mismatch quality budget), so no need to do the
// bookkeeping for the entire quartet, just do c
if(c < 4) {
if(top+1 == bot) {
bot = top = ebwt.mapLF1(top, ltop, c);
if(bot != OFF_MASK) bot++;
} else {
top = ebwt.mapLF(ltop, c); bot = ebwt.mapLF(lbot, c);
assert_geq(bot, top);
}
}
}
if(top != bot) {
// Calculate loci from row indices; do it now so that
// those prefetches are fired off as soon as possible.
// This eventually calls SideLocus::initFromRow().
SideLocus::initFromTopBot(top, bot, ebwt._eh, ebwt._ebwt, ltop, lbot);
}
// Update the elim array
eliminate(elims, d, c);
if(curIsAlternative) {
// Given the just-calculated range quartet, update
// elims, altNum, eligibleNum, eligibleSz
for(int i = 0; i < 4; i++) {
if(i == c) continue;
assert_leq(pairTop(pairs, d, i), pairBot(pairs, d, i));
TIndexOffU spread = pairSpread(pairs, d, i);
if(spread == 0) {
// Indicate this char at this position is
// eliminated as far as this backtracking frame is
// concerned, since its range is empty
elims[d] |= (1 << i);
assert_lt(elims[d], 16);
}
if(spread > 0 && ((elims[d] & (1 << i)) == 0)) {
// This char at this position is an alternative
if(curIsEligible) {
if(curOverridesEligible) {
// Only now that we know there is at least
// one potential backtrack target at this
// most-eligible position should we reset
// these eligibility parameters
lowAltQual = q;
eligibleNum = 0;
eligibleSz = 0;
curOverridesEligible = false;
// Remember these parameters in case
// this turns out to be the only
// eligible target
eli = d;
eltop = pairTop(pairs, d, i);
elbot = pairBot(pairs, d, i);
assert_eq(elbot-eltop, spread);
elham = mmPenalty(_maqPenalty, q);
elchar = "acgt"[i];
elcint = i;
elignore = false;
}
eligibleSz += spread;
eligibleNum++;
}
assert_gt(eligibleSz, 0);
assert_gt(eligibleNum, 0);
altNum++;
}
}
}
if(altNum > 0) {
assert_gt(eligibleSz, 0);
assert_gt(eligibleNum, 0);
}
assert_leq(eligibleNum, eligibleSz);
assert_leq(eligibleNum, altNum);
assert_lt(elims[d], 16);
assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
// Achieved a match, but need to keep going
bool backtrackDespiteMatch = false;
bool reportedPartial = false;
if(cur == 0 && // we've consumed the entire pattern
top < bot && // there's a hit to report
stackDepth < _reportPartials && // not yet used up our mismatches
_reportPartials > 0) // there are still legal backtracking targets
{
assert(!_halfAndHalf);
if(altNum > 0) backtrackDespiteMatch = true;
if(stackDepth > 0) {
// This is a legit seedling; report it
reportPartial(stackDepth);
reportedPartial = true;
}
// Now continue on to find legitimate seedlings with
// more mismatches than this one
}
// Check whether we've obtained an exact alignment when
// we've been instructed not to report exact alignments
bool invalidExact = false;
if(cur == 0 && stackDepth == 0 && bot > top && !_reportExacts) {
invalidExact = true;
backtrackDespiteMatch = true;
}
// Set this to true if the only way to make legal progress
// is via one or more additional backtracks. This is
// helpful in half-and-half mode.
bool mustBacktrack = false;
bool invalidHalfAndHalf = false;
if(_halfAndHalf) {
ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
if((d == (_5depth-1)) && top < bot) {
// We're crossing the boundary separating the hi-half
// from the non-seed portion of the read.
// We should induce a mismatch if we haven't mismatched
// yet, so that we don't waste time pursuing a match
// that was covered by a previous phase
assert_eq(0, _reportPartials);
assert_leq(stackDepth, lim-1);
invalidHalfAndHalf = (stackDepth == 0);
if(stackDepth == 0 && altNum > 0) {
backtrackDespiteMatch = true;
mustBacktrack = true;
} else if(stackDepth == 0) {
// We're returning from the bottommost frame
// without having found any hits; let's
// sanity-check that there really aren't any
return false;
}
}
else if((d == (_3depth-1)) && top < bot) {
// We're crossing the boundary separating the lo-half
// from the non-seed portion of the read
assert_eq(0, _reportPartials);
assert_leq(stackDepth, lim);
assert_gt(stackDepth, 0);
// Count the mismatches in the lo and hi halves
uint32_t loHalfMms = 0, hiHalfMms = 0;
assert_geq(_mms.size(), stackDepth);
for(size_t i = 0; i < stackDepth; i++) {
uint32_t d = (uint32_t)_qlen - _mms[i] - 1;
if (d < _5depth) hiHalfMms++;
else if(d < _3depth) loHalfMms++;
else assert(false);
}
assert_leq(loHalfMms + hiHalfMms, lim);
invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
if((stackDepth < 2 || invalidHalfAndHalf) && altNum > 0) {
// We backtracked fewer times than necessary;
// force a backtrack
mustBacktrack = true;
backtrackDespiteMatch = true;
} else if(stackDepth < 2) {
return false;
}
}
if(d < _5depth-1) {
assert_leq(stackDepth, lim-1);
}
else if(d >= _5depth && d < _3depth-1) {
assert_gt(stackDepth, 0);
assert_leq(stackDepth, lim);
}
}
// This is necessary for the rare case where we're about
// to declare success because bot > top and we've consumed
// the final character, but all hits between top and bot
// are spurious. This check ensures that we keep looking
// for non-spurious hits in that case.
if(cur == 0 && // we made it to the left-hand-side of the read
bot > top && // there are alignments to report
!invalidHalfAndHalf && // alignment isn't disqualified by half-and-half requirement
!invalidExact && // alignment isn't disqualified by no-exact-hits setting
!reportedPartial) // for when it's a partial alignment we've already reported
{
bool ret = reportAlignment(stackDepth, top, bot, ham);
if(!ret) {
// reportAlignment returned false, so enter the
// backtrack loop and keep going
top = bot;
} else {
// reportAlignment returned true, so stop
return true;
}
}
//
// Mismatch with alternatives
//
while((top == bot || backtrackDespiteMatch) && altNum > 0) {
if(_verbose) cout << " top (" << top << "), bot ("
<< bot << ") with " << altNum
<< " alternatives, eligible: "
<< eligibleNum << ", " << eligibleSz
<< endl;
assert_gt(eligibleSz, 0);
assert_gt(eligibleNum, 0);
// Mismatch! Must now choose where we are going to
// take our quality penalty. We can only look as far
// back as our last decision point.
assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
// Pick out the arrow pair we selected and target it
// for backtracking
ASSERT_ONLY(uint32_t eligiblesVisited = 0);
size_t i = d, j = 0;
assert_geq(i, depth);
TIndexOffU bttop = 0;
TIndexOffU btbot = 0;
uint32_t btham = ham;
char btchar = 0;
int btcint = 0;
uint32_t icur = 0;
// The common case is that eligibleNum == 1
if(eligibleNum > 1 || elignore) {
ASSERT_ONLY(bool foundTarget = false);
// Walk from left to right
for(; i >= depth; i--) {
assert_geq(i, unrevOff);
icur = (uint32_t)(_qlen - i - 1); // current offset into _qry
uint8_t qi = qualAt(icur);
assert_lt(elims[i], 16);
if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
// This is the leftmost eligible position with at
// least one remaining backtrack target
TIndexOffU posSz = 0;
// Add up the spreads for A, C, G, T
for(j = 0; j < 4; j++) {
if((elims[i] & (1 << j)) == 0) {
assert_gt(pairSpread(pairs, i, j), 0);
posSz += pairSpread(pairs, i, j);
}
}
// Generate a random number
assert_gt(posSz, 0);
uint32_t r = _rand.nextU32() % posSz;
for(j = 0; j < 4; j++) {
if((elims[i] & (1 << j)) == 0) {
// This range has not been eliminated
ASSERT_ONLY(eligiblesVisited++);
uint32_t spread = pairSpread(pairs, i, j);
if(r < spread) {
// This is our randomly-selected
// backtrack target
ASSERT_ONLY(foundTarget = true);
bttop = pairTop(pairs, i, j);
btbot = pairBot(pairs, i, j);
btham += mmPenalty(_maqPenalty, qi);
btcint = (uint32_t)j;
btchar = "acgt"[j];
assert_leq(btham, _qualThresh);
break; // found our target; we can stop
}
r -= spread;
}
}
assert(foundTarget);
break; // escape left-to-right walk
}
}
assert_leq(i, d);
assert_lt(j, 4);
assert_leq(eligiblesVisited, eligibleNum);
assert(foundTarget);
assert_neq(0, btchar);
assert_gt(btbot, bttop);
assert_leq(btbot-bttop, eligibleSz);
} else {
// There was only one eligible target; we can just
// copy its parameters
assert_eq(1, eligibleNum);
assert(!elignore);
i = eli;
bttop = eltop;
btbot = elbot;
btham += elham;
j = btcint = elcint;
btchar = elchar;
assert_neq(0, btchar);
assert_gt(btbot, bttop);
assert_leq(btbot-bttop, eligibleSz);
}
// This is the earliest that we know what the next top/
// bot combo is going to be
SideLocus::initFromTopBot(bttop, btbot,
ebwt._eh, ebwt._ebwt,
_preLtop, _preLbot);
icur = (uint32_t)(_qlen - i - 1); // current offset into _qry
// Slide over to the next backtracking frame within
// pairs and elims; won't interfere with our frame or
// any of our parents' frames
TIndexOffU *newPairs = pairs + (_qlen*8);
uint8_t *newElims = elims + (_qlen);
// If we've selected a backtracking target that's in
// the 1-revisitable region, then we ask the recursive
// callee to consider the 1-revisitable region as also
// being unrevisitable (since we just "used up" all of
// our visits)
uint32_t btUnrevOff = unrevOff;
uint32_t btOneRevOff = oneRevOff;
uint32_t btTwoRevOff = twoRevOff;
uint32_t btThreeRevOff = threeRevOff;
assert_geq(i, unrevOff);
assert_geq(oneRevOff, unrevOff);
assert_geq(twoRevOff, oneRevOff);
assert_geq(threeRevOff, twoRevOff);
if(i < oneRevOff) {
// Extend unrevisitable region to include former 1-
// revisitable region
btUnrevOff = oneRevOff;
// Extend 1-revisitable region to include former 2-
// revisitable region
btOneRevOff = twoRevOff;
// Extend 2-revisitable region to include former 3-
// revisitable region
btTwoRevOff = threeRevOff;
}
else if(i < twoRevOff) {
// Extend 1-revisitable region to include former 2-
// revisitable region
btOneRevOff = twoRevOff;
// Extend 2-revisitable region to include former 3-
// revisitable region
btTwoRevOff = threeRevOff;
}
else if(i < threeRevOff) {
// Extend 2-revisitable region to include former 3-
// revisitable region
btTwoRevOff = threeRevOff;
}
// Note the character that we're backtracking on in the
// mm array:
if(_mms.size() <= stackDepth) {
assert_eq(_mms.size(), stackDepth);
_mms.push_back(icur);
} else {
_mms[stackDepth] = icur;
}
assert_eq(1, dna4Cat[(int)btchar]);
if(_refcs.size() <= stackDepth) {
assert_eq(_refcs.size(), stackDepth);
_refcs.push_back(btchar);
} else {
_refcs[stackDepth] = btchar;
}
#ifndef NDEBUG
for(uint32_t j = 0; j < stackDepth; j++) {
assert_neq(_mms[j], icur);
}
#endif
_chars[i] = btchar;
assert_leq(i+1, _qlen);
bool ret;
if(i+1 == _qlen) {
ret = reportAlignment(stackDepth+1, bttop, btbot, btham);
} else if(_halfAndHalf &&
!disableFtab &&
_2revOff == _3revOff &&
i+1 < (uint32_t)ebwt._eh._ftabChars &&
(uint32_t)ebwt._eh._ftabChars <= _5depth)
{
// The ftab doesn't extend past the unrevisitable portion,
// so we can go ahead and use it
// Rightmost char gets least significant bit-pairs
int ftabChars = ebwt._eh._ftabChars;
TIndexOffU ftabOff = (TIndexOffU)(int)(*_qry)[_qlen - ftabChars];
assert_lt(ftabOff, 4);
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
for(int j = ftabChars - 1; j > 0; j--) {
ftabOff <<= 2;
if(_qlen-j == icur) {
ftabOff |= btcint;
} else {
assert_lt((int)(*_qry)[_qlen-j], 4);
ftabOff |= (int)(*_qry)[_qlen-j];
}
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
}
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
TIndexOffU ftabTop = ebwt.ftabHi(ftabOff);
TIndexOffU ftabBot = ebwt.ftabLo(ftabOff+1);
assert_geq(ftabBot, ftabTop);
if(ftabTop == ftabBot) {
ret = false;
} else {
assert(!_precalcedSideLocus);
assert_leq(iham, _qualThresh);
ret = backtrack(stackDepth+1,
ebwt._eh._ftabChars,
btUnrevOff, // new unrevisitable boundary
btOneRevOff, // new 1-revisitable boundary
btTwoRevOff, // new 2-revisitable boundary
btThreeRevOff, // new 3-revisitable boundary
ftabTop, // top arrow in range prior to 'depth'
ftabBot, // bottom arrow in range prior to 'depth'
btham, // weighted hamming distance so far
iham, // initial weighted hamming distance
newPairs,
newElims);
}
} else {
// We already called initFromTopBot for the range
// we're going to continue from
_precalcedSideLocus = true;
assert_leq(iham, _qualThresh);
// Continue from selected alternative range
ret = backtrack(stackDepth+1,// added 1 mismatch to alignment
(uint32_t)i+1, // start from the position just after the mismatch
btUnrevOff, // new unrevisitable boundary
btOneRevOff, // new 1-revisitable boundary
btTwoRevOff, // new 2-revisitable boundary
btThreeRevOff, // new 3-revisitable boundary
bttop, // top arrow in range prior to 'depth'
btbot, // bottom arrow in range prior to 'depth'
btham, // weighted hamming distance so far
iham, // initial weighted hamming distance
newPairs,
newElims);
}
if(ret) {
assert_gt(sink.numValidHits(), prehits);
return true; // return, signaling that we're done
}
if(_bailedOnBacktracks ||
(_halfAndHalf && (_maxBts > 0) && (_numBts >= _maxBts)))
{
_bailedOnBacktracks = true;
return false;
}
// No hit was reported; update elims[], eligibleSz,
// eligibleNum, altNum
_chars[i] = (*_qry)[icur];
assert_neq(15, elims[i]);
ASSERT_ONLY(uint8_t oldElim = elims[i]);
elims[i] |= (1 << j);
assert_lt(elims[i], 16);
assert_gt(elims[i], oldElim);
eligibleSz -= (btbot-bttop);
eligibleNum--;
elignore = true;
assert_geq(eligibleNum, 0);
altNum--;
assert_geq(altNum, 0);
if(altNum == 0) {
// No alternative backtracking points; all legal
// backtracking targets have been exhausted
assert_eq(0, altNum);
assert_eq(0, eligibleSz);
assert_eq(0, eligibleNum);
return false;
}
else if(eligibleNum == 0 && _considerQuals) {
// Find the next set of eligible backtrack points
// by re-scanning this backtracking frame (from
// 'depth' up to 'd')
lowAltQual = 0xff;
for(size_t k = d; k >= depth && k <= _qlen; k--) {
size_t kcur = _qlen - k - 1; // current offset into _qry
uint8_t kq = qualAt(kcur);
if(k < unrevOff) break; // already visited all revisitable positions
bool kCurIsAlternative = (ham + mmPenalty(_maqPenalty, kq) <= _qualThresh);
bool kCurOverridesEligible = false;
if(kCurIsAlternative) {
if(kq < lowAltQual) {
// This target is more eligible than
// any targets that came before, so we
// set it to supplant/override them
kCurOverridesEligible = true;
}
if(kq <= lowAltQual) {
// Position is eligible
for(int l = 0; l < 4; l++) {
if((elims[k] & (1 << l)) == 0) {
// Not yet eliminated
TIndexOffU spread = pairSpread(pairs, k, l);
if(kCurOverridesEligible) {
// Clear previous eligible results;
// this one's better
lowAltQual = kq;
kCurOverridesEligible = false;
// Keep these parameters in
// case this target turns
// out to be the only
// eligible target and we
// can avoid having to
// recalculate them
eligibleNum = 0;
eligibleSz = 0;
eli = (uint32_t)k;
eltop = pairTop(pairs, k, l);
elbot = pairBot(pairs, k, l);
assert_eq(elbot-eltop, spread);
elham = mmPenalty(_maqPenalty, kq);
elchar = "acgt"[l];
elcint = l;
elignore = false;
}
eligibleNum++;
assert_gt(spread, 0);
eligibleSz += spread;
}
}
}
}
}
}
assert_gt(eligibleNum, 0);
assert_leq(eligibleNum, altNum);
assert_gt(eligibleSz, 0);
assert_geq(eligibleSz, eligibleNum);
assert(sanityCheckEligibility(depth, d, unrevOff, lowAltQual, eligibleSz, eligibleNum, pairs, elims));
// Try again
} // while((top == bot || backtrackDespiteMatch) && altNum > 0)
if(mustBacktrack || invalidHalfAndHalf || invalidExact) {
return false;
}
// Mismatch with no alternatives
if(top == bot && altNum == 0) {
assert_eq(0, altNum);
assert_eq(0, eligibleSz);
assert_eq(0, eligibleNum);
return false;
}
// Match!
_chars[d] = (*_qry)[cur];
d++; cur--;
} // while(cur < _qlen)
assert_eq(0xffffffff, cur);
assert_gt(bot, top);
if(_reportPartials > 0) {
// Stack depth should not exceed given hamming distance
assert_leq(stackDepth, _reportPartials);
}
bool ret = false;
if(stackDepth >= _reportPartials) {
ret = reportAlignment(stackDepth, top, bot, ham);
}
return ret;
}
/**
* Pretty print a hit along with the backtracking constraints.
*/
void printHit(const Hit& h) {
::printHit(*_os, h, *_qry, _qlen, _unrevOff, _1revOff, _2revOff, _3revOff, _ebwt->fw());
}
/**
* Return true iff we're enforcing a half-and-half constraint
* (forced edits in both seed halves).
*/
bool halfAndHalf() const {
return _halfAndHalf;
}
protected:
/**
* Return true iff we're OK to continue after considering which
* half-seed boundary we're passing through, together with the
* number of mismatches accumulated so far. Return false if we
* should stop because a half-and-half constraint is violated. If
* we're not currently passing a half-seed boundary, just return
* true.
*/
bool hhCheck(uint32_t stackDepth, uint32_t depth,
const std::vector<TIndexOffU>& mms, bool empty)
{
ASSERT_ONLY(uint32_t lim = (_3revOff == _2revOff)? 2 : 3);
if((depth == (_5depth-1)) && !empty) {
// We're crossing the boundary separating the hi-half
// from the non-seed portion of the read.
// We should induce a mismatch if we haven't mismatched
// yet, so that we don't waste time pursuing a match
// that was covered by a previous phase
assert_eq(0, _reportPartials);
assert_leq(stackDepth, lim-1);
return stackDepth > 0;
} else if((depth == (_3depth-1)) && !empty) {
// We're crossing the boundary separating the lo-half
// from the non-seed portion of the read
assert_eq(0, _reportPartials);
assert_leq(stackDepth, lim);
assert_gt(stackDepth, 0);
// Count the mismatches in the lo and hi halves
uint32_t loHalfMms = 0, hiHalfMms = 0;
for(size_t i = 0; i < stackDepth; i++) {
uint32_t depth = (uint32_t)(_qlen - mms[i] - 1);
if (depth < _5depth) hiHalfMms++;
else if(depth < _3depth) loHalfMms++;
else assert(false);
}
assert_leq(loHalfMms + hiHalfMms, lim);
bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
return (stackDepth >= 2 && !invalidHalfAndHalf);
}
if(depth < _5depth-1) {
assert_leq(stackDepth, lim-1);
}
else if(depth >= _5depth && depth < _3depth-1) {
assert_gt(stackDepth, 0);
assert_leq(stackDepth, lim);
}
return true;
}
/**
* Calculate the stratum of the partial (or full) alignment
* currently under consideration. Stratum is equal to the number
* of mismatches in the seed portion of the alignment.
*/
int calcStratum(const std::vector<TIndexOffU>& mms, uint32_t stackDepth) {
int stratum = 0;
for(size_t i = 0; i < stackDepth; i++) {
if(mms[i] >= (_qlen - _3revOff)) {
// This mismatch falls within the seed; count it
// toward the stratum to report
stratum++;
// Don't currently support more than 3
// mismatches in the seed
assert_leq(stratum, 3);
}
}
return stratum;
}
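// Example (illustrative numbers): with _qlen == 36 and _3revOff == 28,
// a mismatch recorded at _mms[i] == 10 satisfies 10 >= (36 - 28), so it
// falls in the seed and contributes to the stratum.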
/**
* Mark character c at depth d as being eliminated with respect to
* future backtracks.
*/
void eliminate(uint8_t *elims, uint32_t d, int c) {
if(c < 4) {
elims[d] = (1 << c);
assert_gt(elims[d], 0);
assert_lt(elims[d], 16);
} else {
elims[d] = 0;
}
assert_lt(elims[d], 16);
}
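// Example (illustrative): eliminate(elims, d, 2) sets elims[d] to
// 0b0100, marking 'G' (the character just matched at depth d) as
// eliminated; a later backtrack at depth d may substitute only A, C
// or T.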
/**
* Return true iff the state of the backtracker as encoded by
* stackDepth, d and iham is compatible with the current half-and-
* half alignment mode. prehits is for sanity checking when
* bailing.
*/
bool hhCheckTop(uint32_t stackDepth,
uint32_t d,
uint32_t iham,
const std::vector<TIndexOffU>& mms,
uint64_t prehits = 0xffffffffffffffffllu)
{
assert_eq(0, _reportPartials);
// Crossing from the hi-half into the lo-half
if(d == _5depth) {
if(_3revOff == _2revOff) {
// Total of 2 mismatches allowed: 1 hi, 1 lo
// The backtracking logic should have prevented us from
// backtracking more than once into this region
assert_leq(stackDepth, 1);
// Reject if we haven't encountered a mismatch by this point
if(stackDepth == 0) {
return false;
}
} else { // if(_3revOff != _2revOff)
// Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
// The backtracking logic should have prevented us from
// backtracking more than twice into this region
assert_leq(stackDepth, 2);
// Reject if we haven't encountered a mismatch by this point
if(stackDepth < 1) {
return false;
}
}
} else if(d == _3depth) {
// Crossing from lo-half to outside of the seed
if(_3revOff == _2revOff) {
// Total of 2 mismatches allowed: 1 hi, 1 lo
// The backtracking logic should have prevented us from
// backtracking more than twice within this region
assert_leq(stackDepth, 2);
// Must have encountered two mismatches by this point
if(stackDepth < 2) {
// We're returning from the bottommost frame
// without having found any hits; let's
// sanity-check that there really aren't any
return false;
}
} else { // if(_3revOff != _2revOff)
// Total of 3 mismatches allowed: 1 hi, 1 or 2 lo
// Count the mismatches in the lo and hi halves
int loHalfMms = 0, hiHalfMms = 0;
assert_geq(mms.size(), stackDepth);
for(size_t i = 0; i < stackDepth; i++) {
TIndexOffU d = (TIndexOffU)(_qlen - mms[i] - 1);
if (d < _5depth) hiHalfMms++;
else if(d < _3depth) loHalfMms++;
else assert(false);
}
assert_leq(loHalfMms + hiHalfMms, 3);
assert_gt(hiHalfMms, 0);
if(loHalfMms == 0) {
// We're returning from the bottommost frame
// without having found any hits; let's
// sanity-check that there really aren't any
return false;
}
assert_geq(stackDepth, 2);
// The backtracking logic should have prevented us from
// backtracking more than twice within this region
assert_leq(stackDepth, 3);
}
} else {
// We didn't just cross a boundary, so do an in-between check
if(d >= _5depth) {
assert_geq(stackDepth, 1);
} else if(d >= _3depth) {
assert_geq(stackDepth, 2);
}
}
return true;
}
/**
* Return the Phred quality value for the most likely base at
* offset 'off' in the read.
*/
inline uint8_t qualAt(size_t off) {
return phredCharToPhredQual((*_qual)[off]);
}
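// The pairs array is laid out in blocks of 8 entries per depth:
// pairs[d*8 + 0..3] hold the top offsets for A, C, G, T and
// pairs[d*8 + 4..7] hold the corresponding bot offsets, as the
// accessors below assume.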
/// Get the top offset for character c at depth d
inline TIndexOffU pairTop(TIndexOffU* pairs, size_t d, size_t c) {
return pairs[d*8 + c + 0];
}
/// Get the bot offset for character c at depth d
inline TIndexOffU pairBot(TIndexOffU* pairs, size_t d, size_t c) {
return pairs[d*8 + c + 4];
}
/// Get the spread between the bot and top offsets for character c
/// at depth d
inline TIndexOffU pairSpread(TIndexOffU* pairs, size_t d, size_t c) {
assert_geq(pairBot(pairs, d, c), pairTop(pairs, d, c));
return pairBot(pairs, d, c) - pairTop(pairs, d, c);
}
/**
* Tally how many Ns occur in the seed region and in the ftab-
* jumpable region of the read. Check whether the mismatches
* induced by the Ns already violate the current policy. Return
* false if the policy is already violated, true otherwise.
*/
bool tallyNs(int& nsInSeed, int& nsInFtab) {
const Ebwt<String<Dna> >& ebwt = *_ebwt;
int ftabChars = ebwt._eh._ftabChars;
// Count Ns in the seed region of the read and short-circuit if
// the configuration of Ns guarantees that there will be no
// valid alignments given the backtracking constraints.
for(size_t i = 0; i < _3revOff; i++) {
if((int)(*_qry)[_qlen-i-1] == 4) {
nsInSeed++;
if(nsInSeed == 1) {
if(i < _unrevOff) {
return false; // Exceeded mm budget on Ns alone
}
} else if(nsInSeed == 2) {
if(i < _1revOff) {
return false; // Exceeded mm budget on Ns alone
}
} else if(nsInSeed == 3) {
if(i < _2revOff) {
return false; // Exceeded mm budget on Ns alone
}
} else {
assert_gt(nsInSeed, 3);
return false; // Exceeded mm budget on Ns alone
}
}
}
// Calculate the number of Ns there are in the region that
// would get jumped over if the ftab were used.
for(size_t i = 0; i < (size_t)ftabChars && i < _qlen; i++) {
if((int)(*_qry)[_qlen-i-1] == 4) nsInFtab++;
}
return true;
}
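// Example (illustrative): with _unrevOff == 14, a single N at depth 5
// makes tallyNs() return false (the forced mismatch lands in the
// unrevisitable region), whereas a single N at depth 20 merely uses up
// one unit of the mismatch budget.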
/**
* Calculate the offset into the ftab for the rightmost 'ftabChars'
* characters of the current query. Rightmost char gets least
* significant bit-pair.
*/
uint32_t calcFtabOff() {
const Ebwt<String<Dna> >& ebwt = *_ebwt;
int ftabChars = ebwt._eh._ftabChars;
uint32_t ftabOff = (*_qry)[_qlen - ftabChars];
assert_lt(ftabOff, 4);
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
for(int i = ftabChars - 1; i > 0; i--) {
ftabOff <<= 2;
assert_lt((uint32_t)(*_qry)[_qlen-i], 4);
ftabOff |= (uint32_t)(*_qry)[_qlen-i];
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
}
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
return ftabOff;
}
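// Worked example: with ftabChars == 4 and a query ending in ...ACGT
// (A=0, C=1, G=2, T=3), the loop computes
// ftabOff = ((((0<<2)|1)<<2)|2)<<2 | 3 == 27, so the rightmost 'T'
// occupies the least significant bit-pair.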
/**
* Mutate the _qry string according to the contents of the _muts
* array, which represents a partial alignment.
*/
void applyPartialMutations() {
if(_muts == NULL) {
// No mutations to apply
return;
}
for(size_t i = 0; i < length(*_muts); i++) {
const QueryMutation& m = (*_muts)[i];
assert_lt(m.pos, _qlen);
assert_leq(m.oldBase, 4);
assert_lt(m.newBase, 4);
assert_neq(m.oldBase, m.newBase);
assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.oldBase);
(*_qry)[m.pos] = (Dna5)(int)m.newBase; // apply it
}
}
/**
* Take partial-alignment mutations present in the _muts list and
* place them on the _mm list so that they become part of the
* reported alignment.
*/
void promotePartialMutations(int stackDepth) {
if(_muts == NULL) {
// No mutations to undo
return;
}
size_t numMuts = length(*_muts);
assert_leq(numMuts, _qlen);
for(size_t i = 0; i < numMuts; i++) {
// Entries in _mms[] are in terms of offset into
// _qry - not in terms of offset from 3' or 5' end
assert_lt(stackDepth + i, _qlen);
// All partial-alignment mutations should fall
// within bounds
assert_lt((*_muts)[i].pos, _qlen);
// All partial-alignment mutations should fall
// within unrevisitable region
assert_lt(_qlen - (*_muts)[i].pos - 1, _unrevOff);
#ifndef NDEBUG
// Shouldn't be any overlap between mismatched positions
// and positions that mismatched in the partial alignment.
for(size_t j = 0; j < stackDepth + i; j++) {
assert_neq(_mms[j], (uint32_t)(*_muts)[i].pos);
}
#endif
if(_mms.size() <= stackDepth + i) {
assert_eq(_mms.size(), stackDepth + i);
_mms.push_back((*_muts)[i].pos);
} else {
_mms[stackDepth + i] = (*_muts)[i].pos;
}
if(_refcs.size() <= stackDepth + i) {
assert_eq(_refcs.size(), stackDepth + i);
_refcs.push_back("ACGT"[(*_muts)[i].newBase]);
} else {
_refcs[stackDepth + i] = "ACGT"[(*_muts)[i].newBase];
}
}
}
/**
* Undo mutations to the _qry string, returning it to the original
* read.
*/
void undoPartialMutations() {
if(_muts == NULL) {
// No mutations to undo
return;
}
for(size_t i = 0; i < length(*_muts); i++) {
const QueryMutation& m = (*_muts)[i];
assert_lt(m.pos, _qlen);
assert_leq(m.oldBase, 4);
assert_lt(m.newBase, 4);
assert_neq(m.oldBase, m.newBase);
assert_eq((uint32_t)((*_qry)[m.pos]), (uint32_t)m.newBase);
(*_qry)[m.pos] = (Dna5)(int)m.oldBase; // undo it
}
}
/**
* Report a range of alignments with # mismatches = stackDepth and
* with the mutations (also mismatches) contained in _muts. The
* range is delimited by top and bot. Returns true iff one or more
* full alignments were successfully reported and the caller can
* stop searching.
*/
bool reportAlignment(uint32_t stackDepth, TIndexOffU top,
TIndexOffU bot, uint16_t cost)
{
#ifndef NDEBUG
// No two elements of _mms[] should be the same
assert_geq(_mms.size(), stackDepth);
for(size_t i = 0; i < stackDepth; i++) {
for(size_t j = i+1; j < stackDepth; j++) {
assert_neq(_mms[j], _mms[i]);
}
// All elements of _mms[] should fall within bounds
assert_lt(_mms[i], _qlen);
}
#endif
if(_reportPartials) {
assert_leq(stackDepth, _reportPartials);
if(stackDepth > 0) {
// Report this partial alignment. A partial alignment
// is defined purely by its mismatches; top and bot are
// ignored.
reportPartial(stackDepth);
}
return false; // keep going - we want to find all partial alignments
}
int stratum = 0;
if(stackDepth > 0) {
stratum = calcStratum(_mms, stackDepth);
}
assert_lt(stratum, 4);
assert_geq(stratum, 0);
bool hit;
// If _muts != NULL then this alignment extends a partial
// alignment, so we have to account for the differences present
// in the partial.
if(_muts != NULL) {
// Undo partial-alignment mutations to get original _qry
ASSERT_ONLY(String<Dna5> tmp = (*_qry));
undoPartialMutations();
assert_neq(tmp, (*_qry));
// Add the partial-alignment mutations to the _mms[] array
promotePartialMutations(stackDepth);
// All muts are in the seed, so they count toward the stratum
size_t numMuts = length(*_muts);
stratum += numMuts;
cost |= (stratum << 14);
assert_geq(cost, (uint32_t)(stratum << 14));
// Report the range of full alignments
hit = reportFullAlignment((uint32_t)(stackDepth + numMuts), top, bot, stratum, cost);
// Re-apply partial-alignment mutations
applyPartialMutations();
assert_eq(tmp, (*_qry));
} else {
// Report the range of full alignments
cost |= (stratum << 14);
assert_geq(cost, (uint32_t)(stratum << 14));
hit = reportFullAlignment(stackDepth, top, bot, stratum, cost);
}
return hit;
}
/**
* Report a range of full alignments with # mismatches = stackDepth.
* The range is delimited by top and bot. Returns true iff one or
* more full alignments were successfully reported and the caller
* can stop searching.
*/
bool reportFullAlignment(uint32_t stackDepth,
TIndexOffU top,
TIndexOffU bot,
int stratum,
uint16_t cost)
{
assert_gt(bot, top);
if(stackDepth == 0 && !_reportExacts) {
// We are not reporting exact hits (usually because we've
// already reported them as part of a previous invocation
// of the backtracker)
return false;
}
assert(!_reportRanges);
TIndexOffU spread = bot - top;
// Pick a random spot in the range at which to begin reporting
TIndexOffU r = top + (_rand.nextU<TIndexOffU>() % spread);
for(TIndexOffU i = 0; i < spread; i++) {
TIndexOffU ri = r + i;
if(ri >= bot) ri -= spread;
// reportChaseOne takes the _mms[] list in terms of
// their indices into the query string; not in terms
// of their offset from the 3' or 5' end.
assert_geq(cost, (uint32_t)(stratum << 14));
if(_ebwt->reportChaseOne((*_qry), _qual, _name,
_color, _primer, _trimc, colorExEnds,
snpPhred, _refs, _mms, _refcs,
stackDepth, ri, top, bot,
(uint32_t)_qlen, stratum, cost, _patid,
_seed, _params))
{
// Return value of true means that we can stop
return true;
}
// Return value of false means that we should continue
// searching. This could happen if the call to
// reportChaseOne() reported a hit, but the user asked for
// multiple hits and we haven't reached the ceiling yet.
// This might also happen if the call to reportChaseOne()
// didn't report a hit because the alignment was spurious
// (i.e. overlapped some padding).
}
// All range elements were examined and we should keep going
return false;
}
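// Note on the loop above: starting from a random row r in [top, bot)
// and wrapping around (e.g. top=10, bot=14, r=12 visits rows 12, 13,
// 10, 11) examines every row exactly once while randomizing which
// alignment gets reported first.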
/**
* Report the partial alignment represented by the current stack
* state (_mms[] and stackDepth).
*/
bool reportPartial(uint32_t stackDepth) {
// Sanity-check stack depth
if(_3revOff != _2revOff) {
assert_leq(stackDepth, 3);
} else if(_2revOff != _1revOff) {
assert_leq(stackDepth, 2);
} else {
assert_leq(stackDepth, 1);
}
// Possibly report
assert_gt(_reportPartials, 0);
assert(_partials != NULL);
ASSERT_ONLY(uint32_t qualTot = 0);
PartialAlignment al;
al.u64.u64 = 0xffffffffffffffffllu;
assert_leq(stackDepth, 3);
assert_gt(stackDepth, 0);
// First mismatch
assert_gt(_mms.size(), 0);
assert_lt(_mms[0], _qlen);
// First, append the mismatch position in the read
al.entry.pos0 = (uint16_t)_mms[0]; // pos
ASSERT_ONLY(uint8_t qual0 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[0]])));
ASSERT_ONLY(qualTot += qual0);
uint32_t ci = (uint32_t)(_qlen - _mms[0] - 1);
// _chars[] is indexed in terms of RHS-relative depth
int c = (int)(Dna5)_chars[ci];
assert_lt(c, 4);
assert_neq(c, (int)(*_qry)[_mms[0]]);
// Second, append the substituted character for the position
al.entry.char0 = c;
if(stackDepth > 1) {
assert_gt(_mms.size(), 1);
// Second mismatch
assert_lt(_mms[1], _qlen);
// First, append the mismatch position in the read
al.entry.pos1 = (uint16_t)_mms[1]; // pos
ASSERT_ONLY(uint8_t qual1 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[1]])));
ASSERT_ONLY(qualTot += qual1);
ci = (uint32_t)(_qlen - _mms[1] - 1);
// _chars[] is indexed in terms of RHS-relative depth
c = (int)(Dna5)_chars[ci];
assert_lt(c, 4);
assert_neq(c, (int)(*_qry)[_mms[1]]);
// Second, append the substituted character for the position
al.entry.char1 = c;
if(stackDepth > 2) {
assert_gt(_mms.size(), 2);
// Third mismatch
assert_lt(_mms[2], _qlen);
// First, append the mismatch position in the read
al.entry.pos2 = (uint16_t)_mms[2]; // pos
ASSERT_ONLY(uint8_t qual2 = mmPenalty(_maqPenalty, phredCharToPhredQual((*_qual)[_mms[2]])));
ASSERT_ONLY(qualTot += qual2);
ci = (uint32_t)(_qlen - _mms[2] - 1);
// _chars[] is indexed in terms of RHS-relative depth
c = (int)(Dna5)_chars[ci];
assert_lt(c, 4);
assert_neq(c, (int)(*_qry)[_mms[2]]);
// Second, append the substituted character for the position
al.entry.char2 = c;
} else {
// Signal that the '2' slot is empty
al.entry.pos2 = 0xffff;
}
} else {
// Signal that the '1' slot is empty
al.entry.pos1 = 0xffff;
}
assert_leq(qualTot, _qualThresh);
assert(validPartialAlignment(al));
#ifndef NDEBUG
assert(al.repOk(_qualThresh, (uint32_t)_qlen, (*_qual), _maqPenalty));
for(size_t i = 0; i < _partialsBuf.size(); i++) {
assert(validPartialAlignment(_partialsBuf[i]));
assert(!samePartialAlignment(_partialsBuf[i], al));
}
#endif
_partialsBuf.push_back(al);
return true;
}
/**
* Check that the given eligibility parameters (lowAltQual,
* eligibleSz, eligibleNum) are correct, given the appropriate
* inputs (pairs, elims, depth, d, unrevOff)
*/
bool sanityCheckEligibility(uint32_t depth,
uint32_t d,
uint32_t unrevOff,
uint32_t lowAltQual,
uint32_t eligibleSz,
uint32_t eligibleNum,
TIndexOffU* pairs,
uint8_t* elims)
{
// Sanity check that the lay of the land is as we
// expect given eligibleNum and eligibleSz
size_t i = max(depth, unrevOff), j = 0;
uint32_t cumSz = 0;
uint32_t eligiblesVisited = 0;
for(; i <= d; i++) {
uint32_t icur = (uint32_t)(_qlen - i - 1); // current offset into _qry
uint8_t qi = qualAt(icur);
assert_lt(elims[i], 16);
if((qi == lowAltQual || !_considerQuals) && elims[i] != 15) {
// This is an eligible position with at least
// one remaining backtrack target
for(j = 0; j < 4; j++) {
if((elims[i] & (1 << j)) == 0) {
// This pair has not been eliminated
assert_gt(pairBot(pairs, i, j), pairTop(pairs, i, j));
cumSz += pairSpread(pairs, i, j);
eligiblesVisited++;
}
}
}
}
assert_eq(cumSz, eligibleSz);
assert_eq(eligiblesVisited, eligibleNum);
return true;
}
const BitPairReference* _refs; // reference sequences (or NULL if not colorspace)
String<Dna5>* _qry; // query (read) sequence
size_t _qlen; // length of _qry
String<char>* _qual; // quality values for _qry
String<char>* _name; // name of _qry
bool _color; // whether read is colorspace
const Ebwt<String<Dna> >* _ebwt; // Ebwt to search in
const EbwtSearchParams<String<Dna> >& _params; // search parameters
uint32_t _unrevOff; // unrevisitable chunk
uint32_t _1revOff; // 1-revisitable chunk
uint32_t _2revOff; // 2-revisitable chunk
uint32_t _3revOff; // 3-revisitable chunk
/// Whether to round qualities off Maq-style when calculating penalties
bool _maqPenalty;
uint32_t _qualThresh; // only accept hits with weighted
// hamming distance <= _qualThresh
TIndexOffU *_pairs; // ranges, leveled in parallel
// with decision stack
uint8_t *_elims; // which ranges have been
// eliminated, leveled in parallel
// with decision stack
std::vector<TIndexOffU> _mms; // array for holding mismatches
std::vector<uint8_t> _refcs; // array for holding reference chars at mismatch positions
// Entries in _mms[] are in terms of offset into
// _qry - not in terms of offset from 3' or 5' end
char *_chars; // characters selected so far
// If > 0, report partial alignments up to this many mismatches
uint32_t _reportPartials;
/// Whether to report exact (0-mismatch) alignments
bool _reportExacts;
/// When reporting a full alignment, report top/bot; don't chase
/// any of the results
bool _reportRanges;
/// Append partial alignments here
PartialAlignmentManager *_partials;
/// Set of mutations that apply for a partial alignment
String<QueryMutation> *_muts;
/// Reference texts (NULL if they are unavailable)
vector<String<Dna5> >* _os;
/// Whether to use the _os array together with a naive matching
/// algorithm to double-check reported alignments (or the lack
/// thereof)
bool _sanity;
/// Whether to consider quality values when deciding where to
/// backtrack
bool _considerQuals;
bool _halfAndHalf;
/// Depth of 5'-seed-half border
uint32_t _5depth;
/// Depth of 3'-seed-half border
uint32_t _3depth;
/// Default quals
String<char> _qualDefault;
/// Number of backtracks in last call to backtrack()
uint32_t _numBts;
/// Number of backtracks since last reset
uint32_t _totNumBts;
/// Max # of backtracks to allow before giving up
uint32_t _maxBts;
/// Whether we precalculated the Ebwt locus information for the
/// next top/bot pair
bool _precalcedSideLocus;
/// Precalculated top locus
SideLocus _preLtop;
/// Precalculated bot locus
SideLocus _preLbot;
/// Flag to record whether a 'false' return from backtracker is due
/// to having exceeded one or more backtracking limits
bool _bailedOnBacktracks;
/// Source of pseudo-random numbers
RandomSource _rand;
/// Be talkative
bool _verbose;
uint64_t _ihits;
// Holding area for partial alignments
vector<PartialAlignment> _partialsBuf;
// Current range to expose to consumers
Range _curRange;
uint32_t _patid;
char _primer;
char _trimc;
uint32_t _seed;
#ifndef NDEBUG
std::set<TIndexOff> allTops_;
#endif
};
/**
* Class that coordinates quality- and quantity-aware backtracking over
* some range of a read sequence.
*
* The creator can configure the EbwtRangeSource to treat different
* stretches of the read differently.
*/
class EbwtRangeSource : public RangeSource {
typedef Ebwt<String<Dna> > TEbwt;
typedef std::pair<int, int> TIntPair;
public:
EbwtRangeSource(
const TEbwt* ebwt,
bool fw,
TIndexOffU qualLim,
bool reportExacts,
bool verbose,
bool quiet,
int halfAndHalf,
bool partial,
bool maqPenalty,
bool qualOrder,
AlignerMetrics *metrics = NULL) :
RangeSource(),
qry_(NULL),
qlen_(0),
qual_(NULL),
name_(NULL),
ebwt_(ebwt),
fw_(fw),
offRev0_(0),
offRev1_(0),
offRev2_(0),
offRev3_(0),
maqPenalty_(maqPenalty),
qualOrder_(qualOrder),
qualLim_(qualLim),
reportExacts_(reportExacts),
halfAndHalf_(halfAndHalf),
partial_(partial),
depth5_(0),
depth3_(0),
verbose_(verbose),
quiet_(quiet),
skippingThisRead_(false),
metrics_(metrics)
{ curEbwt_ = ebwt_; }
/**
* Set a new query read.
*/
virtual void setQuery(Read& r, Range *seedRange) {
const bool ebwtFw = ebwt_->fw();
if(ebwtFw) {
qry_ = fw_ ? &r.patFw : &r.patRc;
qual_ = fw_ ? &r.qual : &r.qualRev;
} else {
qry_ = fw_ ? &r.patFwRev : &r.patRcRev;
qual_ = fw_ ? &r.qualRev : &r.qual;
}
name_ = &r.name;
if(seedRange != NULL) seedRange_ = *seedRange;
else seedRange_.invalidate();
qlen_ = length(*qry_);
skippingThisRead_ = false;
// Apply edits from the partial alignment to the query pattern
if(seedRange_.valid()) {
qryBuf_ = *qry_;
const size_t srSz = seedRange_.mms.size();
assert_gt(srSz, 0);
assert_eq(srSz, seedRange_.refcs.size());
for(size_t i = 0; i < srSz; i++) {
assert_lt(seedRange_.mms[i], qlen_);
char rc = (char)seedRange_.refcs[i];
assert(rc == 'A' || rc == 'C' || rc == 'G' || rc == 'T');
ASSERT_ONLY(char oc = (char)qryBuf_[qlen_ - seedRange_.mms[i] - 1]);
assert_neq(rc, oc);
qryBuf_[qlen_ - seedRange_.mms[i] - 1] = (Dna5)rc;
assert_neq((Dna5)rc, (*qry_)[qlen_ - seedRange_.mms[i] - 1]);
}
qry_ = &qryBuf_;
}
// Make sure every qual is a valid qual ASCII character (>= 33)
for(size_t i = 0; i < length(*qual_); i++) {
assert_geq((*qual_)[i], 33);
}
assert_geq(length(*qual_), qlen_);
this->done = false;
this->foundRange = false;
color_ = r.color;
rand_.init(r.seed);
}
/**
* Set backtracking constraints.
*/
void setOffs(uint32_t depth5, // depth of far edge of hi-half
uint32_t depth3, // depth of far edge of lo-half
uint32_t unrevOff, // depth above which we cannot backtrack
uint32_t revOff1, // depth above which we may backtrack just once
uint32_t revOff2, // depth above which we may backtrack just twice
uint32_t revOff3) // depth above which we may backtrack just three times
{
depth5_ = depth5;
depth3_ = depth3;
assert_geq(depth3_, depth5_);
offRev0_ = unrevOff;
offRev1_ = revOff1;
offRev2_ = revOff2;
offRev3_ = revOff3;
}
/**
* Return true iff this RangeSource is allowed to report exact
* alignments (exact = no edits).
*/
bool reportExacts() const {
return reportExacts_;
}
/// Return the current range
virtual Range& range() {
return curRange_;
}
/**
* Set qlen_ according to parameter, but don't let it exceed the
* length of the query.
*/
void setQlen(uint32_t qlen) {
assert(qry_ != NULL);
qlen_ = min<uint32_t>((uint32_t)length(*qry_), qlen);
}
/**
* Initiate the first branch so that the next call to advance()
* begins a new search. Note that pm is empty upon return if there
* are no valid branches to begin with. Also note that calling
* initBranch() may result in finding a range (i.e., if we
* immediately jump to a valid range using the ftab).
*/
virtual void
initBranch(PathManager& pm) {
assert(curEbwt_ != NULL);
assert_gt(length(*qry_), 0);
assert_leq(qlen_, length(*qry_));
assert_geq(length(*qual_), length(*qry_));
const Ebwt<String<Dna> >& ebwt = *ebwt_;
int ftabChars = ebwt._eh._ftabChars;
this->foundRange = false;
int nsInSeed = 0; int nsInFtab = 0;
ASSERT_ONLY(allTops_.clear());
if(skippingThisRead_) {
this->done = true;
return;
}
if(qlen_ < 4) {
uint32_t maxmms = 0;
if(offRev0_ != offRev1_) maxmms = 1;
if(offRev1_ != offRev2_) maxmms = 2;
if(offRev2_ != offRev3_) maxmms = 3;
if(qlen_ <= maxmms) {
if(!quiet_) {
ThreadSafe _ts(&gLock);
cerr << "Warning: Read (" << (*name_) << ") is less than " << (maxmms+1) << " characters long; skipping..." << endl;
}
this->done = true;
skippingThisRead_ = true;
return;
}
}
if(!tallyNs(nsInSeed, nsInFtab)) {
// No alignments are possible because of the distribution
// of Ns in the read in combination with the backtracking
// constraints.
return;
}
// icost = total cost penalty (major bits = stratum, minor bits =
// quality penalty) incurred so far by partial alignment
uint16_t icost = (seedRange_.valid()) ? seedRange_.cost : 0;
// iham = total quality penalty incurred so far by partial alignment
uint16_t iham = (seedRange_.valid() && qualOrder_) ? (seedRange_.cost & ~0xc000): 0;
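// Layout of the 16-bit cost used throughout this file: the top two
// bits hold the stratum and the low 14 bits hold the summed quality
// penalty.  Worked example (values assumed): stratum 2 with a quality
// penalty of 35 packs to (2 << 14) | 35 == 0x8023; '>> 14' recovers
// the stratum and '& ~0xc000' recovers the penalty, as in the
// expressions above and in the range-reporting code below.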
assert_leq(iham, qualLim_);
// m = depth beyond which ftab must not extend or else we might
// miss some legitimate paths
uint32_t m = min<uint32_t>(offRev0_, (uint32_t)qlen_);
// Let skipInvalidExact = true if using the ftab would be a
// waste because it would jump directly to an alignment we
// couldn't use.
bool ftabSkipsToEnd = (qlen_ == (uint32_t)ftabChars);
bool skipInvalidExact = (!reportExacts_ && ftabSkipsToEnd);
// If it's OK to use the ftab...
if(nsInFtab == 0 && m >= (uint32_t)ftabChars && !skipInvalidExact) {
// Use the ftab to jump 'ftabChars' chars into the read
// from the right
uint32_t ftabOff = calcFtabOff();
TIndexOffU top = ebwt.ftabHi(ftabOff);
TIndexOffU bot = ebwt.ftabLo(ftabOff+1);
if(qlen_ == (uint32_t)ftabChars && bot > top) {
// We found a range with 0 mismatches immediately. Set
// fields to indicate we found a range.
assert(reportExacts_);
curRange_.top = top;
curRange_.bot = bot;
curRange_.stratum = (icost >> 14);
curRange_.cost = icost;
curRange_.numMms = 0;
curRange_.ebwt = ebwt_;
curRange_.fw = fw_;
curRange_.mms.clear(); // no mismatches
curRange_.refcs.clear(); // no mismatches
// Lump in the edits from the partial alignment
addPartialEdits();
assert(curRange_.repOk());
// no need to do anything with curRange_.refcs
this->foundRange = true;
//this->done = true;
return;
} else if (bot > top) {
// We have a range to extend
assert_leq(top, ebwt._eh._len);
assert_leq(bot, ebwt._eh._len);
Branch *b = pm.bpool.alloc();
if(b == NULL) {
assert(pm.empty());
return;
}
if(!b->init(
pm.rpool, pm.epool, pm.bpool.lastId(), (uint32_t)qlen_,
offRev0_, offRev1_, offRev2_, offRev3_,
0, ftabChars, icost, iham, top, bot,
ebwt._eh, ebwt._ebwt))
{
// Negative result from b->init() indicates we ran
// out of best-first chunk memory
assert(pm.empty());
return;
}
assert(!b->curtailed_);
assert(!b->exhausted_);
assert_gt(b->depth3_, 0);
pm.push(b); // insert into priority queue
assert(!pm.empty());
} else {
// The arrows are already closed within the
// unrevisitable region; give up
}
} else {
// We can't use the ftab, so we start from the rightmost
// position and use _fchr
Branch *b = pm.bpool.alloc();
if(b == NULL) {
assert(pm.empty());
return;
}
if(!b->init(pm.rpool, pm.epool, pm.bpool.lastId(), (uint32_t)qlen_,
offRev0_, offRev1_, offRev2_, offRev3_,
0, 0, icost, iham, 0, 0, ebwt._eh, ebwt._ebwt))
{
// Negative result from b->init() indicates we ran
// out of best-first chunk memory
assert(pm.empty());
return;
}
assert(!b->curtailed_);
assert(!b->exhausted_);
assert_gt(b->depth3_, 0);
pm.push(b); // insert into priority queue
assert(!pm.empty());
}
return;
}
/**
* Advance along the lowest-cost branch managed by the given
* PathManager. Keep advancing until condition 'until' is
* satisfied. Typically, the stopping condition 'until' is
* set to stop whenever pm's minCost changes.
*/
virtual void
advanceBranch(int until, uint16_t minCost, PathManager& pm) {
assert(curEbwt_ != NULL);
// Set this->foundRange = false; we'll set it to true iff this call
// to advance yields a new valid-alignment range.
this->foundRange = false;
// Can't have already exceeded weighted hamming distance threshold
assert_gt(length(*qry_), 0);
assert_leq(qlen_, length(*qry_));
assert_geq(length(*qual_), length(*qry_));
assert(!pm.empty());
do {
assert(pm.repOk());
// Get the highest-priority branch according to the priority
// queue in 'pm'
Branch* br = pm.front();
// Shouldn't be curtailed or exhausted
assert(!br->exhausted_);
assert(!br->curtailed_);
assert_gt(br->depth3_, 0);
assert_leq(br->ham_, qualLim_);
if(verbose_) {
br->print((*qry_), (*qual_), minCost, cout, (halfAndHalf_>0), partial_, fw_, ebwt_->fw());
if(!br->edits_.empty()) {
cout << "Edit: ";
for(size_t i = 0; i < br->edits_.size(); i++) {
Edit e = br->edits_.get(i);
cout << (curEbwt_->fw() ? (qlen_ - e.pos - 1) : e.pos)
<< (char)e.chr;
if(i < br->edits_.size()-1) cout << " ";
}
cout << endl;
}
}
assert(br->repOk((uint32_t)qlen_));
ASSERT_ONLY(int stratum = br->cost_ >> 14); // shift the stratum over
assert_lt(stratum, 4);
// Not necessarily true with rounding
uint32_t depth = br->tipDepth();
const Ebwt<String<Dna> >& ebwt = *ebwt_;
if(halfAndHalf_ > 0) assert_gt(depth3_, depth5_);
bool reportedPartial = false;
bool invalidExact = false;
bool empty = false;
bool hit = false;
uint16_t cost = br->cost_;
uint32_t cur = 0;
uint32_t nedits = 0;
if(halfAndHalf_ && !hhCheckTop(br, depth, 0)) {
// Stop extending this branch because it violates a half-
// and-half constraint
if(metrics_ != NULL) metrics_->curBacktracks_++;
pm.curtail(br, (uint32_t)qlen_, depth3_, qualOrder_);
goto bail;
}
cur = (uint32_t)(qlen_ - depth - 1); // current offset into qry_
if(depth < qlen_) {
// Determine whether ranges at this location are candidates
// for backtracking
int c = (int)(*qry_)[cur]; // get char at this position
int nextc = -1;
if(cur < qlen_-1) nextc = (int)(*qry_)[cur+1];
assert_leq(c, 4);
// If any uncalled base's penalty is still under
// the ceiling, then this position is an alternative
uint8_t q[4] = {'!', '!', '!', '!'};
uint8_t bestq;
// get unrounded penalties at this position
bestq = q[0] = q[1] = q[2] = q[3] =
mmPenalty(maqPenalty_, qualAt(cur));
// The current query position is a legitimate alternative if (a) it
// is not in the unrevisitable region, and (b) selecting it would
// not necessarily cause the quality ceiling (if one exists) to
// be exceeded
bool curIsAlternative = (depth >= br->depth0_) &&
(br->ham_ + bestq <= qualLim_);
ASSERT_ONLY(TIndexOffU obot = br->bot_);
TIndexOffU otop = br->top_;
// If c is 'N', then it's a mismatch
if(c == 4 && depth > 0) {
// Force the 'else if(curIsAlternative)' or 'else'
// branches below
br->top_ = br->bot_ = 1;
} else if(c == 4) {
// We'll take the 'if(br->top == 0 && br->bot == 0)'
// branch below
assert_eq(0, br->top_);
assert_eq(0, br->bot_);
}
// Get the range state for the current position
RangeState *rs = br->rangeState();
assert(rs != NULL);
// Calculate the ranges for this position
if(br->top_ == 0 && br->bot_ == 0) {
// Calculate first quartet of ranges using the _fchr[]
// array
rs->tops[0] = ebwt._fchr[0];
rs->bots[0] = rs->tops[1] = ebwt._fchr[1];
rs->bots[1] = rs->tops[2] = ebwt._fchr[2];
rs->bots[2] = rs->tops[3] = ebwt._fchr[3];
rs->bots[3] = ebwt._fchr[4];
ASSERT_ONLY(int r =)
br->installRanges(c, nextc, qualLim_ - br->ham_, q);
assert(r < 4 || c == 4);
// Update top and bot
if(c < 4) {
br->top_ = rs->tops[c];
br->bot_ = rs->bots[c];
}
} else if(curIsAlternative && (br->bot_ > br->top_ || c == 4)) {
// Calculate next quartet of ranges. We hope that the
// appropriate cache lines are prefetched.
assert(br->ltop_.valid());
rs->tops[0] =
rs->bots[0] = rs->tops[1] =
rs->bots[1] = rs->tops[2] =
rs->bots[2] = rs->tops[3] =
rs->bots[3] = 0;
if(br->lbot_.valid()) {
if(metrics_ != NULL) metrics_->curBwtOps_++;
ebwt.mapLFEx(br->ltop_, br->lbot_, (TIndexOffU*)rs->tops, (TIndexOffU*)rs->bots);
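// A single mapLFEx call computes the LF mapping for all four
// nucleotides over [ltop_, lbot_) in one pass over the relevant
// BWT side(s), filling rs->tops[]/rs->bots[]; this is what lets
// installRanges() below price every substitution at this position
// without four separate lookups.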
} else {
#ifndef NDEBUG
TIndexOffU tmptops[] = {0, 0, 0, 0};
TIndexOffU tmpbots[] = {0, 0, 0, 0};
SideLocus ltop, lbot;
ltop.initFromRow(otop, ebwt_->_eh, ebwt_->_ebwt);
lbot.initFromRow(obot, ebwt_->_eh, ebwt_->_ebwt);
ebwt.mapLFEx(ltop, lbot, tmptops, tmpbots);
#endif
if(metrics_ != NULL) metrics_->curBwtOps_++;
int cc = ebwt.mapLF1((TIndexOffU&)otop, br->ltop_);
br->top_ = otop;
assert(cc == -1 || (cc >= 0 && cc < 4));
if(cc >= 0) {
assert_lt(cc, 4);
rs->tops[cc] = br->top_;
rs->bots[cc] = (br->top_ + 1);
}
#ifndef NDEBUG
for(int i = 0; i < 4; i++) {
assert_eq(tmpbots[i] - tmptops[i],
rs->bots[i] - rs->tops[i]);
}
#endif
}
ASSERT_ONLY(int r =)
br->installRanges(c, nextc, qualLim_ - br->ham_, q);
assert(r < 4 || c == 4);
// Update top and bot
if(c < 4) {
br->top_ = rs->tops[c];
br->bot_ = rs->bots[c];
} else {
br->top_ = br->bot_ = 1;
}
} else if(br->bot_ > br->top_) {
// This read position is not a legitimate backtracking
// alternative. No need to do the bookkeeping for the
// entire quartet, just do c. We hope that the
// appropriate cache lines are prefetched before now;
// otherwise, we're about to take an expensive cache
// miss.
assert(br->ltop_.valid());
rs->eliminated_ = true; // eliminate all alternatives leaving this node
assert(br->eliminated(br->len_));
if(c < 4) {
if(br->top_ + 1 == br->bot_) {
if(metrics_ != NULL) metrics_->curBwtOps_++;
br->bot_ = br->top_ = ebwt.mapLF1(br->top_, br->ltop_, c);
if(br->bot_ != OFF_MASK) br->bot_++;
} else {
if(metrics_ != NULL) metrics_->curBwtOps_++;
br->top_ = ebwt.mapLF(br->ltop_, c);
assert(br->lbot_.valid());
if(metrics_ != NULL) metrics_->curBwtOps_++;
br->bot_ = ebwt.mapLF(br->lbot_, c);
}
}
} else {
rs->eliminated_ = true;
}
assert(rs->repOk());
// br->top_ and br->bot_ now contain the next top and bot
} else {
// The continuation had already processed the whole read
assert_eq(qlen_, depth);
cur = 0;
}
empty = (br->top_ == br->bot_);
hit = (cur == 0 && !empty);
// Check whether we've obtained an exact alignment when
// we've been instructed not to report exact alignments
nedits = (uint32_t)br->edits_.size();
invalidExact = (hit && nedits == 0 && !reportExacts_);
assert_leq(br->ham_, qualLim_);
// Set this to true if the only way to make legal progress
// is via one or more additional backtracks.
if(halfAndHalf_ && !hhCheck(br, depth, empty)) {
// This alignment doesn't satisfy the half-and-half
// requirements; reject it
if(metrics_ != NULL) metrics_->curBacktracks_++;
pm.curtail(br, (uint32_t)qlen_, depth3_, qualOrder_);
goto bail;
}
if(hit && // there is a range to report
!invalidExact && // not disqualified by no-exact-hits setting
!reportedPartial) // not an already-reported partial alignment
{
if(verbose_) {
if(partial_) {
cout << " Partial alignment:" << endl;
} else {
cout << " Final alignment:" << endl;
}
br->len_++;
br->print((*qry_), (*qual_), minCost, cout, halfAndHalf_ > 0, partial_, fw_, ebwt_->fw());
br->len_--;
cout << endl;
}
assert_gt(br->bot_, br->top_);
assert_leq(br->ham_, qualLim_);
assert_leq((uint32_t)(br->cost_ & ~0xc000), qualLim_);
if(metrics_ != NULL) metrics_->setReadHasRange();
curRange_.top = br->top_;
curRange_.bot = br->bot_;
curRange_.cost = br->cost_;
curRange_.stratum = (br->cost_ >> 14);
curRange_.numMms = nedits;
curRange_.fw = fw_;
curRange_.mms.clear();
curRange_.refcs.clear();
for(size_t i = 0; i < nedits; i++) {
curRange_.mms.push_back((uint32_t)(qlen_ - br->edits_.get(i).pos - 1));
curRange_.refcs.push_back((char)br->edits_.get(i).chr);
}
addPartialEdits();
curRange_.ebwt = ebwt_;
this->foundRange = true;
#ifndef NDEBUG
TIndexOff top2 = (TIndexOff)br->top_;
top2++; // ensure it's not 0
if(ebwt_->fw()) top2 = -top2;
assert(allTops_.find(top2) == allTops_.end());
allTops_.insert(top2);
#endif
assert(curRange_.repOk());
// Must curtail because we've consumed the whole pattern
if(metrics_ != NULL) metrics_->curBacktracks_++;
pm.curtail(br, (uint32_t)qlen_, depth3_, qualOrder_);
} else if(empty || cur == 0) {
// The branch couldn't be extended further
if(metrics_ != NULL) metrics_->curBacktracks_++;
pm.curtail(br, (uint32_t)qlen_, depth3_, qualOrder_);
} else {
// Extend the branch by one position; no change to its cost
// so there's no need to reconsider where it lies in the
// priority queue
assert_neq(0, cur);
br->extend();
}
bail:
// Make sure the front element of the priority queue is
// extendable (i.e. not curtailed) and then prep it.
if(!pm.splitAndPrep(rand_, (uint32_t)qlen_, qualLim_, depth3_,
qualOrder_,
ebwt_->_eh, ebwt_->_ebwt, ebwt_->_fw))
{
pm.reset(0);
assert(pm.empty());
}
if(pm.empty()) {
// No more branches
break;
}
assert(!pm.front()->curtailed_);
assert(!pm.front()->exhausted_);
if(until == ADV_COST_CHANGES && pm.front()->cost_ != cost) break;
else if(until == ADV_STEP) break;
} while(!this->foundRange);
if(!pm.empty()) {
assert(!pm.front()->curtailed_);
assert(!pm.front()->exhausted_);
}
}
/**
 * Return the number of seed halves in which an edit is forced (0 iff
 * no half-and-half constraint is being enforced).
 */
int halfAndHalf() const {
return halfAndHalf_;
}
protected:
/**
 * Lump all the seed-alignment edits from the seedRange_ range found
 * previously into the curRange_ range just found.
 */
void addPartialEdits() {
// Lump in the edits from the partial alignment
if(seedRange_.valid()) {
const size_t srSz = seedRange_.mms.size();
for(size_t i = 0; i < srSz; i++) {
curRange_.mms.push_back((uint32_t)(qlen_ - seedRange_.mms[i] - 1));
curRange_.refcs.push_back(seedRange_.refcs[i]);
}
curRange_.numMms += srSz;
}
}
/**
* Return true iff we're OK to continue after considering which
* half-seed boundary we're passing through, together with the
* number of mismatches accumulated so far. Return false if we
* should stop because a half-and-half constraint is violated. If
* we're not currently passing a half-seed boundary, just return
* true.
*/
bool hhCheck(Branch *b, uint32_t depth, bool empty) {
const uint32_t nedits = (uint32_t)b->edits_.size();
ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
if((depth == (depth5_-1)) && !empty) {
// We're crossing the boundary separating the hi-half
// from the non-seed portion of the read.
// We should induce a mismatch if we haven't mismatched
// yet, so that we don't waste time pursuing a match
// that was covered by a previous phase
assert_leq(nedits, lim5);
return nedits > 0;
} else if((depth == (depth3_-1)) && !empty) {
// We're crossing the boundary separating the lo-half
// from the non-seed portion of the read
assert_leq(nedits, lim3);
assert_gt(nedits, 0);
// Count the mismatches in the lo and hi halves
uint32_t loHalfMms = 0, hiHalfMms = 0;
for(size_t i = 0; i < nedits; i++) {
uint32_t depth = b->edits_.get(i).pos;
if (depth < depth5_) hiHalfMms++;
else if(depth < depth3_) loHalfMms++;
else assert(false);
}
assert_leq(loHalfMms + hiHalfMms, lim3);
bool invalidHalfAndHalf = (loHalfMms == 0 || hiHalfMms == 0);
return (nedits >= (uint32_t)halfAndHalf_ && !invalidHalfAndHalf);
}
#ifndef NDEBUG
if(depth < depth5_-1) {
assert_leq(nedits, lim5);
}
else if(depth >= depth5_ && depth < depth3_-1) {
assert_gt(nedits, 0);
assert_leq(nedits, lim3);
}
#endif
return true;
}
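// Worked example of the two checkpoints above (parameter values
// assumed for illustration): with depth5_ == 12, depth3_ == 24 and
// halfAndHalf_ == 2, a branch arriving at depth 11 with zero edits is
// rejected (no mismatch yet to justify this phase), and a branch
// arriving at depth 23 survives only if it carries at least one edit
// at depth < 12, at least one in [12, 24), and at least 2 in total.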
/**
 * Return true iff the state of the backtracker as encoded by the
 * branch b, the depth d and iham is compatible with the current
 * half-and-half alignment mode.  prehits is for sanity checking when
 * bailing.
 */
bool hhCheckTop(Branch* b,
uint32_t d,
uint32_t iham,
uint64_t prehits = 0xffffffffffffffffllu)
{
// Crossing from the hi-half into the lo-half
ASSERT_ONLY(uint32_t lim3 = (offRev3_ == offRev2_)? 2 : 3);
ASSERT_ONLY(uint32_t lim5 = (offRev1_ == offRev0_)? 2 : 1);
const uint32_t nedits = (uint32_t)b->edits_.size();
if(d == depth5_) {
assert_leq(nedits, lim5);
if(nedits == 0) {
return false;
}
} else if(d == depth3_) {
assert_leq(nedits, lim3);
if(nedits < (uint32_t)halfAndHalf_) {
return false;
}
}
#ifndef NDEBUG
else {
// We didn't just cross a boundary, so do an in-between check
if(d >= depth5_) {
assert_geq(nedits, 1);
} else if(d >= depth3_) {
assert_geq(nedits, lim3);
}
}
#endif
return true;
}
/**
* Return the Phred-scale quality value at position 'off'
*/
inline uint8_t qualAt(size_t off) {
return phredCharToPhredQual((*qual_)[off]);
}
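// E.g., assuming the usual Phred+33 encoding enforced by the
// setQuery() sanity check (every qual char >= 33): '#' (ASCII 35)
// maps to Q2 and 'I' (ASCII 73) maps to Q40.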
/**
 * Tally how many Ns occur in the seed region and in the ftab-
 * jumpable region of the read.  Check whether the mismatches induced
 * by the Ns already violate the current policy.  Return false if the
 * policy is already violated, true otherwise.
 */
bool tallyNs(int& nsInSeed, int& nsInFtab) {
const Ebwt<String<Dna> >& ebwt = *ebwt_;
int ftabChars = ebwt._eh._ftabChars;
// Count Ns in the seed region of the read and short-circuit if
// the configuration of Ns guarantees that there will be no
// valid alignments given the backtracking constraints.
for(size_t i = 0; i < offRev3_; i++) {
if((int)(*qry_)[qlen_-i-1] == 4) {
nsInSeed++;
if(nsInSeed == 1) {
if(i < offRev0_) {
return false; // Exceeded mm budget on Ns alone
}
} else if(nsInSeed == 2) {
if(i < offRev1_) {
return false; // Exceeded mm budget on Ns alone
}
} else if(nsInSeed == 3) {
if(i < offRev2_) {
return false; // Exceeded mm budget on Ns alone
}
} else {
assert_gt(nsInSeed, 3);
return false; // Exceeded mm budget on Ns alone
}
}
}
// Calculate the number of Ns there are in the region that
// would get jumped over if the ftab were used.
for(size_t i = 0; i < (size_t)ftabChars && i < qlen_; i++) {
if((int)(*qry_)[qlen_-i-1] == 4) nsInFtab++;
}
return true;
}
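// Worked example (offsets assumed for illustration): with
// (offRev0_, offRev1_, offRev2_, offRev3_) == (5, 10, 20, 28), a
// lone N at i == 3 fails immediately (3 < 5: one mismatch where none
// is revisitable); two Ns at i == 6 and i == 8 also fail (8 < 10);
// but two Ns at i == 12 and i == 25 pass this check and are simply
// charged against the branch's mismatch budget later.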
/**
* Calculate the offset into the ftab for the rightmost 'ftabChars'
* characters of the current query. Rightmost char gets least
* significant bit-pair.
*/
uint32_t calcFtabOff() {
const Ebwt<String<Dna> >& ebwt = *ebwt_;
int ftabChars = ebwt._eh._ftabChars;
uint32_t ftabOff = (*qry_)[qlen_ - ftabChars];
assert_lt(ftabOff, 4);
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
for(int i = ftabChars - 1; i > 0; i--) {
ftabOff <<= 2;
assert_lt((uint32_t)(*qry_)[qlen_-i], 4);
ftabOff |= (uint32_t)(*qry_)[qlen_-i];
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
}
assert_lt(ftabOff, ebwt._eh._ftabLen-1);
return ftabOff;
}
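// Worked example (read contents assumed): with ftabChars == 4 and the
// rightmost four query chars being G,A,C,T (left to right, encoded
// A=0,C=1,G=2,T=3), the loop computes
//   ((((2 << 2) | 0) << 2 | 1) << 2) | 3 == 135,
// and the rightmost char T indeed sits in the least significant
// bit-pair (135 & 3 == 3).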
String<Dna5>* qry_; // query (read) sequence
String<Dna5> qryBuf_; // for composing modified qry_ strings
size_t qlen_; // length of _qry
String<char>* qual_; // quality values for _qry
String<char>* name_; // name of _qry
bool color_; // true -> read is colorspace
const Ebwt<String<Dna> >* ebwt_; // Ebwt to search in
bool fw_;
uint32_t offRev0_; // unrevisitable chunk
uint32_t offRev1_; // 1-revisitable chunk
uint32_t offRev2_; // 2-revisitable chunk
uint32_t offRev3_; // 3-revisitable chunk
/// Whether to round qualities off Maq-style when calculating penalties
bool maqPenalty_;
/// Whether to order paths on our search in a way that takes
/// qualities into account. If this is false, the effect is that
/// the first path reported is guaranteed to be in the best
/// stratum, but it's not guaranteed to have the best quals.
bool qualOrder_;
/// Reject alignments where sum of qualities at mismatched
/// positions is greater than qualLim_
uint32_t qualLim_;
/// Report exact alignments iff this is true
bool reportExacts_;
/// Number of seed halves in which an edit is forced; 0 means no
/// half-and-half constraint is being enforced
int halfAndHalf_;
/// Whether we're generating partial alignments for a longer
/// alignment in the opposite index.
bool partial_;
/// Depth of 5'-seed-half border
uint32_t depth5_;
/// Depth of 3'-seed-half border
uint32_t depth3_;
/// Source of pseudo-random numbers
RandomSource rand_;
/// Be talkative
bool verbose_;
/// Suppress unnecessary output
bool quiet_;
// Current range to expose to consumers
Range curRange_;
// Range for the partial alignment we're extending (invalid if we
// aren't extending a partial)
Range seedRange_;
// Starts as false; set to true as soon as we know we want to skip
// all further processing of this read
bool skippingThisRead_;
// Object encapsulating metrics
AlignerMetrics* metrics_;
#ifndef NDEBUG
std::set<TIndexOff> allTops_;
#endif
};
/**
* Concrete factory for EbwtRangeSource objects.
*/
class EbwtRangeSourceFactory {
typedef Ebwt<String<Dna> > TEbwt;
public:
EbwtRangeSourceFactory(
const TEbwt* ebwt,
bool fw,
uint32_t qualThresh,
bool reportExacts,
bool verbose,
bool quiet,
bool halfAndHalf,
bool seeded,
bool maqPenalty,
bool qualOrder,
AlignerMetrics *metrics = NULL) :
ebwt_(ebwt),
fw_(fw),
qualThresh_(qualThresh),
reportExacts_(reportExacts),
verbose_(verbose),
quiet_(quiet),
halfAndHalf_(halfAndHalf),
seeded_(seeded),
maqPenalty_(maqPenalty),
qualOrder_(qualOrder),
metrics_(metrics) { }
/**
 * Return a new EbwtRangeSource with predefined params.
 */
EbwtRangeSource *create() {
return new EbwtRangeSource(ebwt_, fw_, qualThresh_,
reportExacts_, verbose_, quiet_,
halfAndHalf_, seeded_, maqPenalty_,
qualOrder_, metrics_);
}
protected:
const TEbwt* ebwt_;
bool fw_;
uint32_t qualThresh_;
bool reportExacts_;
bool verbose_;
bool quiet_;
bool halfAndHalf_;
bool seeded_;
bool maqPenalty_;
bool qualOrder_;
AlignerMetrics *metrics_;
};
/**
* What boundary within the alignment to "pin" a particular
* backtracking constraint to.
*/
enum SearchConstraintExtent {
PIN_TO_BEGINNING = 1, // depth 0; i.e., constraint is inactive
PIN_TO_LEN, // constraint applies to whole alignment
PIN_TO_HI_HALF_EDGE, // constraint applies to hi-half of seed region
PIN_TO_SEED_EDGE // constraint applies to entire seed region
};
/**
 * Concrete RangeSourceDriver that deals properly with
 * EbwtRangeSource by calling setOffs() with the appropriate
 * parameters when initializing it.
 */
class EbwtRangeSourceDriver :
public SingleRangeSourceDriver<EbwtRangeSource>
{
public:
EbwtRangeSourceDriver(
EbwtSearchParams<String<Dna> >& params,
EbwtRangeSource* rs,
bool fw,
bool seed,
bool maqPenalty,
bool qualOrder,
HitSink& sink,
HitSinkPerThread* sinkPt,
uint32_t seedLen,
bool nudgeLeft,
SearchConstraintExtent rev0Off,
SearchConstraintExtent rev1Off,
SearchConstraintExtent rev2Off,
SearchConstraintExtent rev3Off,
vector<String<Dna5> >& os,
bool verbose,
bool quiet,
bool mate1,
ChunkPool* pool,
int *btCnt) :
SingleRangeSourceDriver<EbwtRangeSource>(
params, rs, fw, sink, sinkPt, os, verbose,
quiet, mate1, 0, pool, btCnt),
seed_(seed),
maqPenalty_(maqPenalty),
qualOrder_(qualOrder),
rs_(rs), seedLen_(seedLen),
nudgeLeft_(nudgeLeft),
rev0Off_(rev0Off), rev1Off_(rev1Off),
rev2Off_(rev2Off), rev3Off_(rev3Off),
verbose_(verbose), quiet_(quiet)
{
if(seed_) assert_gt(seedLen, 0);
}
virtual ~EbwtRangeSourceDriver() { }
bool seed() const { return seed_; }
bool ebwtFw() const { return rs_->curEbwt()->fw(); }
/**
 * Called every time setQuery() is called in the parent class,
 * after setQuery() has been called on the RangeSource but before
 * initBranch() has been called.
 */
virtual void initRangeSource(const String<char>& qual) {
// If seedLen_ is huge, then it will always cover the whole
// alignment
assert_eq(len_, seqan::length(qual));
uint32_t s = (seedLen_ > 0 ? min(seedLen_, len_) : len_);
uint32_t sLeft = s >> 1;
uint32_t sRight = s >> 1;
// If seed has odd length, then nudge appropriate half up by 1
if((s & 1) != 0) { if(nudgeLeft_) sLeft++; else sRight++; }
uint32_t rev0Off = cextToDepth(rev0Off_, sRight, s, len_);
uint32_t rev1Off = cextToDepth(rev1Off_, sRight, s, len_);
uint32_t rev2Off = cextToDepth(rev2Off_, sRight, s, len_);
uint32_t rev3Off = cextToDepth(rev3Off_, sRight, s, len_);
// Truncate the pattern if necessary
uint32_t qlen = (uint32_t)seqan::length(qual);
if(seed_) {
if(len_ > s) {
rs_->setQlen(s);
qlen = s;
}
assert(!rs_->reportExacts());
}
// If there are any Ns in the unrevisitable region, then this
// driver is guaranteed to yield no fruit.
uint16_t minCost = 0;
if(rs_->reportExacts()) {
// Keep minCost at 0
} else if (!rs_->halfAndHalf() && rev0Off < s) {
// Exacts not allowed, so there must be at least 1 mismatch
// outside of the unrevisitable area
minCost = 1 << 14;
if(qualOrder_) {
uint8_t lowQual = 0xff;
for(uint32_t d = rev0Off; d < s; d++) {
uint8_t lowAtPos;
lowAtPos = qual[qlen - d - 1];
if(lowAtPos < lowQual) lowQual = lowAtPos;
}
assert_lt(lowQual, 0xff);
minCost += mmPenalty(maqPenalty_, phredCharToPhredQual(lowQual));
}
} else if(rs_->halfAndHalf() && sRight > 0 && sRight < (s-1)) {
// Half-and-half constraints are active, so there must be
// at least 1 mismatch in both halves of the seed
assert(rs_->halfAndHalf());
minCost = (seed_ ? 3 : 2) << 14;
if(qualOrder_) {
assert(rs_->halfAndHalf() == 2 || rs_->halfAndHalf() == 3);
uint8_t lowQual1 = 0xff;
for(uint32_t d = 0; d < sRight; d++) {
uint8_t lowAtPos;
lowAtPos = qual[qlen - d - 1];
if(lowAtPos < lowQual1) lowQual1 = lowAtPos;
}
assert_lt(lowQual1, 0xff);
minCost += mmPenalty(maqPenalty_, phredCharToPhredQual(lowQual1));
uint8_t lowQual2_1 = 0xff;
uint8_t lowQual2_2 = 0xff;
for(uint32_t d = sRight; d < s; d++) {
uint8_t lowAtPos;
lowAtPos = qual[qlen - d - 1];
if(lowAtPos < lowQual2_1) {
if(lowQual2_1 != 0xff) {
lowQual2_2 = lowQual2_1;
}
lowQual2_1 = lowAtPos;
} else if(lowAtPos < lowQual2_2) {
lowQual2_2 = lowAtPos;
}
}
assert_lt(lowQual2_1, 0xff);
minCost += mmPenalty(maqPenalty_, phredCharToPhredQual(lowQual2_1));
if(rs_->halfAndHalf() > 2 && lowQual2_2 != 0xff) {
minCost += mmPenalty(maqPenalty_, phredCharToPhredQual(lowQual2_2));
}
}
}
if(verbose_) cout << "initRangeSource minCost: " << minCost << endl;
this->minCostAdjustment_ = minCost;
rs_->setOffs(sRight, // depth of far edge of hi-half (only matters where half-and-half is possible)
s, // depth of far edge of lo-half (only matters where half-and-half is possible)
rev0Off, // depth above which we cannot backtrack
rev1Off, // depth above which we may backtrack just once
rev2Off, // depth above which we may backtrack just twice
rev3Off); // depth above which we may backtrack just three times
}
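// Worked example of the minCost lower bound (values assumed for
// illustration): with exacts disallowed, no half-and-half constraint,
// s == 28 and rev0Off == 0, at least one mismatch must fall in
// [0, 28), so minCost starts at 1 << 14 == 16384; if qualOrder_ is
// set, the penalty of the lowest-quality position in that window
// (after mmPenalty's optional Maq-style rounding) is added on top,
// keeping the bound tight without overestimating the cheapest
// reportable alignment.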
protected:
/**
* Convert a search constraint extent to an actual depth into the
* read.
*/
inline uint32_t cextToDepth(SearchConstraintExtent cext,
uint32_t sRight,
uint32_t s,
uint32_t len)
{
if(cext == PIN_TO_SEED_EDGE) return s;
if(cext == PIN_TO_HI_HALF_EDGE) return sRight;
if(cext == PIN_TO_BEGINNING) return 0;
if(cext == PIN_TO_LEN) return len;
cerr << "Bad SearchConstraintExtent: " << cext;
throw 1;
}
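// Worked example (values assumed): with sRight == 14, s == 28 and
// len == 36, the four extents map to depths 0 (PIN_TO_BEGINNING),
// 14 (PIN_TO_HI_HALF_EDGE), 28 (PIN_TO_SEED_EDGE) and
// 36 (PIN_TO_LEN).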
bool seed_;
bool maqPenalty_;
bool qualOrder_;
EbwtRangeSource* rs_;
uint32_t seedLen_;
bool nudgeLeft_;
SearchConstraintExtent rev0Off_;
SearchConstraintExtent rev1Off_;
SearchConstraintExtent rev2Off_;
SearchConstraintExtent rev3Off_;
bool verbose_;
bool quiet_;
};
/**
 * Factory for EbwtRangeSourceDrivers; holds the parameters each new
 * driver needs so that it can call setOffs() appropriately when it
 * is initialized.
 */
class EbwtRangeSourceDriverFactory {
public:
EbwtRangeSourceDriverFactory(
EbwtSearchParams<String<Dna> >& params,
EbwtRangeSourceFactory* rs,
bool fw,
bool seed,
bool maqPenalty,
bool qualOrder,
HitSink& sink,
HitSinkPerThread* sinkPt,
uint32_t seedLen,
bool nudgeLeft,
SearchConstraintExtent rev0Off,
SearchConstraintExtent rev1Off,
SearchConstraintExtent rev2Off,
SearchConstraintExtent rev3Off,
vector<String<Dna5> >& os,
bool verbose,
bool quiet,
bool mate1,
ChunkPool* pool,
int *btCnt = NULL) :
params_(params),
rs_(rs),
fw_(fw),
seed_(seed),
maqPenalty_(maqPenalty),
qualOrder_(qualOrder),
sink_(sink),
sinkPt_(sinkPt),
seedLen_(seedLen),
nudgeLeft_(nudgeLeft),
rev0Off_(rev0Off),
rev1Off_(rev1Off),
rev2Off_(rev2Off),
rev3Off_(rev3Off),
os_(os),
verbose_(verbose),
quiet_(quiet),
mate1_(mate1),
pool_(pool),
btCnt_(btCnt)
{ }
~EbwtRangeSourceDriverFactory() {
delete rs_; rs_ = NULL;
}
/**
* Return a newly-allocated EbwtRangeSourceDriver with the given
* parameters.
*/
EbwtRangeSourceDriver *create() const {
return new EbwtRangeSourceDriver(
params_, rs_->create(), fw_, seed_, maqPenalty_,
qualOrder_, sink_, sinkPt_, seedLen_, nudgeLeft_,
rev0Off_, rev1Off_, rev2Off_, rev3Off_, os_, verbose_,
quiet_, mate1_, pool_, btCnt_);
}
protected:
EbwtSearchParams<String<Dna> >& params_;
EbwtRangeSourceFactory* rs_;
bool fw_;
bool seed_;
bool maqPenalty_;
bool qualOrder_;
HitSink& sink_;
HitSinkPerThread* sinkPt_;
uint32_t seedLen_;
bool nudgeLeft_;
SearchConstraintExtent rev0Off_;
SearchConstraintExtent rev1Off_;
SearchConstraintExtent rev2Off_;
SearchConstraintExtent rev3Off_;
vector<String<Dna5> >& os_;
bool verbose_;
bool quiet_;
bool mate1_;
ChunkPool* pool_;
int *btCnt_;
};
/**
* A RangeSourceDriver that manages two child EbwtRangeSourceDrivers,
* one for searching for seed strings with mismatches in the hi-half,
* and one for extending those seed strings toward the 3' end.
*/
class EbwtSeededRangeSourceDriver : public RangeSourceDriver<EbwtRangeSource> {
typedef RangeSourceDriver<EbwtRangeSource>* TRangeSrcDrPtr;
typedef CostAwareRangeSourceDriver<EbwtRangeSource> TCostAwareRangeSrcDr;
public:
EbwtSeededRangeSourceDriver(
EbwtRangeSourceDriverFactory* rsFact,
EbwtRangeSourceDriver* rsSeed,
bool fw,
uint32_t seedLen,
bool verbose,
bool quiet,
bool mate1) :
RangeSourceDriver<EbwtRangeSource>(true, 0),
rsFact_(rsFact), rsFull_(false, NULL, verbose, quiet, true),
rsSeed_(rsSeed), patsrc_(NULL), seedLen_(seedLen), fw_(fw),
mate1_(mate1), seedRange_(0)
{
assert(rsSeed_->seed());
}
virtual ~EbwtSeededRangeSourceDriver() {
delete rsFact_; rsFact_ = NULL;
delete rsSeed_; rsSeed_ = NULL;
}
/**
* Prepare this aligner for the next read.
*/
virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *partial) {
this->done = false;
rsSeed_->setQuery(patsrc, partial);
this->minCostAdjustment_ = max(rsSeed_->minCostAdjustment_, rsSeed_->minCost);
this->minCost = this->minCostAdjustment_;
rsFull_.clearSources();
rsFull_.setQuery(patsrc, partial);
rsFull_.minCost = this->minCost;
assert_gt(rsFull_.minCost, 0);
patsrc_ = patsrc;
// The minCostAdjustment comes from the seed range source
// driver, based on Ns and quals in the hi-half
this->foundRange = false;
ASSERT_ONLY(allTops_.clear());
assert_eq(this->minCost, min<uint16_t>(rsSeed_->minCost, rsFull_.minCost));
}
/**
 * Advance the aligner by one memory op.  Sets this->done once we're
 * done with this read.
 */
virtual void advance(int until) {
assert(!this->foundRange);
until = max<int>(until, ADV_COST_CHANGES);
ASSERT_ONLY(uint16_t preCost = this->minCost);
advanceImpl(until);
if(this->foundRange) {
assert_eq(range().cost, preCost);
}
#ifndef NDEBUG
if(this->foundRange) {
// Assert that we have not yet dished out a range with this
// top offset
assert_gt(range().bot, range().top);
assert(range().ebwt != NULL);
TIndexOff top = (TIndexOff)range().top;
top++; // ensure it's not 0
if(!range().ebwt->fw()) top = -top;
assert(allTops_.find(top) == allTops_.end());
allTops_.insert(top);
}
#endif
}
/**
 * Advance the aligner by one memory op.  Sets this->done once we're
 * done with this read.
 */
virtual void advanceImpl(int until) {
assert(!this->done);
assert(!this->foundRange);
assert_gt(rsFull_.minCost, 0);
// Advance the seed range source
if(rsSeed_->done && rsFull_.done &&
!rsSeed_->foundRange && !rsFull_.foundRange)
{
this->done = true;
return;
}
if(rsSeed_->done && !rsSeed_->foundRange) {
rsSeed_->minCost = 0xffff;
if(rsFull_.minCost > this->minCost) {
this->minCost = rsFull_.minCost;
// Cost changed, so return
return;
}
}
if(rsFull_.done && !rsFull_.foundRange) {
rsFull_.minCost = 0xffff;
if(rsSeed_->minCost > this->minCost) {
this->minCost = rsSeed_->minCost;
// Cost changed, so return
return;
}
}
assert(rsSeed_->minCost != 0xffff || rsFull_.minCost != 0xffff);
// Extend a partial alignment
ASSERT_ONLY(uint16_t oldMinCost = this->minCost);
assert_eq(this->minCost, min<uint16_t>(rsSeed_->minCost, rsFull_.minCost));
bool doFull = rsFull_.minCost <= rsSeed_->minCost;
if(!doFull) {
// Advance the partial-alignment generator
assert_eq(rsSeed_->minCost, this->minCost);
if(!rsSeed_->foundRange) {
rsSeed_->advance(until);
}
if(rsSeed_->foundRange) {
assert_eq(this->minCost, rsSeed_->range().cost);
assert_eq(oldMinCost, rsSeed_->range().cost);
seedRange_ = &rsSeed_->range();
rsSeed_->foundRange = false;
assert_geq(seedRange_->cost, this->minCostAdjustment_);
this->minCostAdjustment_ = seedRange_->cost;
assert_gt(seedRange_->numMms, 0);
// Keep the range for the hi-half partial alignment so
// that the driver can (a) modify the pattern string
// and (b) modify results from the RangeSource to
// include these edits.
EbwtRangeSourceDriver *partial = rsFact_->create();
partial->minCost = seedRange_->cost;
rsFull_.minCost = seedRange_->cost;
rsFull_.addSource(partial, seedRange_);
if(rsFull_.foundRange) {
this->foundRange = true;
rsFull_.foundRange = false;
assert(rsFull_.range().repOk());
assert_eq(range().cost, oldMinCost);
}
}
if(rsSeed_->minCost > this->minCost) {
this->minCost = rsSeed_->minCost;
if(!rsFull_.done) {
this->minCost = min(this->minCost, rsFull_.minCost);
assert_eq(this->minCost, min<uint16_t>(rsSeed_->minCost, rsFull_.minCost));
}
}
} else {
// Extend a full alignment
assert(!rsFull_.done);
assert(!rsFull_.foundRange);
uint16_t oldFullCost = rsFull_.minCost;
if(!rsFull_.foundRange) {
rsFull_.advance(until);
}
// Found a minimum-cost range
if(rsFull_.foundRange) {
this->foundRange = true;
rsFull_.foundRange = false;
assert(rsFull_.range().repOk());
assert_eq(range().cost, oldMinCost);
}
assert_geq(rsFull_.minCost, oldFullCost);
// Did the min cost change?
if(rsFull_.minCost > oldFullCost) {
// If a range was found, hold on to it and save it for
// later. Update the minCost.
assert(!rsSeed_->done || rsSeed_->minCost == 0xffff);
this->minCost = min(rsFull_.minCost, rsSeed_->minCost);
}
}
}
/**
* Return the range found.
*/
virtual Range& range() {
Range& r = rsFull_.range();
r.fw = fw_;
r.mate1 = mate1_;
return r;
}
/**
* Return whether we're generating ranges for the first or the
* second mate.
*/
virtual bool mate1() const {
return mate1_;
}
/**
* Return true iff current pattern is forward-oriented.
*/
virtual bool fw() const {
return fw_;
}
protected:
EbwtRangeSourceDriverFactory* rsFact_;
TCostAwareRangeSrcDr rsFull_;
EbwtRangeSourceDriver* rsSeed_;
PatternSourcePerThread* patsrc_;
uint32_t seedLen_;
bool fw_;
bool mate1_;
bool generating_;
Range *seedRange_;
};
#endif /*EBWT_SEARCH_BACKTRACK_H_*/