/*
* range_cache.h
*
* Classes that encapsulate the caching of ranges.
*/
#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_
#include <stdint.h>
#include <utility>
#include <iostream>
#include <stdexcept>
#include <new>
#include <cstring>
#include <set>
#include <vector>
#include <map>
#include "ebwt.h"
#include "row_chaser.h"
#define RANGE_NOT_SET OFF_MASK
#define RANGE_CACHE_BAD_ALLOC OFF_MASK
/**
* Manages a pool of memory used exclusively for range cache entries.
* This manager is allocate-only; it exists mainly so that we can avoid
* lots of new[]s and delete[]s.
*
* A given stretch of words may be one of two types: a cache entry, or
* a cache entry wrapper. A cache entry has a length and a list of
* already-resolved reference positions. A cache entry wrapper has a
* pointer to a cache entry for a different range, along with an
* integer indicating how many "jumps" to the left that range is from
* the one that owns the wrapper.
*/
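/*
* Illustrative layout sketch (added; not part of the original header). It
* assumes CACHE_WRAPPER_BIT, defined elsewhere in the codebase, is the
* high bit of a TIndexOffU word.
*
*   Cache entry:  [ len | refOff_0 | refOff_1 | ... | refOff_(len-1) ]
*   Wrapper:      [ CACHE_WRAPPER_BIT | jumps ][ pool index of target ]
*
* A reader might decode the first word like this:
*
*   TIndexOffU *w = pool.get(idx);
*   if(w[0] & CACHE_WRAPPER_BIT) {
*       TIndexOffU jumps  = w[0] & ~CACHE_WRAPPER_BIT; // # of LF jumps left
*       TIndexOffU target = w[1];                      // index of real entry
*   } else {
*       TIndexOffU len   = w[0];                       // # of cached offsets
*       TIndexOffU *offs = w + 1;                      // resolved ref offsets
*   }
*/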
class RangeCacheMemPool {
public:
RangeCacheMemPool(TIndexOffU lim /* max cache size in bytes */) :
lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
closed_(false)
{
if(lim_ > 0) {
try {
buf_ = new TIndexOffU[lim_];
if(buf_ == NULL) throw std::bad_alloc();
} catch(std::bad_alloc& e) {
cerr << "Allocation error allocating " << lim
<< " words of range-cache memory" << endl;
throw 1;
}
assert(buf_ != NULL);
// Fill with 1s to signal that these elements are
// uninitialized
memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
}
}
~RangeCacheMemPool() {
// Release all word memory!
if(lim_ > 0) delete[] buf_;
}
/**
* Allocate numElts elements from the word pool.
*/
TIndexOffU alloc(TIndexOffU numElts) {
assert_gt(numElts, 0);
assert_leq(occ_, lim_);
if(occ_ + numElts > lim_ || numElts >= CACHE_WRAPPER_BIT) {
return RANGE_CACHE_BAD_ALLOC;
}
assert_gt(lim_, 0);
TIndexOffU ret = occ_;
assert(allocs_.find(ret) == allocs_.end());
ASSERT_ONLY(allocs_.insert(ret));
// Clear the first elt so that we don't think there's already
// something there
#ifndef NDEBUG
for(TIndexOffU i = 0; i < numElts; i++) {
assert_eq(OFF_MASK, buf_[occ_ + i]);
}
#endif
buf_[occ_] = 0;
occ_ += numElts;
assert_leq(occ_, lim_);
if(lim_ - occ_ < 10) {
// No more room - don't try anymore
closed_ = true;
}
return ret;
}
/**
* Turn a pool-array index into a pointer; check that it doesn't
* fall outside the pool first.
*/
inline TIndexOffU *get(TIndexOffU off) {
assert_gt(lim_, 0);
assert_lt(off, lim_);
assert(allocs_.find(off) != allocs_.end());
TIndexOffU *ret = buf_ + off;
assert_neq(CACHE_WRAPPER_BIT, ret[0]);
assert_neq(OFF_MASK, ret[0]);
return ret;
}
/**
* Return true iff there's no more room in the cache.
*/
inline bool closed() {
return closed_;
}
private:
TIndexOffU lim_; /// limit on number of 32-bit words to dish out in total
TIndexOffU occ_; /// number of occupied words
TIndexOffU *buf_; /// buffer of 32-bit words
bool closed_; /// true iff the pool is (effectively) full and further allocations will fail
#ifndef NDEBUG
std::set<TIndexOffU> allocs_; // elements allocated
#endif
};
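/*
* Minimal usage sketch for RangeCacheMemPool (illustrative only; the byte
* budget of 1024 and the element count of 4 are arbitrary assumptions):
*
*   RangeCacheMemPool pool(1024);        // budget of 1024 bytes
*   TIndexOffU idx = pool.alloc(4);      // reserve 4 words
*   if(idx != RANGE_CACHE_BAD_ALLOC) {
*       TIndexOffU *ent = pool.get(idx); // translate pool index to pointer
*       ent[0] = 3;                      // e.g. store a length word
*   }
*
* The pool is allocate-only: individual allocations are never freed; the
* backing buffer is released only when the pool itself is destroyed.
*/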
/**
* A view to a range of cached reference positions.
*/
class RangeCacheEntry {
typedef Ebwt<String<Dna> > TEbwt;
typedef RowChaser<String<Dna> > TRowChaser;
public:
/**
* Create an empty, invalid entry.
*/
RangeCacheEntry(bool sanity = false) :
top_(OFF_MASK), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
verbose_(false), sanity_(sanity)
{ }
/**
* Create a new RangeCacheEntry from the data in the pool at 'ents'.
*/
RangeCacheEntry(RangeCacheMemPool& pool, TIndexOffU top,
TIndexOffU ent, TEbwt* ebwt, bool sanity = false) :
verbose_(false), sanity_(sanity)
{
init(pool, top, ent, ebwt);
}
/**
* Initialize a RangeCacheEntry from the data in the pool at 'ents'.
*/
void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU ent, TEbwt* ebwt) {
assert(ebwt != NULL);
top_ = top;
ebwt_ = ebwt;
TIndexOffU *ents = pool.get(ent);
assert_neq(CACHE_WRAPPER_BIT, ents[0]);
// Is hi bit set?
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
// If so, the target is a wrapper and the non-hi bits
// contain the # jumps
jumps_ = (ents[0] & ~CACHE_WRAPPER_BIT);
assert_gt(jumps_, 0);
assert_leq(jumps_, ebwt_->_eh._len);
// Get the target entry
TIndexOffU *dest = pool.get(ents[1]);
// Get the length from the target entry
len_ = dest[0];
assert_leq(top_ + len_, ebwt_->_eh._len);
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = dest + 1;
} else {
// Not a wrapper, so there are no jumps
jumps_ = 0;
// Get the length from the target entry
len_ = ents[0];
assert_leq(top_ + len_, ebwt_->_eh._len);
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = ents + 1;
}
assert(sanityCheckEnts());
}
/**
* Initialize a wrapper with given number of jumps and given target
* entry index.
*/
void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU jumps,
TIndexOffU ent, TEbwt* ebwt)
{
assert(ebwt != NULL);
ebwt_ = ebwt;
top_ = top;
jumps_ = jumps;
TIndexOffU *ents = pool.get(ent);
// Must not be a wrapper
assert_eq(0, ents[0] & CACHE_WRAPPER_BIT);
// Get the length from the target entry
len_ = ents[0];
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = ents + 1;
assert_leq(top_ + len_, ebwt_->_eh._len);
assert(sanityCheckEnts());
}
TIndexOffU len() const {
assert(ents_ != NULL);
assert(ebwt_ != NULL);
return len_;
}
TIndexOffU jumps() const {
assert(ents_ != NULL);
assert(ebwt_ != NULL);
return jumps_;
}
/**
* Make this entry invalid by detaching it from its cache entry.
*/
void reset() {
ents_ = NULL;
}
/**
* Return true iff this object represents a valid cache entry.
*/
bool valid() const {
return ents_ != NULL;
}
TEbwt *ebwt() {
return ebwt_;
}
/**
* Install a result obtained by a client of this cache, adjusting for
* how many jumps down the tunnel the cache entry sits.
*/
void install(TIndexOffU elt, TIndexOffU val) {
if(ents_ == NULL) {
// This is not a valid cache entry; do nothing
return;
}
assert(ents_ != NULL);
assert(ebwt_ != NULL);
assert_leq(jumps_, val);
assert_neq(OFF_MASK, val);
assert_leq(top_ + len_, ebwt_->_eh._len);
if(elt < len_) {
val -= jumps_;
if(verbose_) cout << "Installed reference offset: " << (top_ + elt) << endl;
ASSERT_ONLY(TIndexOffU sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
assert_eq(sanity, val);
#ifndef NDEBUG
for(size_t i = 0; i < len_; i++) {
if(i == elt) continue;
assert_neq(val, ents_[i]);
}
#endif
ents_[elt] = val;
} else {
// ignore install request
if(verbose_) cout << "Fell off end of cache entry for install: " << (top_ + elt) << endl;
}
}
/**
* Get an element from the cache, adjusted for tunnel jumps.
*/
inline TIndexOffU get(TIndexOffU elt) const {
if(ents_ == NULL) {
// This is not a valid cache entry; do nothing
return RANGE_NOT_SET;
}
assert(ents_ != NULL);
assert(ebwt_ != NULL);
assert_leq(top_ + len_, ebwt_->_eh._len);
if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
if(verbose_) cout << "Retrieved result from cache: " << (top_ + elt) << endl;
TIndexOffU ret = ents_[elt] + jumps_;
ASSERT_ONLY(TIndexOffU sanity = TRowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
assert_eq(sanity, ret);
return ret;
} else {
if(verbose_) cout << "Cache entry not set: " << (top_ + elt) << endl;
return RANGE_NOT_SET;
}
}
/**
* Check that len_ and the ents_ array both make sense.
*/
static bool sanityCheckEnts(TIndexOffU len, TIndexOffU *ents, TEbwt* ebwt) {
assert_gt(len, 0);
assert_leq(len, ebwt->_eh._len);
if(len < 10) {
for(size_t i = 0; i < len; i++) {
if(ents[i] == OFF_MASK) continue;
assert_leq(ents[i], ebwt->_eh._len);
for(size_t j = i+1; j < len; j++) {
if(ents[j] == OFF_MASK) continue;
assert_neq(ents[i], ents[j]);
}
}
} else {
std::set<TIndexOffU> seen;
for(size_t i = 0; i < len; i++) {
if(ents[i] == OFF_MASK) continue;
assert(seen.find(ents[i]) == seen.end());
seen.insert(ents[i]);
}
}
return true;
}
/**
* Check that len_ and the ents_ array both make sense.
*/
bool sanityCheckEnts() {
return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
}
private:
TIndexOffU top_; /// top pointer for this range
TIndexOffU jumps_; /// how many tunnel-jumps it is away from the requester
TIndexOffU len_; /// # of entries in cache entry
TIndexOffU *ents_; /// ptr to entries, which are flat offs within joined ref
TEbwt *ebwt_; /// index that alignments are in
bool verbose_; /// be talkative?
bool sanity_; /// do consistency checks?
};
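/*
* Usage sketch for RangeCacheEntry (illustrative only; 'pool', 'top',
* 'entIdx', 'ebwtPtr' and 'resolvedOff' are hypothetical values supplied
* by a surrounding RangeCache/aligner, not names from this header):
*
*   RangeCacheEntry ent;
*   ent.init(pool, top, entIdx, ebwtPtr);
*   TIndexOffU off = ent.get(0);         // 0th element of the range
*   if(off == RANGE_NOT_SET) {
*       // resolve the offset some other way (e.g. by walking the BWT),
*       // then install it so that later queries hit the cache
*       ent.install(0, resolvedOff);
*   }
*
* get() adds jumps_ and install() subtracts it, so callers always see
* offsets in the coordinate frame of the range they queried, even when
* the underlying entry belongs to a range further down the tunnel.
*/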
/**
* Caches the reference offsets resolved for BWT ranges so that later
* queries for the same range, or for ranges reachable from it by walking
* left through the "tunnel", can reuse those results.
*/
class RangeCache {
typedef Ebwt<String<Dna> > TEbwt;
typedef std::vector<TIndexOffU> TUVec;
typedef std::map<TIndexOffU, TIndexOffU> TMap;
typedef std::map<TIndexOffU, TIndexOffU>::iterator TMapItr;
public:
RangeCache(TIndexOffU lim, TEbwt* ebwt) :
lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }
/**
* Given top and bot offsets, retrieve the canonical cache entry
* that best covers that range. The cache entry may not directly
* correspond to the top offset provided; rather, it might be an
* entry that lies "at the end of the tunnel" when top and bot are
* walked backward.
*/
bool lookup(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
if(ebwt_ == NULL || lim_ == 0) return false;
assert_gt(bot, top);
ent.reset();
TMapItr itr = map_.find(top);
if(itr == map_.end()) {
// No cache entry for the given 'top' offset
if(closed_) {
return false; // failed to get cache entry
} else {
if(pool_.closed()) {
closed_ = true;
return false; // failed to get cache entry
}
}
// Use the tunnel
bool ret = tunnel(top, bot, ent);
return ret;
} else {
// There is a cache entry for the given 'top' offset
TIndexOffU ret = itr->second;
ent.init(pool_, top, ret, ebwt_);
return true; // success
}
}
/**
* Exhaustively check all entries linked to from map_ to ensure
* they're well-formed.
*/
bool repOk() {
#ifndef NDEBUG
for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
TIndexOffU top = itr->first;
TIndexOffU idx = itr->second;
TIndexOffU jumps = 0;
assert_leq(top, ebwt_->_eh._len);
TIndexOffU *ents = pool_.get(idx);
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
jumps = ents[0] & ~CACHE_WRAPPER_BIT;
assert_leq(jumps, ebwt_->_eh._len);
idx = ents[1];
ents = pool_.get(idx);
}
TIndexOffU len = ents[0];
assert_leq(top + len, ebwt_->_eh._len);
RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
}
#endif
return true;
}
protected:
/**
* Tunnel through to the first range that 1) includes all the same
* suffixes (though longer) as the given range, and 2) has a cache
* entry for it.
*/
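/*
* Worked illustration (added; not from the original source): if the query
* range is [top, bot) with spread = bot - top, each LF step maps it to a
* new range [newtop, newbot). As long as newbot - newtop == spread, the
* new range contains exactly the same suffixes, each extended by one
* character to the left, so the resolved reference offsets differ only by
* the number of jumps taken:
*
*   refOff(queryRange, elt) == refOff(leftmostCachedRange, elt) + jumps
*
* which is why RangeCacheEntry::get()/install() adjust by jumps_.
*/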
bool tunnel(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
assert_gt(bot, top);
TUVec tops;
const TIndexOffU spread = bot - top;
SideLocus tloc, bloc;
SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
TIndexOffU newtop = top, newbot = bot;
TIndexOffU jumps = 0;
// Walk left through the tunnel
while(true) {
if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
// Different characters at top and bot positions of
// BWT; this means that the calls to mapLF below are
// guaranteed to yield rows in two different character-
// sections of the BWT.
break;
}
// Advance top and bot
newtop = ebwt_->mapLF(tloc);
newbot = ebwt_->mapLF(bloc);
assert_geq(newbot, newtop);
assert_leq(newbot - newtop, spread);
// If the new spread is the same as the old spread, we can
// be confident that the new range includes all of the same
// suffixes as the last range (though longer by 1 char)
if((newbot - newtop) == spread) {
// Check if newtop is already cached
TMapItr itr = map_.find(newtop);
jumps++;
if(itr != map_.end()) {
// This range, which is further to the left in the
// same tunnel as the query range, has a cache
// entry already, so use that
TIndexOffU idx = itr->second;
TIndexOffU *ents = pool_.get(idx);
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
// The cache entry we found was a wrapper; make
// a new wrapper that points to that wrapper's
// target, with the appropriate number of jumps
jumps += (ents[0] & ~CACHE_WRAPPER_BIT);
idx = ents[1];
}
// Allocate a new wrapper
TIndexOffU newentIdx = pool_.alloc(2);
if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
// We successfully made a new wrapper entry;
// now populate it and install it in map_
TIndexOffU *newent = pool_.get(newentIdx); // get ptr to it
assert_eq(0, newent[0]);
newent[0] = CACHE_WRAPPER_BIT | jumps; // set jumps
newent[1] = idx; // set target
assert(map_.find(top) == map_.end());
map_[top] = newentIdx;
if(sanity_) assert(repOk());
}
// Initialize the entry
ent.init(pool_, top, jumps, idx, ebwt_);
return true;
}
// Save this range
tops.push_back(newtop);
SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
} else {
// Not all the suffixes were preserved, so we can't
// link the source range's cached result to this
// range's cached results
break;
}
assert_eq(jumps, tops.size());
}
assert_eq(jumps, tops.size());
// Try to create a new cache entry for the leftmost range in
// the tunnel (which might be the query range)
TIndexOffU newentIdx = pool_.alloc(spread + 1);
if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
// Successfully allocated new range cache entry; install it
TIndexOffU *newent = pool_.get(newentIdx);
assert_eq(0, newent[0]);
// Store cache-range length in first word
newent[0] = spread;
assert_lt(newent[0], CACHE_WRAPPER_BIT);
assert_eq(spread, newent[0]);
TIndexOffU entTop = top;
TIndexOffU jumps = 0;
if(tops.size() > 0) {
entTop = tops.back();
jumps = (TIndexOffU)tops.size();
}
// Cache the entry for the end of the tunnel
assert(map_.find(entTop) == map_.end());
map_[entTop] = newentIdx;
if(sanity_) assert(repOk());
ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
assert_eq(spread, newent[0]);
if(jumps > 0) {
assert_neq(entTop, top);
// Cache a wrapper entry for the query range (if possible)
TIndexOffU wrapentIdx = pool_.alloc(2);
if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
TIndexOffU *wrapent = pool_.get(wrapentIdx);
assert_eq(0, wrapent[0]);
wrapent[0] = CACHE_WRAPPER_BIT | jumps;
wrapent[1] = newentIdx;
assert(map_.find(top) == map_.end());
map_[top] = wrapentIdx;
if(sanity_) assert(repOk());
}
}
return true;
} else {
// Could not allocate new range cache entry
return false;
}
}
TIndexOffU lim_; /// Total number of key/val bytes to keep in cache
TMap map_; /// map from range-top offsets to entry indices in the pool
RangeCacheMemPool pool_; /// Memory pool
bool closed_; /// Out of space; no new entries
TEbwt* ebwt_; /// Index that alignments are in
bool sanity_;
};
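/*
* Usage sketch for RangeCache (illustrative only; the 1 MB limit and the
* 'ebwtPtr', 'top', 'bot' and 'elt' values are assumptions, not part of
* this header):
*
*   RangeCache cache(1 << 20, ebwtPtr);  // ~1 MB of cache memory
*   RangeCacheEntry ent;
*   if(cache.lookup(top, bot, ent) && ent.valid()) {
*       TIndexOffU off = ent.get(elt);   // RANGE_NOT_SET if unresolved
*       if(off == RANGE_NOT_SET) {
*           // resolve 'elt' by other means, then cache the result:
*           // ent.install(elt, resolvedOff);
*       }
*   }
*/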
#endif /* RANGE_CACHE_H_ */