代码拉取完成,页面将自动刷新
#ifndef BLOCKWISE_SA_H_
#define BLOCKWISE_SA_H_
#ifdef WITH_TBB
#include <tbb/tbb.h>
#include <tbb/task_group.h>
#endif
#include <stdint.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>
#include <seqan/sequence.h>
#include <seqan/index.h>
#include "assert_helpers.h"
#include "diff_sample.h"
#include "multikey_qsort.h"
#include "random_source.h"
#include "binary_sa_search.h"
#include "zbox.h"
#include "alphabet.h"
#include "timer.h"
#include "auto_array.h"
#include "word_io.h"
using namespace std;
using namespace seqan;
// Helpers for printing verbose messages
#ifndef VMSG_NL
#define VMSG_NL(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__ << endl; \
this->verbose(tmp.str()); \
}
#endif
#ifndef VMSG
#define VMSG(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__; \
this->verbose(tmp.str()); \
}
#endif
#define EBWTB_CAT ((int) 2)
/**
* Abstract parent class for blockwise suffix-array building schemes.
*/
template<typename TStr>
class BlockwiseSA {
public:
BlockwiseSA(const TStr& __text,
TIndexOffU __bucketSz,
bool __sanityCheck = false,
bool __passMemExc = false,
bool __verbose = false,
ostream& __logger = cout) :
_text(__text),
_bucketSz(max<TIndexOffU>(__bucketSz, 2u)),
_sanityCheck(__sanityCheck),
_passMemExc(__passMemExc),
_verbose(__verbose),
_itrBucket(),
_itrBucketPos(OFF_MASK),
_itrPushedBackSuffix(OFF_MASK),
_logger(__logger)
{ }
virtual ~BlockwiseSA()
#if __cplusplus > 199711L
noexcept(false)
#endif
{ }
/**
* Get the next suffix; compute the next bucket if necessary.
*/
virtual TIndexOffU nextSuffix() = 0;
/**
* Return true iff the next call to nextSuffix will succeed.
*/
bool hasMoreSuffixes() {
if(_itrPushedBackSuffix != OFF_MASK) return true;
try {
_itrPushedBackSuffix = nextSuffix();
} catch(out_of_range& e) {
assert_eq(OFF_MASK, _itrPushedBackSuffix);
return false;
}
return true;
}
/**
* Reset the suffix iterator so that the next call to nextSuffix()
* returns the lexicographically-first suffix.
*/
void resetSuffixItr() {
clear(_itrBucket);
_itrBucketPos = OFF_MASK;
_itrPushedBackSuffix = OFF_MASK;
reset();
assert(suffixItrIsReset());
}
/**
* Returns true iff the next call to nextSuffix() returns the
* lexicographically-first suffix.
*/
bool suffixItrIsReset() {
return length(_itrBucket) == 0 &&
_itrBucketPos == OFF_MASK &&
_itrPushedBackSuffix == OFF_MASK &&
isReset();
}
const TStr& text() const { return _text; }
TIndexOffU bucketSz() const { return _bucketSz; }
bool sanityCheck() const { return _sanityCheck; }
bool verbose() const { return _verbose; }
ostream& log() const { return _logger; }
size_t size() const { return length(_text)+1; }
protected:
/// Reset back to the first block
virtual void reset() = 0;
/// Return true iff reset to the first block
virtual bool isReset() = 0;
/**
* Grab the next block of sorted suffixes. The block is guaranteed
* to have at most _bucketSz elements.
*/
virtual void nextBlock(int cur_block, int tid = 0) = 0;
/// Return true iff more blocks are available
virtual bool hasMoreBlocks() const = 0;
/// Optionally output a verbose message
void verbose(const string& s) const {
if(this->verbose()) {
this->log() << s.c_str();
this->log().flush();
}
}
const TStr& _text; /// original string
const TIndexOffU _bucketSz; /// target maximum bucket size
const bool _sanityCheck; /// whether to perform sanity checks
const bool _passMemExc; /// true -> pass on memory exceptions
const bool _verbose; /// be talkative
String<TIndexOffU> _itrBucket; /// current bucket
TIndexOffU _itrBucketPos;/// offset into current bucket
TIndexOffU _itrPushedBackSuffix; /// temporary slot for lookahead
ostream& _logger; /// write log messages here
};
/**
* Abstract parent class for a blockwise suffix array builder that
* always doles out blocks in lexicographical order.
*/
template<typename TStr>
class InorderBlockwiseSA : public BlockwiseSA<TStr> {
public:
InorderBlockwiseSA(const TStr& __text,
TIndexOffU __bucketSz,
bool __sanityCheck = false,
bool __passMemExc = false,
bool __verbose = false,
ostream& __logger = cout) :
BlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger)
{ }
};
/**
* Build the SA a block at a time according to the scheme outlined in
* Karkkainen's "Fast BWT" paper.
*/
template<typename TStr>
class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
public:
typedef DifferenceCoverSample<TStr> TDC;
KarkkainenBlockwiseSA(const TStr& __text,
TIndexOffU __bucketSz,
int __nthreads,
uint32_t __dcV,
uint32_t __seed = 0,
bool __sanityCheck = false,
bool __passMemExc = false,
bool __verbose = false,
string base_fname = "",
ostream& __logger = cout) :
InorderBlockwiseSA<TStr>(__text, __bucketSz, __sanityCheck, __passMemExc, __verbose, __logger),
_sampleSuffs(), _nthreads(__nthreads), _itrBucketIdx(0), _cur(0), _dcV(__dcV), _dc(NULL), _built(false), _base_fname(base_fname), _bigEndian(currentlyBigEndian()), _done(NULL)
#ifdef WITH_TBB
,thread_group_started(false)
#endif
{ _randomSrc.init(__seed); reset(); }
~KarkkainenBlockwiseSA()
#if __cplusplus > 199711L
noexcept(false)
#endif
{
if(_dc != NULL) delete _dc; _dc = NULL; // difference cover sample
if (_done != NULL) {
delete[] _done;
_done = NULL;
}
#ifdef WITH_TBB
tbb_grp.wait();
#else
if(_threads.size() > 0) {
for (size_t tid = 0; tid < _threads.size(); tid++) {
_threads[tid]->join();
delete _threads[tid];
}
}
#endif
}
/**
* Allocate an amount of memory that simulates the peak memory
* usage of the DifferenceCoverSample with the given text and v.
* Throws bad_alloc if it's not going to fit in memory. Returns
* the approximate number of bytes the Cover takes at all times.
*/
static size_t simulateAllocs(const TStr& text, TIndexOffU bucketSz) {
size_t len = length(text);
// _sampleSuffs and _itrBucket are in memory at the peak
size_t bsz = bucketSz;
size_t sssz = len / max<TIndexOffU>(bucketSz-1, 1);
AutoArray<TIndexOffU> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/));
return bsz;
}
//TBB requires a Functor to be passed to the thread group
//hence the nested class
#ifdef WITH_TBB
class nextBlock_Worker {
void *vp;
public:
nextBlock_Worker(const nextBlock_Worker& W): vp(W.vp) {};
nextBlock_Worker(void *vp_):vp(vp_) {};
void operator()() const {
#else
static void nextBlock_Worker(void *vp) {
#endif
pair<KarkkainenBlockwiseSA*, int> param = *(pair<KarkkainenBlockwiseSA*, int>*)vp;
KarkkainenBlockwiseSA* sa = param.first;
int tid = param.second;
while(true) {
size_t cur = 0;
{
ThreadSafe ts(&sa->_mutex);
cur = sa->_cur;
if(cur > length(sa->_sampleSuffs)) break;
sa->_cur++;
}
sa->nextBlock((int)cur, tid);
// Write suffixes into a file
std::ostringstream number; number << cur;
const string fname = sa->_base_fname + "." + number.str() + ".sa";
ofstream sa_file(fname.c_str(), ios::binary);
if(!sa_file.good()) {
cerr << "Could not open file for writing a reference graph: \"" << fname << "\"" << endl;
throw 1;
}
const String<TIndexOffU>& bucket = sa->_itrBuckets[tid];
writeU<TIndexOffU>(sa_file, (TIndexOffU)length(bucket), sa->_bigEndian);
for(size_t i = 0; i < length(bucket); i++) {
writeU<TIndexOffU>(sa_file, bucket[i], sa->_bigEndian);
}
sa_file.close();
clear(sa->_itrBuckets[tid]);
sa->_done[cur] = true;
}
}
#ifdef WITH_TBB
};
#endif
/**
* Get the next suffix; compute the next bucket if necessary.
*/
virtual TIndexOffU nextSuffix() {
// Launch threads if not
if(this->_nthreads > 1) {
#ifdef WITH_TBB
if(!thread_group_started)
{
#else
if(_threads.size() == 0) {
#endif
_done = new volatile bool[length(_sampleSuffs)+1];
for (int i = 0; i < length(_sampleSuffs) + 1; i++) {
_done[i] = false;
}
resize(_itrBuckets, this->_nthreads);
_tparams.resize(this->_nthreads);
for(int tid = 0; tid < this->_nthreads; tid++) {
_tparams[tid].first = this;
_tparams[tid].second = tid;
#ifdef WITH_TBB
tbb_grp.run(nextBlock_Worker((void*)&_tparams[tid]));
}
thread_group_started = true;
}
#else
_threads.push_back(new tthread::thread(nextBlock_Worker, (void*)&_tparams[tid]));
}
assert_eq(_threads.size(), (size_t)this->_nthreads);
}
#endif
}
if(this->_itrPushedBackSuffix != OFF_MASK) {
TIndexOffU tmp = this->_itrPushedBackSuffix;
this->_itrPushedBackSuffix = OFF_MASK;
return tmp;
}
while(this->_itrBucketPos >= length(this->_itrBucket) ||
length(this->_itrBucket) == 0)
{
if(!hasMoreBlocks()) {
throw out_of_range("No more suffixes");
}
if(this->_nthreads == 1) {
nextBlock((int)_cur);
_cur++;
} else {
while(!_done[this->_itrBucketIdx]) {
SLEEP(1);
}
// Read suffixes from a file
std::ostringstream number; number << this->_itrBucketIdx;
const string fname = _base_fname + "." + number.str() + ".sa";
ifstream sa_file(fname.c_str(), ios::binary);
if(!sa_file.good()) {
cerr << "Could not open file for reading a reference graph: \"" << fname << "\"" << endl;
throw 1;
}
size_t numSAs = readU<TIndexOffU>(sa_file, _bigEndian);
resize(this->_itrBucket, numSAs);
for(size_t i = 0; i < numSAs; i++) {
this->_itrBucket[i] = readU<TIndexOffU>(sa_file, _bigEndian);
}
sa_file.close();
std::remove(fname.c_str());
}
this->_itrBucketIdx++;
this->_itrBucketPos = 0;
}
return this->_itrBucket[this->_itrBucketPos++];
}
/// Defined in blockwise_sa.cpp
virtual void nextBlock(int cur_block, int tid = 0);
/// Defined in blockwise_sa.cpp
virtual void qsort(String<TIndexOffU>& bucket);
/// Return true iff more blocks are available
virtual bool hasMoreBlocks() const {
return this->_itrBucketIdx <= length(_sampleSuffs);
}
/// Return the difference-cover period
uint32_t dcV() const { return _dcV; }
protected:
/**
* Initialize the state of the blockwise suffix sort. If the
* difference cover sample and the sample set have not yet been
* built, build them. Then reset the block cursor to point to
* the first block.
*/
virtual void reset() {
if(!_built) {
build();
}
assert(_built);
_cur = 0;
}
/// Return true iff we're about to dole out the first bucket
virtual bool isReset() {
return _cur == 0;
}
private:
/**
* Calculate the difference-cover sample and sample suffixes.
*/
void build() {
// Calculate difference-cover sample
assert(_dc == NULL);
if(_dcV != 0) {
_dc = new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck());
_dc->build(this->_nthreads);
}
// Calculate sample suffixes
if(this->bucketSz() <= length(this->text())) {
VMSG_NL("Building samples");
buildSamples();
} else {
VMSG_NL("Skipping building samples since text length " <<
length(this->text()) << " is less than bucket size: " <<
this->bucketSz());
}
_built = true;
}
/**
* Calculate the lcp between two suffixes using the difference
* cover as a tie-breaker. If the tie-breaker is employed, then
* the calculated lcp may be an underestimate.
*
* Defined in blockwise_sa.cpp
*/
inline bool tieBreakingLcp(TIndexOffU aOff,
TIndexOffU bOff,
TIndexOffU& lcp,
bool& lcpIsSoft);
/**
* Compare two suffixes using the difference-cover sample.
*/
inline bool suffixCmp(TIndexOffU cmp,
TIndexOffU i,
int64_t& j,
int64_t& k,
bool& kSoft,
const String<TIndexOffU>& z);
void buildSamples();
String<TIndexOffU> _sampleSuffs; /// sample suffixes
int _nthreads; /// # of threads
TIndexOffU _itrBucketIdx;
TIndexOffU _cur; /// offset to 1st elt of next block
const uint32_t _dcV; /// difference-cover periodicity
TDC* _dc; /// queryable difference-cover data
bool _built; /// whether samples/DC have been built
RandomSource _randomSrc; /// source of pseudo-randoms
MUTEX_T _mutex; /// synchronization of output message
string _base_fname; /// base file name for storing SA blocks
bool _bigEndian; /// bigEndian?
#ifdef WITH_TBB
tbb::task_group tbb_grp;/// thread "list" via Intel TBB
bool thread_group_started;
#else
std::vector<tthread::thread*> _threads; /// thread list
#endif
std::vector<pair<KarkkainenBlockwiseSA*, int> > _tparams;
String<String<TIndexOffU> > _itrBuckets; /// buckets
volatile bool* _done; /// is a block processed?
};
/**
* Qsort the set of suffixes whose offsets are in 'bucket'.
*/
template<typename TStr>
void KarkkainenBlockwiseSA<TStr>::qsort(String<TIndexOffU>& bucket) {
typedef typename Value<TStr>::Type TAlphabet;
const TStr& t = this->text();
TIndexOffU *s = begin(bucket);
TIndexOffU slen = (TIndexOffU)seqan::length(bucket);
TIndexOffU len = (TIndexOffU)seqan::length(t);
if(_dc != NULL) {
// Use the difference cover as a tie-breaker if we have it
VMSG_NL(" (Using difference cover)");
// Extract the 'host' array because it's faster to work
// with than the String<> container
uint8_t *host = (uint8_t*)t.data_begin;
mkeyQSortSufDcU8(t, host, len, s, slen, *_dc,
ValueSize<TAlphabet>::VALUE,
this->verbose(), this->sanityCheck());
} else {
VMSG_NL(" (Not using difference cover)");
// We don't have a difference cover - just do a normal
// suffix sort
mkeyQSortSuf(t, s, slen, ValueSize<TAlphabet>::VALUE,
this->verbose(), this->sanityCheck());
}
}
/**
* Qsort the set of suffixes whose offsets are in 'bucket'. This
* specialization for packed strings does not attempt to extract and
* operate directly on the host string; the fact that the string is
* packed means that the array cannot be sorted directly.
*/
template<>
void KarkkainenBlockwiseSA<String<Dna, Packed<> > >::qsort(String<TIndexOffU>& bucket) {
const String<Dna, Packed<> >& t = this->text();
TIndexOffU *s = begin(bucket);
TIndexOffU slen = (TIndexOffU)seqan::length(bucket);
TIndexOffU len = (TIndexOffU)seqan::length(t);
if(_dc != NULL) {
// Use the difference cover as a tie-breaker if we have it
VMSG_NL(" (Using difference cover)");
// Can't use the text's 'host' array because the backing
// store for the packed string is not one-char-per-elt.
mkeyQSortSufDcU8(t, t, len, s, slen, *_dc,
ValueSize<Dna>::VALUE,
this->verbose(), this->sanityCheck());
} else {
VMSG_NL(" (Not using difference cover)");
// We don't have a difference cover - just do a normal
// suffix sort
mkeyQSortSuf(t, s, slen, ValueSize<Dna>::VALUE,
this->verbose(), this->sanityCheck());
}
}
template<typename TStr>
struct BinarySortingParam {
const TStr* t;
const String<TIndexOffU>* sampleSuffs;
String<TIndexOffU> bucketSzs;
String<TIndexOffU> bucketReps;
size_t begin;
size_t end;
};
template<typename TStr>
#ifdef WITH_TBB
class BinarySorting_worker {
void *vp;
public:
BinarySorting_worker(const BinarySorting_worker& W): vp(W.vp) {};
BinarySorting_worker(void *vp_):vp(vp_) {};
void operator()() const
{
#else
static void BinarySorting_worker(void *vp)
{
#endif
BinarySortingParam<TStr>* param = (BinarySortingParam<TStr>*)vp;
const TStr& t = *(param->t);
size_t len = length(t);
const String<TIndexOffU>& sampleSuffs = *(param->sampleSuffs);
String<TIndexOffU>& bucketSzs = param->bucketSzs;
String<TIndexOffU>& bucketReps = param->bucketReps;
ASSERT_ONLY(size_t numBuckets = length(bucketSzs));
size_t begin = param->begin;
size_t end = param->end;
// Iterate through every suffix in the text, determine which
// bucket it falls into by doing a binary search across the
// sorted list of samples, and increment a counter associated
// with that bucket. Also, keep one representative for each
// bucket so that we can split it later. We loop in ten
// stretches so that we can print out a helpful progress
// message. (This step can take a long time.)
for(TIndexOffU i = (TIndexOffU)begin; i < end && i < len; i++) {
TIndexOffU r = binarySASearch(t, i, sampleSuffs);
if(r == std::numeric_limits<TIndexOffU>::max()) continue; // r was one of the samples
assert_lt(r, numBuckets);
bucketSzs[r]++;
assert_lt(bucketSzs[r], len);
if(bucketReps[r] == OFF_MASK || (i & 100) == 0) {
bucketReps[r] = i; // clobbers previous one, but that's OK
}
}
}
#ifdef WITH_TBB
};
#endif
/**
* Select a set of bucket-delineating sample suffixes such that no
* bucket is greater than the requested upper limit. Some care is
* taken to make each bucket's size close to the limit without
* going over.
*/
template<typename TStr>
void KarkkainenBlockwiseSA<TStr>::buildSamples() {
typedef typename Value<TStr>::Type TAlphabet;
const TStr& t = this->text();
TIndexOffU bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
size_t len = length(this->text());
// Prepare _sampleSuffs array
clear(_sampleSuffs);
TIndexOffU numSamples = (TIndexOffU)((len/bsz)+1)<<1; // ~len/bsz x 2
assert_gt(numSamples, 0);
VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
if(this->_passMemExc) {
reserve(_sampleSuffs, numSamples, Exact());
// Randomly generate samples. Allow duplicates for now.
VMSG_NL("Generating random suffixes");
for(size_t i = 0; i < numSamples; i++) {
#ifdef BOWTIE_64BIT_INDEX
appendValue(_sampleSuffs, (TIndexOffU)(_randomSrc.nextU64() % len));
#else
appendValue(_sampleSuffs, (TIndexOffU)(_randomSrc.nextU32() % len));
#endif
}
} else {
try {
reserve(_sampleSuffs, numSamples, Exact());
// Randomly generate samples. Allow duplicates for now.
VMSG_NL("Generating random suffixes");
for(size_t i = 0; i < numSamples; i++) {
#ifdef BOWTIE_64BIT_INDEX
appendValue(_sampleSuffs, (TIndexOffU)(_randomSrc.nextU64() % len));
#else
appendValue(_sampleSuffs, (TIndexOffU)(_randomSrc.nextU32() % len));
#endif
}
} catch(bad_alloc &e) {
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Could not allocate sample suffix container of " << (numSamples * OFF_SIZE) << " bytes." << endl
<< "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
<< "a smaller --bmaxdivn" << endl;
throw 1;
}
}
}
// Remove duplicates; very important to do this before the call to
// mkeyQSortSuf so that it doesn't try to calculate lexicographical
// relationships between very long, identical strings, which takes
// an extremely long time in general, and causes the stack to grow
// linearly with the size of the input
{
Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
VMSG_NL("QSorting " << length(_sampleSuffs) << " sample offsets, eliminating duplicates");
sort(begin(_sampleSuffs), end(_sampleSuffs));
size_t sslen = length(_sampleSuffs);
for(size_t i = 0; i < sslen-1; i++) {
if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
erase(_sampleSuffs, i--);
sslen--;
}
}
}
// Multikey quicksort the samples
{
Timer timer(cout, " Multikey QSorting samples time: ", this->verbose());
VMSG_NL("Multikey QSorting " << length(_sampleSuffs) << " samples");
this->qsort(_sampleSuffs);
}
// Calculate bucket sizes
VMSG_NL("Calculating bucket sizes");
int limit = 5;
// Iterate until all buckets are less than
while(--limit >= 0) {
TIndexOffU numBuckets = (TIndexOffU)length(_sampleSuffs)+1;
#ifdef WITH_TBB
tbb::task_group tbb_grp;
#else
AutoArray<tthread::thread*> threads(this->_nthreads);
#endif
String<BinarySortingParam<TStr> > tparams;
resize(tparams, this->_nthreads);
for(int tid = 0; tid < this->_nthreads; tid++) {
// Calculate bucket sizes by doing a binary search for each
// suffix and noting where it lands
try {
// Allocate and initialize containers for holding bucket
// sizes and representatives.
fill(tparams[tid].bucketSzs, numBuckets, 0, Exact());
fill(tparams[tid].bucketReps, numBuckets, OFF_MASK, Exact());
} catch(bad_alloc &e) {
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
<< "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
<< "smaller --bmaxdivn." << endl;
throw 1;
}
}
tparams[tid].t = &t;
tparams[tid].sampleSuffs = &_sampleSuffs;
tparams[tid].begin = (tid == 0 ? 0 : len / this->_nthreads * tid);
tparams[tid].end = (tid + 1 == this->_nthreads ? len : len / this->_nthreads * (tid + 1));
if(this->_nthreads == 1) {
BinarySorting_worker<TStr>((void*)&tparams[tid]);
} else {
#ifdef WITH_TBB
tbb_grp.run(BinarySorting_worker<TStr>(((void*)&tparams[tid])));
}
}
tbb_grp.wait();
#else
threads[tid] = new tthread::thread(BinarySorting_worker<TStr>, (void*)&tparams[tid]);
}
}
if(this->_nthreads > 1) {
for (int tid = 0; tid < this->_nthreads; tid++) {
threads[tid]->join();
}
}
#endif
String<TIndexOffU>& bucketSzs = tparams[0].bucketSzs;
String<TIndexOffU>& bucketReps = tparams[0].bucketReps;
for(int tid = 1; tid < this->_nthreads; tid++) {
for(size_t j = 0; j < numBuckets; j++) {
bucketSzs[j] += tparams[tid].bucketSzs[j];
if(bucketReps[j] == OFF_MASK) {
bucketReps[j] = tparams[tid].bucketReps[j];
}
}
}
// Check for large buckets and mergeable pairs of small buckets
// and split/merge as necessary
TIndexOff added = 0;
TIndexOff merged = 0;
assert_eq(length(bucketSzs), numBuckets);
assert_eq(length(bucketReps), numBuckets);
{
Timer timer(cout, " Splitting and merging time: ", this->verbose());
VMSG_NL("Splitting and merging");
for(TIndexOffU i = 0; i < numBuckets; i++) {
TIndexOffU mergedSz = bsz + 1;
assert(bucketSzs[i] == 0 || bucketReps[i] != OFF_MASK);
if(i < (TIndexOffU)numBuckets-1) {
mergedSz = bucketSzs[(size_t)i] + bucketSzs[(size_t)i+1] + 1;
}
// Merge?
if(mergedSz <= bsz) {
bucketSzs[i+1] += (bucketSzs[i]+1);
// The following may look strange, but it's necessary
// to ensure that the merged bucket has a representative
bucketReps[i+1] = _sampleSuffs[i+added];
erase(_sampleSuffs, i+added);
erase(bucketSzs, i);
erase(bucketReps, i);
i--; // might go to -1 but ++ will overflow back to 0
numBuckets--;
merged++;
assert_eq(numBuckets, length(_sampleSuffs)+1-added);
assert_eq(numBuckets, length(bucketSzs));
}
// Split?
else if(bucketSzs[i] > bsz) {
// Add an additional sample from the bucketReps[]
// set accumulated in the binarySASearch loop; this
// effectively splits the bucket
insertValue(_sampleSuffs, TIndexOffU(i + (added++)), bucketReps[i]);
}
}
}
if(added == 0) {
//if(this->verbose()) {
// cout << "Final bucket sizes:" << endl;
// cout << " (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
// for(uint32_t i = 1; i < numBuckets; i++) {
// cout << " " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
// }
//}
break;
}
// Otherwise, continue until no more buckets need to be
// split
VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
}
// Do *not* force a do-over
// if(limit == 0) {
// VMSG_NL("Iterated too many times; trying again...");
// buildSamples();
// }
VMSG_NL("Avg bucket size: " << ((double)(len-length(_sampleSuffs)) / (length(_sampleSuffs)+1)) << " (target: " << bsz << ")");
}
/**
* Do a simple LCP calculation on two strings.
*/
template<typename T> inline
static TIndexOffU suffixLcp(const T& t, TIndexOffU aOff, TIndexOffU bOff) {
TIndexOffU c = 0;
size_t len = length(t);
assert_leq(aOff, len);
assert_leq(bOff, len);
while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
return c;
}
/**
* Calculate the lcp between two suffixes using the difference
* cover as a tie-breaker. If the tie-breaker is employed, then
* the calculated lcp may be an underestimate. If the tie-breaker is
* employed, lcpIsSoft will be set to true (otherwise, false).
*/
template<typename TStr> inline
bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(TIndexOffU aOff,
TIndexOffU bOff,
TIndexOffU& lcp,
bool& lcpIsSoft)
{
const TStr& t = this->text();
TIndexOffU c = 0;
TIndexOffU tlen = TIndexOffU(length(t));
assert_leq(aOff, tlen);
assert_leq(bOff, tlen);
assert(_dc != NULL);
uint32_t dcDist = _dc->tieBreakOff(aOff, bOff);
lcpIsSoft = false; // hard until proven soft
while(c < dcDist && // we haven't hit the tie breaker
c < tlen-aOff && // we haven't fallen off of LHS suffix
c < tlen-bOff && // we haven't fallen off of RHS suffix
t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
c++;
lcp = c;
if(c == tlen-aOff) {
// Fell off LHS (a), a is greater
return false;
} else if(c == tlen-bOff) {
// Fell off RHS (b), b is greater
return true;
} else if(c == dcDist) {
// Hit a tie-breaker element
lcpIsSoft = true;
assert_neq(dcDist, 0xffffffff);
return _dc->breakTie(aOff+c, bOff+c) < 0;
} else {
assert_neq(t[aOff+c], t[bOff+c]);
return t[aOff+c] < t[bOff+c];
}
}
/**
* Lookup a suffix LCP in the given z array; if the element is not
* filled in then calculate it from scratch.
*/
template<typename T>
static TIndexOffU lookupSuffixZ(
const T& t,
TIndexOffU zOff,
TIndexOffU off,
const String<TIndexOffU>& z)
{
if(zOff < length(z)) {
TIndexOffU ret = z[zOff];
assert_eq(ret, suffixLcp(t, off + zOff, off));
return ret;
}
assert_leq(off + zOff, length(t));
return suffixLcp(t, off + zOff, off);
}
/**
* true -> i < cmp
* false -> i > cmp
*/
template<typename TStr> inline
bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
TIndexOffU cmp,
TIndexOffU i,
int64_t& j,
int64_t& k,
bool& kSoft,
const String<TIndexOffU>& z)
{
const TStr& t = this->text();
TIndexOffU len = TIndexOffU(length(t));
// i is not covered by any previous match
TIndexOffU l;
if((int64_t)i > k) {
k = i; // so that i + lHi == kHi
l = 0; // erase any previous l
kSoft = false;
// To be extended
}
// i is covered by a previous match
else /* i <= k */ {
assert_gt((int64_t)i, j);
TIndexOffU zIdx = (TIndexOffU)(i-j);
assert_leq(zIdx, len-cmp);
if(zIdx < _dcV || _dc == NULL) {
// Go as far as the Z-box says
l = lookupSuffixZ(t, zIdx, cmp, z);
if(i + l > len) {
l = len-i;
}
assert_leq(i + l, len);
// Possibly to be extended
} else {
// But we're past the point of no-more-Z-boxes
bool ret = tieBreakingLcp(i, cmp, l, kSoft);
// Sanity-check tie-breaker
if(this->sanityCheck()) {
if(ret) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
else assert(dollarGt(suffix(t, i), suffix(t, cmp)));
}
j = i;
k = i + l;
if(this->sanityCheck()) {
if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
else { assert_eq (l, suffixLcp(t, i, cmp)); }
}
return ret;
}
}
// Z box extends exactly as far as previous match (or there
// is neither a Z box nor a previous match)
if((int64_t)(i + l) == k) {
// Extend
while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
k++; l++;
}
j = i; // update furthest-extending LHS
kSoft = false;
assert_eq(l, suffixLcp(t, i, cmp));
}
// Z box extends further than previous match
else if((int64_t)(i + l) > k) {
l = (TIndexOffU)(k - i); // point to just after previous match
j = i; // update furthest-extending LHS
if(kSoft) {
while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
k++; l++;
}
kSoft = false;
assert_eq(l, suffixLcp(t, i, cmp));
} else assert_eq(l, suffixLcp(t, i, cmp));
}
// Check that calculated lcp matches actual lcp
if(this->sanityCheck()) {
if(!kSoft) {
// l should exactly match lcp
assert_eq(l, suffixLcp(t, i, cmp));
} else {
// l is an underestimate of LCP
assert_leq(l, suffixLcp(t, i, cmp));
}
}
assert_leq(l+i, len);
assert_leq(l, len-cmp);
// i and cmp should not be the same suffix
assert(l != len-cmp || i+l != len);
// Now we're ready to do a comparison on the next char
if(l+i != len && (
l == len-cmp || // departure from paper algorithm:
// falling off pattern implies
// pattern is *greater* in our case
t[i + l] < t[cmp + l]))
{
// Case 2: Text suffix is less than upper sample suffix
if(this->sanityCheck()) assert(dollarLt(suffix(t, i), suffix(t, cmp)));
return true; // suffix at i is less than suffix at cmp
}
else {
// Case 3: Text suffix is greater than upper sample suffix
if(this->sanityCheck()) assert(dollarGt(suffix(t, i), suffix(t, cmp)));
return false; // suffix at i is less than suffix at cmp
}
}
/**
* Retrieve the next block. This is the most performance-critical part
* of the blockwise suffix sorting process.
*/
template<typename TStr>
void KarkkainenBlockwiseSA<TStr>::nextBlock(int cur_block, int tid) {
#ifndef NDEBUG
if(this->_nthreads > 1) {
assert_lt(tid, length(this->_itrBuckets));
}
#endif
String<TIndexOffU>& bucket = (this->_nthreads > 1 ? this->_itrBuckets[tid] : this->_itrBucket);
{
ThreadSafe ts(&_mutex);
VMSG_NL("Getting block " << (cur_block+1) << " of " << length(_sampleSuffs)+1);
}
assert(_built);
assert_gt(_dcV, 3);
assert_leq(cur_block, length(_sampleSuffs));
const TStr& t = this->text();
TIndexOffU len = (TIndexOffU)length(t);
// Set up the bucket
clear(bucket);
TIndexOffU lo = OFF_MASK, hi = OFF_MASK;
if(length(_sampleSuffs) == 0) {
// Special case: if _sampleSuffs is 0, then multikey-quicksort
// everything
{
ThreadSafe ts(&_mutex);
VMSG_NL(" No samples; assembling all-inclusive block");
}
assert_eq(0, cur_block);
try {
if(capacity(bucket) < this->bucketSz()) {
reserve(bucket, len+1);
}
resize(bucket, len);
for(TIndexOffU i = 0; i < len; i++) {
bucket[i] = i;
}
} catch(bad_alloc &e) {
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
<< "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
<< "a larger --bmaxdivn" << endl;
throw 1;
}
}
} else {
try {
{
ThreadSafe ts(&_mutex);
VMSG_NL(" Reserving size (" << this->bucketSz() << ") for bucket " << (cur_block+1));
}
// BTL: Add a +100 fudge factor; there seem to be instances
// where a bucket ends up having one more elt than bucketSz()
if(length(bucket) < this->bucketSz()+100) {
reserve(bucket, this->bucketSz()+100);
}
} catch(bad_alloc &e) {
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
<< "a larger --bmaxdivn" << endl;
throw 1;
}
}
// Select upper and lower bounds from _sampleSuffs[] and
// calculate the Z array up to the difference-cover periodicity
// for both. Be careful about first/last buckets.
//String<TIndexOffU> zLo(EBWTB_CAT), zHi(EBWTB_CAT);
String<TIndexOffU> zLo, zHi;
assert_geq(cur_block, 0);
assert_leq((size_t)cur_block, length(_sampleSuffs));
bool first = (cur_block == 0);
bool last = ((size_t)cur_block == length(_sampleSuffs));
try {
// Timer timer(cout, " Calculating Z arrays time: ", this->verbose());
{
ThreadSafe ts(&_mutex);
VMSG_NL(" Calculating Z arrays for bucket " << (cur_block+1));
}
if(!last) {
// Not the last bucket
assert_lt(cur_block, length(_sampleSuffs));
hi = _sampleSuffs[cur_block];
//zHi.resizeExact(_dcV);
//zHi.fillZero();
fill(zHi, _dcV, 0, Exact());
assert_eq(getValue(zHi, 0), 0);
calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
}
if(!first) {
// Not the first bucket
assert_gt(cur_block, 0);
assert_leq(cur_block, length(_sampleSuffs));
lo = _sampleSuffs[cur_block-1];
//zLo.resizeExact(_dcV);
//zLo.fillZero();
fill(zLo, _dcV, 0, Exact());
assert_gt(_dcV, 3);
assert_eq(getValue(zLo, 0), 0);
calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
}
} catch(bad_alloc &e) {
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
<< "a larger --bmaxdivn" << endl;
throw 1;
}
}
// This is the most critical loop in the algorithm; this is where
// we iterate over all suffixes in the text and pick out those that
// fall into the current bucket.
//
// This loop is based on the SMALLERSUFFIXES function outlined on
// p7 of the "Fast BWT" paper
//
int64_t kHi = -1, kLo = -1;
int64_t jHi = -1, jLo = -1;
bool kHiSoft = false, kLoSoft = false;
assert_eq(0, length(bucket));
{
// Timer timer(cout, " Block accumulator loop time: ", this->verbose());
{
ThreadSafe ts(&_mutex);
VMSG_NL(" Entering block accumulator loop for bucket " << (cur_block+1) << ":");
}
TIndexOffU lenDiv10 = (len + 9) / 10;
for(TIndexOffU iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
TIndexOffU itenNext = iten + lenDiv10;
{
ThreadSafe ts(&_mutex);
if(ten > 0) VMSG_NL(" bucket " << (cur_block+1) << ": " << (ten * 10) << "%");
}
for(TIndexOffU i = iten; i < itenNext && i < len; i++) {
assert_lt(jLo, (TIndexOff)i); assert_lt(jHi, (TIndexOff)i);
// Advance the upper-bound comparison by one character
if(i == hi || i == lo) continue; // equal to one of the bookends
if(hi != OFF_MASK && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
continue; // not in the bucket
}
if(lo != OFF_MASK && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
continue; // not in the bucket
}
// In the bucket! - add it
assert_lt(i, len);
try {
appendValue(bucket, i);
} catch(bad_alloc &e) {
cerr << "Could not append element to block of " << ((length(bucket)) * OFF_SIZE) << " bytes" << endl;
if(this->_passMemExc) {
throw e; // rethrow immediately
} else {
cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
<< "a larger --bmaxdivn" << endl;
throw 1;
}
}
// Not necessarily true; we allow overflowing buckets
// since we can't guarantee that a good set of sample
// suffixes can be found in a reasonable amount of time
//assert_lt(bucket.size(), this->bucketSz());
}
} // end loop over all suffixes of t
{
ThreadSafe ts(&_mutex);
VMSG_NL(" bucket " << (cur_block+1) << ": 100%");
}
}
} // end else clause of if(_sampleSuffs.size() == 0)
// Sort the bucket
if(length(bucket) > 0) {
Timer timer(cout, " Sorting block time: ", this->verbose());
{
ThreadSafe ts(&_mutex);
VMSG_NL(" Sorting block of length " << length(bucket) << " for bucket " << (cur_block+1));
}
this->qsort(bucket);
}
if(hi != OFF_MASK) {
// Not the final bucket; throw in the sample on the RHS
appendValue(bucket, hi);
} else {
// Final bucket; throw in $ suffix
appendValue(bucket, len);
}
{
ThreadSafe ts(&_mutex);
VMSG_NL("Returning block of " << length(bucket) << " for bucket " << (cur_block+1));
}
}
#endif /*BLOCKWISE_SA_H_*/
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。