1 Star 0 Fork 0

Velcon-Zheng/bowtie

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
diff_sample.h 32.82 KB
一键复制 编辑 原始数据 按行查看 历史
BenLangmead 提交于 2017-06-07 12:13 . fix sPrimeSz indexing
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119
#ifndef DIFF_SAMPLE_H_
#define DIFF_SAMPLE_H_
#ifdef WITH_TBB
#include <tbb/tbb.h>
#include <tbb/task_group.h>
#endif
#include <stdint.h>
#include <seqan/sequence.h>
#include <seqan/index.h> // for LarssonSadakane
#include "assert_helpers.h"
#include "multikey_qsort.h"
#include "timer.h"
#include "auto_array.h"
#include "btypes.h"
using namespace std;
using namespace seqan;
#ifndef VMSG_NL
#define VMSG_NL(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__ << endl; \
this->verbose(tmp.str()); \
}
#endif
#ifndef VMSG
#define VMSG(...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << __VA_ARGS__; \
this->verbose(tmp.str()); \
}
#endif
/**
* Routines for calculating, sanity-checking, and dispensing difference
* cover samples to clients.
*/
/**
*
*/
struct sampleEntry {
uint32_t maxV;
uint32_t numSamples;
uint32_t samples[128];
};
/// Array of Colbourn and Ling calculated difference covers up to
/// r = 16 (maxV = 5953)
static struct sampleEntry clDCs[16];
static bool clDCs_calced = false; /// have clDCs been calculated?
/**
* Check that the given difference cover 'ds' actually covers all
* differences for a periodicity of v.
*/
template<typename T>
static bool dcRepOk(T v, String<T>& ds) {
// diffs[] records all the differences observed
bool *covered = new bool[v];
for(T i = 1; i < v; i++) {
covered[i] = false;
}
for(T di = T(); di < length(ds); di++) {
for(T dj = di+1; dj < length(ds); dj++) {
assert_lt(ds[di], ds[dj]);
T d1 = (ds[dj] - ds[di]);
T d2 = (ds[di] + v - ds[dj]);
assert_lt(d1, v);
assert_lt(d2, v);
covered[d1] = true;
covered[d2] = true;
}
}
bool ok = true;
for(T i = 1; i < v; i++) {
if(covered[i] == false) {
ok = false;
break;
}
}
delete[] covered;
return ok;
}
/**
* Return true iff each element of ts (with length 'limit') is greater
* than the last.
*/
template<typename T>
static bool increasing(T* ts, size_t limit) {
for(size_t i = 0; i < limit-1; i++) {
if(ts[i+1] <= ts[i]) return false;
}
return true;
}
/**
* Return true iff the given difference cover covers difference 'diff'
* mod 'v'.
*/
template<typename T>
static inline bool hasDifference(T *ds, T d, T v, T diff) {
// diffs[] records all the differences observed
for(T di = T(); di < d; di++) {
for(T dj = di+1; dj < d; dj++) {
assert_lt(ds[di], ds[dj]);
T d1 = (ds[dj] - ds[di]);
T d2 = (ds[di] + v - ds[dj]);
assert_lt(d1, v);
assert_lt(d2, v);
if(d1 == diff || d2 == diff) return true;
}
}
return false;
}
/**
* Exhaustively calculate optimal difference cover samples for v = 4,
* 8, 16, 32, 64, 128, 256 and store results in p2DCs[]
*/
template<typename T>
void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) {
T v = i;
bool *diffs = new bool[v];
// v is the target period
T ld = (T)ceil(sqrt(v));
// ud is the upper bound on |D|
T ud = v / 2;
// for all possible |D|s
bool ok = true;
T *ds = NULL;
T d;
for(d = ld; d <= ud+1; d++) {
// for all possible |D| samples
ds = new T[d];
for(T j = 0; j < d; j++) {
ds[j] = j;
}
assert(increasing(ds, d));
while(true) {
// reset diffs[]
for(T t = 1; t < v; t++) {
diffs[t] = false;
}
T diffCnt = 0;
// diffs[] records all the differences observed
for(T di = 0; di < d; di++) {
for(T dj = di+1; dj < d; dj++) {
assert_lt(ds[di], ds[dj]);
T d1 = (ds[dj] - ds[di]);
T d2 = (ds[di] + v - ds[dj]);
assert_lt(d1, v);
assert_lt(d2, v);
assert_gt(d1, 0);
assert_gt(d2, 0);
if(!diffs[d1]) diffCnt++; diffs[d1] = true;
if(!diffs[d2]) diffCnt++; diffs[d2] = true;
}
}
// Do we observe all possible differences (except 0)
ok = diffCnt == v-1;
if(ok) {
// Yes, all differences are covered
break;
} else {
// Advance ds
// (Following is commented out because it turns out
// it's slow)
// Find a missing difference
//uint32_t missing = 0xffffffff;
//for(uint32_t t = 1; t < v; t++) {
// if(diffs[t] == false) {
// missing = diffs[t];
// break;
// }
//}
//assert_neq(missing, 0xffffffff);
assert(increasing(ds, d));
bool advanced = false;
bool keepGoing = false;
do {
keepGoing = false;
for(T bd = d-1; bd > 1; bd--) {
T dif = (d-1)-bd;
if(ds[bd] < v-1-dif) {
ds[bd]++;
assert_neq(0, ds[bd]);
// Reset subsequent ones
for(T bdi = bd+1; bdi < d; bdi++) {
assert_eq(0, ds[bdi]);
ds[bdi] = ds[bdi-1]+1;
assert_gt(ds[bdi], ds[bdi-1]);
}
assert(increasing(ds, d));
// (Following is commented out because
// it turns out it's slow)
// See if the new DC has the missing value
//if(!hasDifference(ds, d, v, missing)) {
// keepGoing = true;
// break;
//}
advanced = true;
break;
} else {
ds[bd] = 0;
// keep going
}
}
} while(keepGoing);
// No solution for this |D|
if(!advanced) break;
assert(increasing(ds, d));
}
} // next sample assignment
if(ok) {
break;
}
delete[] ds;
} // next |D|
assert(ok);
delete[] diffs;
cout << "Did exhaustive v=" << v << " |D|=" << d << endl;
cout << " ";
for(T i = 0; i < d; i++) {
cout << ds[i];
if(i < d-1) cout << ",";
}
cout << endl;
delete[] ds;
}
/**
* Routune for calculating the elements of clDCs up to r = 16 using the
* technique of Colbourn and Ling.
*
* See http://citeseer.ist.psu.edu/211575.html
*/
template <typename T>
void calcColbournAndLingDCs(bool verbose = false, bool sanityCheck = false) {
for(T r = 0; r < 16; r++) {
T maxv = 24*r*r + 36*r + 13; // Corollary 2.3
T numsamp = 6*r + 4;
clDCs[r].maxV = maxv;
clDCs[r].numSamples = numsamp;
memset(clDCs[r].samples, 0, 4 * 128);
T i;
// clDCs[r].samples[0] = 0;
// Fill in the 1^r part of the B series
for(i = 1; i < r+1; i++) {
clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
}
// Fill in the (r + 1)^1 part
clDCs[r].samples[r+1] = clDCs[r].samples[r] + r + 1;
// Fill in the (2r + 1)^r part
for(i = r+2; i < r+2+r; i++) {
clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 1;
}
// Fill in the (4r + 3)^(2r + 1) part
for(i = r+2+r; i < r+2+r+2*r+1; i++) {
clDCs[r].samples[i] = clDCs[r].samples[i-1] + 4*r + 3;
}
// Fill in the (2r + 2)^(r + 1) part
for(i = r+2+r+2*r+1; i < r+2+r+2*r+1+r+1; i++) {
clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 2;
}
// Fill in the last 1^r part
for(i = r+2+r+2*r+1+r+1; i < r+2+r+2*r+1+r+1+r; i++) {
clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
}
assert_eq(i, numsamp);
assert_lt(i, 128);
if(sanityCheck) {
// diffs[] records all the differences observed
bool *diffs = new bool[maxv];
for(T i = 0; i < numsamp; i++) {
for(T j = i+1; j < numsamp; j++) {
T d1 = (clDCs[r].samples[j] - clDCs[r].samples[i]);
T d2 = (clDCs[r].samples[i] + maxv - clDCs[r].samples[j]);
assert_lt(d1, maxv);
assert_lt(d2, maxv);
diffs[d1] = true;
diffs[d2] = true;
}
}
// Should have observed all possible differences (except 0)
for(T i = 1; i < maxv; i++) {
if(diffs[i] == false) cout << r << ", " << i << endl;
assert(diffs[i] == true);
}
delete[] diffs;
}
}
clDCs_calced = true;
}
/**
* Entries 4-57 are transcribed from page 6 of Luk and Wong's paper
* "Two New Quorum Based Algorithms for Distributed Mutual Exclusion",
* which is also used and cited in the Burkhardt and Karkkainen's
* papers on difference covers for sorting. These samples are optimal
* according to Luk and Wong.
*
* All other entries are generated via the exhaustive algorithm in
* calcExhaustiveDC().
*
* The 0 is stored at the end of the sample as an end-of-list marker,
* but 0 is also an element of each.
*
* Note that every difference cover has a 0 and a 1. Intuitively,
* any optimal difference cover sample can be oriented (i.e. rotated)
* such that it includes 0 and 1 as elements.
*
* All samples in this list have been verified to be complete covers.
*
* A value of 0xffffffff in the first column indicates that there is no
* sample for that value of v. We do not keep samples for values of v
* less than 3, since they are trivial (and the caller probably didn't
* mean to ask for it).
*/
static uint32_t dc0to64[65][10] = {
{0xffffffff}, // 0
{0xffffffff}, // 1
{0xffffffff}, // 2
{1, 0}, // 3
{1, 2, 0}, // 4
{1, 2, 0}, // 5
{1, 3, 0}, // 6
{1, 3, 0}, // 7
{1, 2, 4, 0}, // 8
{1, 2, 4, 0}, // 9
{1, 2, 5, 0}, // 10
{1, 2, 5, 0}, // 11
{1, 3, 7, 0}, // 12
{1, 3, 9, 0}, // 13
{1, 2, 3, 7, 0}, // 14
{1, 2, 3, 7, 0}, // 15
{1, 2, 5, 8, 0}, // 16
{1, 2, 4, 12, 0}, // 17
{1, 2, 5, 11, 0}, // 18
{1, 2, 6, 9, 0}, // 19
{1, 2, 3, 6, 10, 0}, // 20
{1, 4, 14, 16, 0}, // 21
{1, 2, 3, 7, 11, 0}, // 22
{1, 2, 3, 7, 11, 0}, // 23
{1, 2, 3, 7, 15, 0}, // 24
{1, 2, 3, 8, 12, 0}, // 25
{1, 2, 5, 9, 15, 0}, // 26
{1, 2, 5, 13, 22, 0}, // 27
{1, 4, 15, 20, 22, 0}, // 28
{1, 2, 3, 4, 9, 14, 0}, // 29
{1, 2, 3, 4, 9, 19, 0}, // 30
{1, 3, 8, 12, 18, 0}, // 31
{1, 2, 3, 7, 11, 19, 0}, // 32
{1, 2, 3, 6, 16, 27, 0}, // 33
{1, 2, 3, 7, 12, 20, 0}, // 34
{1, 2, 3, 8, 12, 21, 0}, // 35
{1, 2, 5, 12, 14, 20, 0}, // 36
{1, 2, 4, 10, 15, 22, 0}, // 37
{1, 2, 3, 4, 8, 14, 23, 0}, // 38
{1, 2, 4, 13, 18, 33, 0}, // 39
{1, 2, 3, 4, 9, 14, 24, 0}, // 40
{1, 2, 3, 4, 9, 15, 25, 0}, // 41
{1, 2, 3, 4, 9, 15, 25, 0}, // 42
{1, 2, 3, 4, 10, 15, 26, 0}, // 43
{1, 2, 3, 6, 16, 27, 38, 0}, // 44
{1, 2, 3, 5, 12, 18, 26, 0}, // 45
{1, 2, 3, 6, 18, 25, 38, 0}, // 46
{1, 2, 3, 5, 16, 22, 40, 0}, // 47
{1, 2, 5, 9, 20, 26, 36, 0}, // 48
{1, 2, 5, 24, 33, 36, 44, 0}, // 49
{1, 3, 8, 17, 28, 32, 38, 0}, // 50
{1, 2, 5, 11, 18, 30, 38, 0}, // 51
{1, 2, 3, 4, 6, 14, 21, 30, 0}, // 52
{1, 2, 3, 4, 7, 21, 29, 44, 0}, // 53
{1, 2, 3, 4, 9, 15, 21, 31, 0}, // 54
{1, 2, 3, 4, 6, 19, 26, 47, 0}, // 55
{1, 2, 3, 4, 11, 16, 33, 39, 0}, // 56
{1, 3, 13, 32, 36, 43, 52, 0}, // 57
// Generated by calcExhaustiveDC()
{1, 2, 3, 7, 21, 33, 37, 50, 0}, // 58
{1, 2, 3, 6, 13, 21, 35, 44, 0}, // 59
{1, 2, 4, 9, 15, 25, 30, 42, 0}, // 60
{1, 2, 3, 7, 15, 25, 36, 45, 0}, // 61
{1, 2, 4, 10, 32, 39, 46, 51, 0}, // 62
{1, 2, 6, 8, 20, 38, 41, 54, 0}, // 63
{1, 2, 5, 14, 16, 34, 42, 59, 0} // 64
};
/**
* Get a difference cover for the requested periodicity v.
*/
template <typename T>
static String<T> getDiffCover(T v,
bool verbose = false,
bool sanityCheck = false)
{
assert_gt(v, 2);
String<T> ret;
// Can we look it up in our hardcoded array?
if(v <= 64 && dc0to64[v][0] == 0xffffffff) {
if(verbose) cout << "v in hardcoded area, but hardcoded entry was all-fs" << endl;
return ret;
} else if(v <= 64) {
append(ret, 0);
for(size_t i = 0; i < 10; i++) {
if(dc0to64[v][i] == 0) break;
append(ret, dc0to64[v][i]);
}
if(sanityCheck) assert(dcRepOk(v, ret));
return ret;
}
// Can we look it up in our calcColbournAndLingDCs array?
if(!clDCs_calced) {
calcColbournAndLingDCs<uint32_t>(verbose, sanityCheck);
assert(clDCs_calced);
}
for(size_t i = 0; i < 16; i++) {
if(v <= clDCs[i].maxV) {
for(size_t j = 0; j < clDCs[i].numSamples; j++) {
T s = clDCs[i].samples[j];
if(s >= v) {
s %= v;
for(size_t k = 0; k < length(ret); k++) {
if(s == ret[k]) break;
if(s < ret[k]) {
insertValue(ret, k, s);
break;
}
}
} else {
append(ret, s % v);
}
}
if(sanityCheck) assert(dcRepOk(v, ret));
return ret;
}
}
cerr << "Error: Could not find a difference cover sample for v=" << v << endl;
throw 1;
}
/**
* Calculate and return a delta map based on the given difference cover
* and periodicity v.
*/
template <typename T>
static String<T> getDeltaMap(T v, const String<T>& dc) {
// Declare anchor-map-related items
String<T> amap;
size_t amapEnts = 1;
fill(amap, v, 0xffffffff, Exact());
amap[0] = 0;
// Print out difference cover (and optionally calculate
// anchor map)
for(size_t i = 0; i < length(dc); i++) {
for(size_t j = i+1; j < length(dc); j++) {
assert_gt(dc[j], dc[i]);
T diffLeft = dc[j] - dc[i];
T diffRight = dc[i] + v - dc[j];
assert_lt(diffLeft, v);
assert_lt(diffRight, v);
if(amap[diffLeft] == 0xffffffff) {
amap[diffLeft] = dc[i];
amapEnts++;
}
if(amap[diffRight] == 0xffffffff) {
amap[diffRight] = dc[j];
amapEnts++;
}
}
}
return amap;
}
/**
* Return population count (count of all bits set to 1) of i.
*/
template<typename T>
static unsigned int popCount(T i) {
unsigned int cnt = 0;
for(size_t j = 0; j < sizeof(T)*8; j++) {
if(i & 1) cnt++;
i >>= 1;
}
return cnt;
}
/**
* Calculate log-base-2 of i
*/
template<typename T>
static unsigned int myLog2(T i) {
assert_eq(1, popCount(i)); // must be power of 2
for(size_t j = 0; j < sizeof(T)*8; j++) {
if(i & 1) return (int)j;
i >>= 1;
}
assert(false);
return 0xffffffff;
}
/**
*
*/
template<typename TStr>
class DifferenceCoverSample {
public:
DifferenceCoverSample(const TStr& __text,
uint32_t __v,
bool __verbose = false,
bool __sanity = false,
ostream& __logger = cout) :
_text(__text),
_v(__v),
_verbose(__verbose),
_sanity(__sanity),
_ds(getDiffCover(_v, _verbose, _sanity)),
_dmap(getDeltaMap(_v, _ds)),
_d((uint32_t)length(_ds)),
_doffs(),
_isaPrime(),
_dInv(),
_log2v(myLog2(_v)),
_vmask(OFF_MASK << _log2v),
_logger(__logger)
{
assert_gt(_d, 0);
assert_eq(1, popCount(_v)); // must be power of 2
// Build map from d's to idx's
fill(_dInv, _v, 0xffffffff, Exact());
uint32_t lim = (uint32_t)length(_ds);
for(uint32_t i = 0; i < lim; i++) {
_dInv[_ds[i]] = i;
}
}
/**
* Allocate an amount of memory that simulates the peak memory
* usage of the DifferenceCoverSample with the given text and v.
* Throws bad_alloc if it's not going to fit in memory. Returns
* the approximate number of bytes the Cover takes at all times.
*/
static size_t simulateAllocs(const TStr& text, uint32_t v) {
String<uint32_t> ds = getDiffCover(v, false /*verbose*/, false /*sanity*/);
size_t len = length(text);
size_t sPrimeSz = (len / v) * length(ds);
// sPrime, sPrimeOrder, _isaPrime all exist in memory at
// once and that's the peak
AutoArray<TIndexOffU> aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/));
return sPrimeSz * 4; // sPrime array
}
uint32_t v() const { return _v; }
uint32_t log2v() const { return _log2v; }
uint32_t vmask() const { return _vmask; }
uint32_t modv(TIndexOffU i) const { return (uint32_t)(i & ~_vmask); }
TIndexOffU divv(TIndexOffU i) const { return i >> _log2v; }
uint32_t d() const { return _d; }
bool verbose() const { return _verbose; }
bool sanityCheck() const { return _sanity; }
const TStr& text() const { return _text; }
const String<uint32_t>& ds() const { return _ds; }
const String<uint32_t>& dmap() const { return _dmap; }
ostream& log() const { return _logger; }
void build(int nthreads);
uint32_t tieBreakOff(TIndexOffU i, TIndexOffU j) const;
int64_t breakTie(TIndexOffU i, TIndexOffU j) const;
bool isCovered(TIndexOffU i) const;
TIndexOffU rank(TIndexOffU i) const;
/**
* Print out the suffix array such that every sample offset has its
* rank filled in and every non-sample offset is shown as '-'.
*/
void print(ostream& out) {
for(size_t i = 0; i < length(_text); i++) {
if(isCovered(i)) {
out << rank(i);
} else {
out << "-";
}
if(i < length(_text)-1) {
out << ",";
}
}
out << endl;
}
private:
void doBuiltSanityCheck() const;
void buildSPrime(String<TIndexOffU>& sPrime, size_t padding);
bool built() const {
return length(_isaPrime) > 0;
}
void verbose(const string& s) const {
if(this->verbose()) {
this->log() << s.c_str();
this->log().flush();
}
}
const TStr& _text; // text to sample
uint32_t _v; // periodicity of sample
bool _verbose; //
bool _sanity; //
String<uint32_t> _ds; // samples: idx -> d
String<uint32_t> _dmap; // delta map
uint32_t _d; // |D| - size of sample
String<TIndexOffU> _doffs; // offsets into sPrime/isaPrime for each d idx
String<TIndexOffU> _isaPrime; // ISA' array
String<uint32_t> _dInv; // Map from d -> idx
uint32_t _log2v;
TIndexOffU _vmask;
ostream& _logger;
};
/**
* Return true iff suffixes with offsets suf1 and suf2 out of host
* string 'host' are identical up to depth 'v'.
*/
template <typename TStr>
static inline bool suffixLt(const TStr& host, TIndexOffU suf1, TIndexOffU suf2) {
TIndexOffU hlen = (TIndexOffU)length(host);
assert_neq(suf1, suf2);
TIndexOffU i = 0;
while(suf1 + i < hlen && suf2 + i < hlen) {
if(host[suf1+i] < host[suf2+i]) return true;
if(host[suf1+i] > host[suf2+i]) return false;
i++;
}
if(suf1 + i == hlen) {
assert_lt(suf2 + i, hlen);
return false;
}
assert_eq(suf2 + i, hlen);
return true;
}
/**
* Sanity-check the difference cover by first inverting _isaPrime then
* checking that each successive suffix really is less than the next.
*/
template <typename TStr>
void DifferenceCoverSample<TStr>::doBuiltSanityCheck() const {
uint32_t v = this->v();
assert(built());
VMSG_NL(" Doing sanity check");
TIndexOffU added = 0;
String<TIndexOffU> sorted;
fill(sorted, length(_isaPrime), OFF_MASK, Exact());
for(size_t di = 0; di < this->d(); di++) {
uint32_t d = _ds[di];
size_t i = 0;
for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) {
assert_eq(OFF_MASK, sorted[_isaPrime[doi]]);
// Maps the offset of the suffix to its rank
sorted[_isaPrime[doi]] = (TIndexOffU)(v*i + d);
added++;
}
}
assert_eq(added, length(_isaPrime));
for(size_t i = 0; i < length(sorted)-1; i++) {
assert(suffixLt(this->text(), sorted[i], sorted[i+1]));
}
}
/**
* Build the s' array by sampling suffixes (suffix offsets, actually)
* from t according to the difference-cover sample and pack them into
* an array of machine words in the order dictated by the "mu" mapping
* described in Burkhardt.
*
* Also builds _doffs map.
*/
template <typename TStr>
void DifferenceCoverSample<TStr>::buildSPrime(
String<TIndexOffU>& sPrime,
size_t padding)
{
const TStr& t = this->text();
const String<uint32_t>& ds = this->ds();
TIndexOffU tlen = (TIndexOffU)length(t);
uint32_t v = this->v();
uint32_t d = this->d();
assert_gt(v, 2);
assert_lt(d, v);
// Record where each d section should begin in sPrime
TIndexOffU tlenDivV = this->divv(tlen);
uint32_t tlenModV = this->modv(tlen);
TIndexOffU sPrimeSz = 0;
assert(empty(_doffs));
reserve(_doffs, d+1, Exact());
assert_eq(capacity(_doffs), d+1);
for(uint32_t di = 0; di < d; di++) {
// mu mapping
TIndexOffU sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0);
assert_geq(sz, 0);
appendValue(_doffs, sPrimeSz);
sPrimeSz += sz;
}
appendValue(_doffs, sPrimeSz);
#ifndef NDEBUG
if(tlenDivV > 0) {
for(size_t i = 0; i < d; i++) {
assert_gt(_doffs[i+1], _doffs[i]);
TIndexOffU diff = _doffs[i+1] - _doffs[i];
assert(diff == tlenDivV || diff == tlenDivV+1);
}
}
#endif
assert_eq(length(_doffs), d+1);
// Size sPrime appropriately
reserve(sPrime, sPrimeSz + padding, Exact()); // reserve extra slot for LS
fill(sPrime, sPrimeSz + padding, OFF_MASK, Exact());
// Slot suffixes from text into sPrime according to the mu
// mapping; where the mapping would leave a blank, insert a 0
TIndexOffU added = 0;
TIndexOffU i = 0;
for(uint64_t ti = 0; ti <= tlen; ti += v) {
for(uint32_t di = 0; di < d; di++) {
TIndexOffU tti = (TIndexOffU)(ti + ds[di]);
if(tti > tlen) break;
TIndexOffU spi = _doffs[di] + i;
assert_lt(spi, _doffs[di+1]);
assert_leq(tti, tlen);
assert_lt(spi, sPrimeSz);
assert_eq(OFF_MASK, sPrime[spi]);
sPrime[spi] = tti; added++;
}
i++;
}
assert_eq(added, sPrimeSz);
}
/**
* Return true iff suffixes with offsets suf1 and suf2 out of host
* string 'host' are identical up to depth 'v'.
*/
template <typename TStr>
static inline bool suffixSameUpTo(
const TStr& host,
TIndexOffU suf1,
TIndexOffU suf2,
TIndexOffU v)
{
for(TIndexOffU i = 0; i < v; i++) {
bool endSuf1 = suf1+i >= length(host);
bool endSuf2 = suf2+i >= length(host);
if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false;
if(endSuf1 && endSuf2) return true;
if(host[suf1+i] != host[suf2+i]) return false;
}
return true;
}
template<typename TStr>
struct VSortingParam {
DifferenceCoverSample<TStr>* dcs;
TIndexOffU* sPrimeArr;
size_t sPrimeSz;
TIndexOffU* sPrimeOrderArr;
size_t depth;
const std::vector<size_t>* boundaries;
size_t* cur;
MUTEX_T* mutex;
};
template<typename TStr>
#ifdef WITH_TBB
class VSorting_worker {
void *vp;
public:
VSorting_worker(const VSorting_worker& W): vp(W.vp) {};
VSorting_worker(void *vp_):vp(vp_) {};
void operator()() const
{
#else
static void VSorting_worker(void *vp)
{
#endif
VSortingParam<TStr>* param = (VSortingParam<TStr>*)vp;
DifferenceCoverSample<TStr>* dcs = param->dcs;
const TStr& host = dcs->text();
const size_t hlen = length(host);
uint32_t v = dcs->v();
while(true) {
size_t cur = 0;
{
ThreadSafe ts(param->mutex);
cur = *(param->cur);
(*param->cur)++;
}
if(cur >= param->boundaries->size()) return;
size_t begin = (cur == 0 ? 0 : (*param->boundaries)[cur-1]);
size_t end = (*param->boundaries)[cur];
assert_leq(begin, end);
if(end - begin <= 1) continue;
mkeyQSortSuf2(
host,
hlen,
param->sPrimeArr,
param->sPrimeSz,
param->sPrimeOrderArr,
4,
begin,
end,
param->depth,
v);
}
}
#ifdef WITH_TBB
};
#endif
/**
* Calculates a ranking of all suffixes in the sample and stores them,
* packed according to the mu mapping, in _isaPrime.
*/
template <typename TStr>
void DifferenceCoverSample<TStr>::build(int nthreads) {
// Local names for relevant types
typedef typename Value<TStr>::Type TAlphabet;
VMSG_NL("Building DifferenceCoverSample");
// Local names for relevant data
const TStr& t = this->text();
uint32_t v = this->v();
assert_gt(v, 2);
// Build s'
size_t padding = 1;
String<TIndexOffU> sPrime;
VMSG_NL(" Building sPrime");
buildSPrime(sPrime, padding);
size_t sPrimeSz = length(sPrime) - padding;
assert_gt(length(sPrime), 0);
assert_leq(length(sPrime), length(t)+1); // +1 is because of the end-cap
TIndexOffU nextRank = 0;
{
VMSG_NL(" Building sPrimeOrder");
String<TIndexOffU> sPrimeOrder;
reserve(sPrimeOrder, length(sPrime)+1, Exact()); // reserve extra slot for LS
resize(sPrimeOrder, length(sPrime), Exact());
for(TIndexOffU i = 0; i < (TIndexOffU)length(sPrimeOrder); i++) {
sPrimeOrder[i] = i;
}
// sPrime now holds suffix-offsets for DC samples.
{
Timer timer(cout, " V-Sorting samples time: ", this->verbose());
VMSG_NL(" V-Sorting samples");
// Extract backing-store array from sPrime and sPrimeOrder;
// the mkeyQSortSuf2 routine works on the array for maximum
// efficiency
TIndexOffU *sPrimeArr = (TIndexOffU*)begin(sPrime);
assert_eq(sPrimeArr[0], sPrime[0]);
assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]);
TIndexOffU *sPrimeOrderArr = (TIndexOffU*)begin(sPrimeOrder);
assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]);
TIndexOffU *sOrig = NULL;
if(this->sanityCheck()) {
sOrig = new TIndexOffU[sPrimeSz];
memcpy(sOrig, sPrimeArr, OFF_SIZE * sPrimeSz);
}
// Sort sample suffixes up to the vth character using a
// multikey quicksort. Sort time is proportional to the
// number of samples times v. It isn't quadratic.
// sPrimeOrder is passed in as a swapping partner for
// sPrimeArr, i.e., every time the multikey qsort swaps
// elements in sPrime, it swaps the same elements in
// sPrimeOrder too. This allows us to easily reconstruct
// what the sort did.
if(nthreads == 1) {
mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4,
this->verbose(), this->sanityCheck(), v);
} else {
int query_depth = 0;
int tmp_nthreads = nthreads;
while(tmp_nthreads > 0) {
query_depth++;
tmp_nthreads >>= 1;
}
std::vector<size_t> boundaries; // bucket boundaries for parallelization
mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4,
this->verbose(), false, query_depth, &boundaries);
if(boundaries.size() > 0) {
#ifdef WITH_TBB
tbb::task_group tbb_grp;
#else
AutoArray<tthread::thread*> threads(nthreads);
#endif
std::vector<VSortingParam<TStr> > tparams;
tparams.resize(nthreads);
size_t cur = 0;
MUTEX_T mutex;
for(int tid = 0; tid < nthreads; tid++) {
// Calculate bucket sizes by doing a binary search for each
// suffix and noting where it lands
tparams[tid].dcs = this;
tparams[tid].sPrimeArr = sPrimeArr;
tparams[tid].sPrimeSz = sPrimeSz;
tparams[tid].sPrimeOrderArr = sPrimeOrderArr;
tparams[tid].depth = query_depth;
tparams[tid].boundaries = &boundaries;
tparams[tid].cur = &cur;
tparams[tid].mutex = &mutex;
#ifdef WITH_TBB
tbb_grp.run(VSorting_worker<TStr>(((void*)&tparams[tid])));
}
tbb_grp.wait();
#else
threads[tid] = new tthread::thread(VSorting_worker<TStr>, (void*)&tparams[tid]);
}
for (int tid = 0; tid < nthreads; tid++) {
threads[tid]->join();
}
#endif
}
}
if(this->sanityCheck()) {
sanityCheckOrderedSufs(t, length(t), sPrimeArr, sPrimeSz, v);
for(size_t i = 0; i < sPrimeSz; i++) {
assert_eq(sPrimeArr[i], sOrig[sPrimeOrderArr[i]]);
}
delete[] sOrig;
}
// Make sure sPrime and sPrimeOrder are consistent with
// their respective backing-store arrays
assert_eq(sPrimeArr[0], sPrime[0]);
assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]);
assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]);
}
// Now assign the ranking implied by the sorted sPrime/sPrimeOrder
// arrays back into sPrime.
VMSG_NL(" Allocating rank array");
reserve(_isaPrime, length(sPrime), Exact());
fill(_isaPrime, length(sPrime), OFF_MASK, Exact());
assert_gt(length(_isaPrime), 0);
{
Timer timer(cout, " Ranking v-sort output time: ", this->verbose());
VMSG_NL(" Ranking v-sort output");
for(size_t i = 0; i < sPrimeSz-1; i++) {
// Place the appropriate ranking
_isaPrime[sPrimeOrder[i]] = nextRank;
// If sPrime[i] and sPrime[i+1] are identical up to v, then we
// should give the next suffix the same rank
if(!suffixSameUpTo(t, sPrime[i], sPrime[i+1], v)) nextRank++;
}
_isaPrime[sPrimeOrder[sPrimeSz-1]] = nextRank; // finish off
}
// sPrimeOrder is destroyed
// All the information we need is now in _isaPrime
}
#ifndef NDEBUG
// Check that all ranks are sane
for(size_t i = 0; i < sPrimeSz; i++) {
assert_neq(_isaPrime[i], OFF_MASK);
assert_lt(_isaPrime[i], sPrimeSz);
}
#endif
// Now pass the sPrimeRanks[] array to LarssonSadakane (in SeqAn).
_isaPrime[length(_isaPrime) - 1] = sPrimeSz;
sPrime[length(sPrime) - 1] = sPrimeSz;
{
Timer timer(cout, " Invoking Larsson-Sadakane on ranks time: ", this->verbose());
VMSG_NL(" Invoking Larsson-Sadakane on ranks");
_Context_LSS<TIndexOff> c;
c.suffixsort(
(TIndexOff*)begin(_isaPrime, Standard()),
(TIndexOff*)begin(sPrime, Standard()),
(TIndexOff)sPrimeSz,
(TIndexOff)length(sPrime),
0);
}
// sPrime now contains the suffix array (which we ignore)
assert_eq(length(_isaPrime), length(sPrime));
assert_gt(length(_isaPrime), 0);
// chop off final character of _isaPrime
resize(_isaPrime, sPrimeSz);
// Subtract 1 from each isaPrime (to adjust for LarssonSadakane
// always ranking the final suffix as lexicographically first)
for(size_t i = 0; i < length(_isaPrime); i++) {
_isaPrime[i]--;
}
VMSG_NL(" Sanity-checking and returning");
// done!
//if(this->verbose()) print(cout);
if(this->sanityCheck()) doBuiltSanityCheck();
}
/**
* Return true iff index i within the text is covered by the difference
* cover sample. Allow i to be off the end of the text; simplifies
* logic elsewhere.
*/
template <typename TStr>
bool DifferenceCoverSample<TStr>::isCovered(TIndexOffU i) const {
assert(built());
uint32_t modi = this->modv(i);
assert_lt(modi, length(_dInv));
return _dInv[modi] != 0xffffffff;
}
/**
* Given a text offset that's covered, return its lexicographical rank
* among the sample suffixes.
*/
template <typename TStr>
TIndexOffU DifferenceCoverSample<TStr>::rank(TIndexOffU i) const {
assert(built());
assert_lt(i, length(this->text()));
uint32_t imodv = this->modv(i);
assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample
TIndexOffU ioff = this->divv(i);
assert_lt(ioff, _doffs[_dInv[imodv]+1] - _doffs[_dInv[imodv]]);
TIndexOffU isaIIdx = _doffs[_dInv[imodv]] + ioff;
assert_lt(isaIIdx, length(_isaPrime));
TIndexOffU isaPrimeI = _isaPrime[isaIIdx];
assert_leq(isaPrimeI, length(_isaPrime));
return isaPrimeI;
}
/**
* Return: < 0 if suffix i is lexicographically less than suffix j; > 0
* if suffix j is lexicographically greater.
*/
template <typename TStr>
int64_t DifferenceCoverSample<TStr>::breakTie(TIndexOffU i, TIndexOffU j) const {
assert(built());
assert_neq(i, j);
assert_lt(i, length(this->text()));
assert_lt(j, length(this->text()));
uint32_t imodv = this->modv(i);
uint32_t jmodv = this->modv(j);
assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample
assert_neq(0xffffffff, _dInv[jmodv]); // must be in the sample
uint32_t dimodv = _dInv[imodv];
uint32_t djmodv = _dInv[jmodv];
TIndexOffU ioff = this->divv(i);
TIndexOffU joff = this->divv(j);
assert_lt(dimodv+1, length(_doffs));
assert_lt(djmodv+1, length(_doffs));
// assert_lt: expected (32024) < (0)
assert_lt(ioff, _doffs[dimodv+1] - _doffs[dimodv]);
assert_lt(joff, _doffs[djmodv+1] - _doffs[djmodv]);
TIndexOffU isaIIdx = _doffs[dimodv] + ioff;
TIndexOffU isaJIdx = _doffs[djmodv] + joff;
assert_lt(isaIIdx, length(_isaPrime));
assert_lt(isaJIdx, length(_isaPrime));
assert_neq(isaIIdx, isaJIdx); // ranks must be unique
TIndexOffU isaPrimeI = _isaPrime[isaIIdx];
TIndexOffU isaPrimeJ = _isaPrime[isaJIdx];
assert_neq(isaPrimeI, isaPrimeJ); // ranks must be unique
assert_leq(isaPrimeI, length(_isaPrime));
assert_leq(isaPrimeJ, length(_isaPrime));
return (int64_t)isaPrimeI - (int64_t)isaPrimeJ;
}
/**
* Given i, j, return the number of additional characters that need to
* be compared before the difference cover can break the tie.
*/
template <typename TStr>
uint32_t DifferenceCoverSample<TStr>::tieBreakOff(TIndexOffU i, TIndexOffU j) const {
const TStr& t = this->text();
const String<uint32_t>& dmap = this->dmap();
assert(built());
// It's actually convenient to allow this, but we're permitted to
// return nonsense in that case
if(t[i] != t[j]) return 0xffffffff;
//assert_eq(t[i], t[j]); // if they're unequal, there's no tie to break
uint32_t v = this->v();
assert_neq(i, j);
assert_lt(i, length(t));
assert_lt(j, length(t));
uint32_t imod = this->modv(i);
uint32_t jmod = this->modv(j);
uint32_t diffLeft = (jmod >= imod)? (jmod - imod) : (jmod + v - imod);
uint32_t diffRight = (imod >= jmod)? (imod - jmod) : (imod + v - jmod);
assert_lt(diffLeft, length(dmap));
assert_lt(diffRight, length(dmap));
uint32_t destLeft = dmap[diffLeft]; // offset where i needs to be
uint32_t destRight = dmap[diffRight]; // offset where i needs to be
assert(isCovered(destLeft));
assert(isCovered(destLeft+diffLeft));
assert(isCovered(destRight));
assert(isCovered(destRight+diffRight));
assert_lt(destLeft, v);
assert_lt(destRight, v);
uint32_t deltaLeft = (destLeft >= imod)? (destLeft - imod) : (destLeft + v - imod);
if(deltaLeft == v) deltaLeft = 0;
uint32_t deltaRight = (destRight >= jmod)? (destRight - jmod) : (destRight + v - jmod);
if(deltaRight == v) deltaRight = 0;
assert_lt(deltaLeft, v);
assert_lt(deltaRight, v);
assert(isCovered(i+deltaLeft));
assert(isCovered(j+deltaLeft));
assert(isCovered(i+deltaRight));
assert(isCovered(j+deltaRight));
return min(deltaLeft, deltaRight);
}
#endif /*DIFF_SAMPLE_H_*/
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/Velcon-Zheng/bowtie.git
git@gitee.com:Velcon-Zheng/bowtie.git
Velcon-Zheng
bowtie
bowtie
master

搜索帮助