代码拉取完成,页面将自动刷新
同步操作将从 src-openEuler/redis 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
From 1e9a7748716e1cd234893dd858d07ffa77920e41 Mon Sep 17 00:00:00 2001
From: Otmar Ertl <otmar.ertl@gmail.com>
Date: Sat, 10 Mar 2018 20:13:21 +0100
Subject: [PATCH] improved HyperLogLog cardinality estimation
based on method described in https://arxiv.org/abs/1702.01284
that does not rely on any magic constants
---
src/hyperloglog.c | 230 +++++++++++++++++++++++-----------------------
1 file changed, 117 insertions(+), 113 deletions(-)
diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index 8ab9d2a30c0..7f5f62445c9 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -192,6 +192,7 @@ struct hllhdr {
#define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
#define HLL_P 14 /* The greater is P, the smaller the error. */
+#define HLL_Q (63-HLL_P)
#define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
#define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
#define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
@@ -510,13 +511,9 @@ int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
return hllDenseSet(registers,index,count);
}
-/* Compute SUM(2^-reg) in the dense representation.
- * PE is an array with a pre-computer table of values 2^-reg indexed by reg.
- * As a side effect the integer pointed by 'ezp' is set to the number
- * of zero registers. */
-double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
- double E = 0;
- int j, ez = 0;
+/* Compute the register histogram in the dense representation. */
+void hllDenseRegHisto(uint8_t *registers, int* regHisto) {
+ int j;
/* Redis default is to use 16384 registers 6 bits each. The code works
* with other values by modifying the defines, but for our target value
@@ -527,47 +524,49 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
r10, r11, r12, r13, r14, r15;
for (j = 0; j < 1024; j++) {
/* Handle 16 registers per iteration. */
- r0 = r[0] & 63; if (r0 == 0) ez++;
- r1 = (r[0] >> 6 | r[1] << 2) & 63; if (r1 == 0) ez++;
- r2 = (r[1] >> 4 | r[2] << 4) & 63; if (r2 == 0) ez++;
- r3 = (r[2] >> 2) & 63; if (r3 == 0) ez++;
- r4 = r[3] & 63; if (r4 == 0) ez++;
- r5 = (r[3] >> 6 | r[4] << 2) & 63; if (r5 == 0) ez++;
- r6 = (r[4] >> 4 | r[5] << 4) & 63; if (r6 == 0) ez++;
- r7 = (r[5] >> 2) & 63; if (r7 == 0) ez++;
- r8 = r[6] & 63; if (r8 == 0) ez++;
- r9 = (r[6] >> 6 | r[7] << 2) & 63; if (r9 == 0) ez++;
- r10 = (r[7] >> 4 | r[8] << 4) & 63; if (r10 == 0) ez++;
- r11 = (r[8] >> 2) & 63; if (r11 == 0) ez++;
- r12 = r[9] & 63; if (r12 == 0) ez++;
- r13 = (r[9] >> 6 | r[10] << 2) & 63; if (r13 == 0) ez++;
- r14 = (r[10] >> 4 | r[11] << 4) & 63; if (r14 == 0) ez++;
- r15 = (r[11] >> 2) & 63; if (r15 == 0) ez++;
-
- /* Additional parens will allow the compiler to optimize the
- * code more with a loss of precision that is not very relevant
- * here (floating point math is not commutative!). */
- E += (PE[r0] + PE[r1]) + (PE[r2] + PE[r3]) + (PE[r4] + PE[r5]) +
- (PE[r6] + PE[r7]) + (PE[r8] + PE[r9]) + (PE[r10] + PE[r11]) +
- (PE[r12] + PE[r13]) + (PE[r14] + PE[r15]);
+ r0 = r[0] & 63;
+ r1 = (r[0] >> 6 | r[1] << 2) & 63;
+ r2 = (r[1] >> 4 | r[2] << 4) & 63;
+ r3 = (r[2] >> 2) & 63;
+ r4 = r[3] & 63;
+ r5 = (r[3] >> 6 | r[4] << 2) & 63;
+ r6 = (r[4] >> 4 | r[5] << 4) & 63;
+ r7 = (r[5] >> 2) & 63;
+ r8 = r[6] & 63;
+ r9 = (r[6] >> 6 | r[7] << 2) & 63;
+ r10 = (r[7] >> 4 | r[8] << 4) & 63;
+ r11 = (r[8] >> 2) & 63;
+ r12 = r[9] & 63;
+ r13 = (r[9] >> 6 | r[10] << 2) & 63;
+ r14 = (r[10] >> 4 | r[11] << 4) & 63;
+ r15 = (r[11] >> 2) & 63;
+
+ regHisto[r0] += 1;
+ regHisto[r1] += 1;
+ regHisto[r2] += 1;
+ regHisto[r3] += 1;
+ regHisto[r4] += 1;
+ regHisto[r5] += 1;
+ regHisto[r6] += 1;
+ regHisto[r7] += 1;
+ regHisto[r8] += 1;
+ regHisto[r9] += 1;
+ regHisto[r10] += 1;
+ regHisto[r11] += 1;
+ regHisto[r12] += 1;
+ regHisto[r13] += 1;
+ regHisto[r14] += 1;
+ regHisto[r15] += 1;
+
r += 12;
}
} else {
- for (j = 0; j < HLL_REGISTERS; j++) {
+ for(j = 0; j < HLL_REGISTERS; j++) {
unsigned long reg;
-
HLL_DENSE_GET_REGISTER(reg,registers,j);
- if (reg == 0) {
- ez++;
- /* Increment E at the end of the loop. */
- } else {
- E += PE[reg]; /* Precomputed 2^(-reg[j]). */
- }
+ regHisto[reg] += 1;
}
- E += ez; /* Add 2^0 'ez' times. */
}
- *ezp = ez;
- return E;
}
/* ================== Sparse representation implementation ================= */
@@ -903,76 +902,96 @@ int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) {
return hllSparseSet(o,index,count);
}
-/* Compute SUM(2^-reg) in the sparse representation.
- * PE is an array with a pre-computer table of values 2^-reg indexed by reg.
- * As a side effect the integer pointed by 'ezp' is set to the number
- * of zero registers. */
-double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *invalid) {
- double E = 0;
- int ez = 0, idx = 0, runlen, regval;
+/* Compute the register histogram in the sparse representation. */
+void hllSparseRegHisto(uint8_t *sparse, int sparselen, int *invalid, int* regHisto) {
+ int idx = 0, runlen, regval;
uint8_t *end = sparse+sparselen, *p = sparse;
while(p < end) {
if (HLL_SPARSE_IS_ZERO(p)) {
runlen = HLL_SPARSE_ZERO_LEN(p);
idx += runlen;
- ez += runlen;
- /* Increment E at the end of the loop. */
+ regHisto[0] += runlen;
p++;
} else if (HLL_SPARSE_IS_XZERO(p)) {
runlen = HLL_SPARSE_XZERO_LEN(p);
idx += runlen;
- ez += runlen;
- /* Increment E at the end of the loop. */
+ regHisto[0] += runlen;
p += 2;
} else {
runlen = HLL_SPARSE_VAL_LEN(p);
regval = HLL_SPARSE_VAL_VALUE(p);
idx += runlen;
- E += PE[regval]*runlen;
+ regHisto[regval] += runlen;
p++;
}
}
if (idx != HLL_REGISTERS && invalid) *invalid = 1;
- E += ez; /* Add 2^0 'ez' times. */
- *ezp = ez;
- return E;
}
/* ========================= HyperLogLog Count ==============================
* This is the core of the algorithm where the approximated count is computed.
- * The function uses the lower level hllDenseSum() and hllSparseSum() functions
- * as helpers to compute the SUM(2^-reg) part of the computation, which is
- * representation-specific, while all the rest is common. */
-
-/* Implements the SUM operation for uint8_t data type which is only used
- * internally as speedup for PFCOUNT with multiple keys. */
-double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
- double E = 0;
- int j, ez = 0;
+ * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto()
+ * functions as helpers to compute histogram of register values part of the
+ * computation, which is representation-specific, while all the rest is common. */
+
+/* Implements the register histogram calculation for uint8_t data type
+ * which is only used internally as speedup for PFCOUNT with multiple keys. */
+void hllRawRegHisto(uint8_t *registers, int* regHisto) {
uint64_t *word = (uint64_t*) registers;
uint8_t *bytes;
+ int j;
for (j = 0; j < HLL_REGISTERS/8; j++) {
if (*word == 0) {
- ez += 8;
+ regHisto[0] += 8;
} else {
bytes = (uint8_t*) word;
- if (bytes[0]) E += PE[bytes[0]]; else ez++;
- if (bytes[1]) E += PE[bytes[1]]; else ez++;
- if (bytes[2]) E += PE[bytes[2]]; else ez++;
- if (bytes[3]) E += PE[bytes[3]]; else ez++;
- if (bytes[4]) E += PE[bytes[4]]; else ez++;
- if (bytes[5]) E += PE[bytes[5]]; else ez++;
- if (bytes[6]) E += PE[bytes[6]]; else ez++;
- if (bytes[7]) E += PE[bytes[7]]; else ez++;
+ regHisto[bytes[0]] += 1;
+ regHisto[bytes[1]] += 1;
+ regHisto[bytes[2]] += 1;
+ regHisto[bytes[3]] += 1;
+ regHisto[bytes[4]] += 1;
+ regHisto[bytes[5]] += 1;
+ regHisto[bytes[6]] += 1;
+ regHisto[bytes[7]] += 1;
}
word++;
}
- E += ez; /* 2^(-reg[j]) is 1 when m is 0, add it 'ez' times for every
- zero register in the HLL. */
- *ezp = ez;
- return E;
+}
+
+/* Helper function sigma as defined in
+ * "New cardinality estimation algorithms for HyperLogLog sketches"
+ * Otmar Ertl, arXiv:1702.01284 */
+double hllSigma(double x) {
+ if (x == 1.) return INFINITY;
+ double zPrime;
+ double y = 1;
+ double z = x;
+ do {
+ x *= x;
+ zPrime = z;
+ z += x * y;
+ y += y;
+ } while(zPrime != z);
+ return z;
+}
+
+/* Helper function tau as defined in
+ * "New cardinality estimation algorithms for HyperLogLog sketches"
+ * Otmar Ertl, arXiv:1702.01284 */
+double hllTau(double x) {
+ if (x == 0. || x == 1.) return 0.;
+ double zPrime;
+ double y = 1.0;
+ double z = 1 - x;
+ do {
+ x = sqrt(x);
+ zPrime = z;
+ y *= 0.5;
+ z -= pow(1 - x, 2)*y;
+ } while(zPrime != z);
+ return z / 3;
}
/* Return the approximated cardinality of the set based on the harmonic
@@ -988,49 +1007,34 @@ double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
* keys (no need to work with 6-bit integers encoding). */
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
double m = HLL_REGISTERS;
- double E, alpha = 0.7213/(1+1.079/m);
- int j, ez; /* Number of registers equal to 0. */
-
- /* We precompute 2^(-reg[j]) in a small table in order to
- * speedup the computation of SUM(2^-register[0..i]). */
- static int initialized = 0;
- static double PE[64];
- if (!initialized) {
- PE[0] = 1; /* 2^(-reg[j]) is 1 when m is 0. */
- for (j = 1; j < 64; j++) {
- /* 2^(-reg[j]) is the same as 1/2^reg[j]. */
- PE[j] = 1.0/(1ULL << j);
- }
- initialized = 1;
- }
+ double E;
+ int j;
+ double alphaInf = 0.5 / log(2.);
+ int regHisto[HLL_Q+2] = {0};
- /* Compute SUM(2^-register[0..i]). */
+ /* Compute register histogram */
if (hdr->encoding == HLL_DENSE) {
- E = hllDenseSum(hdr->registers,PE,&ez);
+ hllDenseRegHisto(hdr->registers,regHisto);
} else if (hdr->encoding == HLL_SPARSE) {
- E = hllSparseSum(hdr->registers,
- sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid);
+ hllSparseRegHisto(hdr->registers,
+ sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,regHisto);
} else if (hdr->encoding == HLL_RAW) {
- E = hllRawSum(hdr->registers,PE,&ez);
+ hllRawRegHisto(hdr->registers,regHisto);
} else {
serverPanic("Unknown HyperLogLog encoding in hllCount()");
}
- /* Apply loglog-beta to the raw estimate. See:
- * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation
- * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung
- * arXiv:1612.02284 */
- double zl = log(ez + 1);
- double beta = -0.370393911*ez +
- 0.070471823*zl +
- 0.17393686*pow(zl,2) +
- 0.16339839*pow(zl,3) +
- -0.09237745*pow(zl,4) +
- 0.03738027*pow(zl,5) +
- -0.005384159*pow(zl,6) +
- 0.00042419*pow(zl,7);
-
- E = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
+ /* Estimate cardinality form register histogram. See:
+ * "New cardinality estimation algorithms for HyperLogLog sketches"
+ * Otmar Ertl, arXiv:1702.01284 */
+ double z = m * hllTau((m-regHisto[HLL_Q+1])/(double)m);
+ for (j = HLL_Q; j >= 1; --j) {
+ z += regHisto[j];
+ z *= 0.5;
+ }
+ z += m * hllSigma(regHisto[0]/(double)m);
+ E = llroundl(alphaInf*m*m/z);
+
return (uint64_t) E;
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。