Related
https://jsfiddle.net/2L4t9saq/180/ is my fiddle
most of the code is just useless, ill just post the stuff that matters
var baseConverter = function(r, e, n) {
var o = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
if (e <= 0 || e > o.length || n <= 0 || n > o.length) return console.log("Base unallowed"), null;
var l, t = 0;
if (10 != e) {
var a = r.length;
for (l = 0; l < a; l++) {
var u, f = -1;
for (u = 0; u < o.length; u++)
if (r[l] == o[u]) {
f = 1;
break
}
if (u >= e) return console.log("Symbol unallowed in baseform"), null;
if (-1 == f) return console.log("Symbol not found"), null;
var s = a - l - 1;
t += 0 == s ? u : u * Math.pow(e, s)
}
} else t = parseInt(r);
if (10 != n) {
for (var g = []; t > 0;) {
var i = t % n;
if (i < 0 || i >= o.length) return console.log("Out of bounds error"), null;
g.push(o[i]), t = parseInt(t / n)
}
return g.reverse().toString().replace(/,/g, "")
}
return t.toString()
}
var b36torgba = function(input) {
for (var i = 1; i < (input.length / 8) + 1; i++) {
var arr = input
var r = arr.charAt[0 + (i - 1) * 8] + "" + arr.charAt[1 + (i - 1) * 8]
var g = arr.charAt[2 + (i - 1) * 8] + "" + arr.charAt[3 + (i - 1) * 8]
console.log(g.charAt[2])
var b = arr.charAt[4 + (i - 1) * 8] + "" + arr.charAt[5 + (i - 1) * 8]
console.log(b)
var a = arr.charAt[6 + (i - 1) * 8] + "" + arr.charAt[7 + (i - 1) * 8]
console.log(a)
var rrgba = baseConverter(r, 36, 10)
var grgba = baseConverter(r, 36, 10)
var brgba = baseConverter(r, 36, 10)
var argba = baseConverter(r, 36, 10)
var bigMessOfAVariable = "rgba(" + rrgba + "," + grgba + "," + brgba + "," + argba + "),"
return bigMessOfAVariable;
}
}
you can ignore the top function, all it is is a base converter script, that takes in three inputs, an input, the base its in, and the base it should be converted to: eg baseConverter(73,36,10) will output 255.
now, the problem is with my b36torgba function.
it will take in a string, which is guaranteed to have a length that is either 0, 8, or a multiple of 8, this is just standardization to make sure everything runs smoothly, without having 700 indexOf[] functions.
it takes in the input, and divides it by 8, this tells the function how many bytes it has to go through, and how many it will spit out, so a string "[7300002S7300002S]" should (divided by 8) output 2, therefore the script runs 2 iterations.
currently, it should be taking in the string, and assigning each group of 2 characters (again standard) to a specific variable, this will allow it to all be put in the end and outputted as the same string but in base 10 rgba (hence 73 being use, 73 in base 36 is 255), but before it can do any of that, it breaks when it tries to find the characters in a string, saying this syntax error:
Uncaught TypeError: Cannot read property '0' of undefined
at b36torgba ((index):40)
at window.onload ((index):55)
why does it break as soon as it tries to feed the string into my charAt()'s?
ps: i do understand that the code in its current state, if it worked, it'd only output the rgba value of the last 8 characters
Easy mistake. You're using charAt (which is a function) by doing charAt[index] (using square brackets), rather than charAt(index) (using round brackets). Fixing that up should solve your issue.
Also - you're calling the function by doing b36torgba(["7300002S7300002S"]) in your JSFiddle, and trying to do string manipulation on it. Since ["7300002S7300002S"] is an array, not a string, .charAt() won't work on it. Try calling the function by doing b36torgba("7300002S7300002S") instead.
I have an problem where i have 3 times of the 24 hour day. To keep it simple i can use the decimal representation:
a) 23:45 (23.75)
b) 11:30 (11.50)
c) 00:15 (00.25)
I'd like to know , for each time, which other time is closest.
var closestTime = 24
var closestActualTime = 0;
for (var i = 0; i < times.length; i++) {
if (times[i].time == this.time) continue;
var temp = Math.abs(this.time - times[i].time)
if (temp < closestTime) {
closestTime = temp;
closestActualTime = times[i].time;
}
}
My issue is that 23:45 and 00:25 are actually really close but i don't know how process a variable with a modulo type
I suggest to build a list with the pairs and then calculate the difference.
The difference is the third element in the pairs array.
Basically you need to check the delta and if it greater than 12 hours, take the difference from 24 and delta.
delta = Math.abs(aa - bb);
if (delta > 12) {
delta = 24 - delta;
}
function combination(array, size) {
function c(part, start) {
var i, l, p;
for (i = start, l = array.length + part.length + 1 - size; i < l; i++) {
p = part.slice();
p.push(array[i]);
p.length < size ? c(p, i + 1) : result.push(p);
}
}
var result = [];
c([], 0);
return result;
}
function timeDelta(a, b) {
function decimalTime(s) {
var p = s.split(':');
return +p[0] + p[1] / 60;
}
function padZero(v) {
return (v < 10) ? '0' + v : String(v);
}
var aa = decimalTime(a),
bb = decimalTime(b),
delta = Math.abs(aa - bb);
if (delta > 12) {
delta = 24 - delta;
}
return padZero(Math.floor(delta)) + ':' + padZero(Math.round(60 * (delta - Math.floor(delta))));
}
var times = ['23:45', '11:30', '00:15'],
pairs = combination(times, 2);
pairs.forEach(function (a, i, aa) {
aa[i][2] = timeDelta(a[0], a[1]);
});
console.log(pairs);
.as-console-wrapper { max-height: 100% !important; top: 0; }
Loop over times.
Try combinations of delta time, offset by 24 hours.
Pick smallest delta time.
var times = [23.75, 11.50, 3, 6, 7];
/**
* timeClosestTo
*
* #param {number} time
* #returns {number}
*/
function timeClosestTo(time) {
//Distance variable to compare against
var distance = 100;
//Hours in a day
var day = 24;
//Current best
var best = null;
//Test against all times
for (var i = 0; i < times.length; i++) {
//Find best score based upon day
var d = Math.min(Math.abs((times[i]) - (time)), Math.abs((times[i] + day) - time), Math.abs((times[i]) - (time + day)), Math.abs((times[i] + day) - (time + day)));
//If best found distance yet, set best to current
if (d < distance) {
best = times[i];
distance = d;
}
}
//Return best
return best;
}
console.log("times to deal with:",times.join(", "));
console.log("closest to 1:", timeClosestTo(1), "closest to 11:", timeClosestTo(11), "closest to 5:", timeClosestTo(5));
Quite functionally i would do this job as follows
var times = [23.75,11.50,0.25],
diffs = times.reduce((d,t1,i,a) => a[i+1] ? d.concat(a.slice(i+1)
.map(t2 => [t1,t2,[Math.min(Math.abs(t1-t2),24-Math.abs(t1-t2))]
.map(n => ~~n + ":" + (n%1)*60)[0]]))
: d,[]);
console.log(diffs);
function div(num) {
while (num <= 9) {
var a = (num - (num % 10));
var b = (a / 10)
var c = 0
var digits = num.toString().split("")
for (i = digits.length - 1; i > 2; i--) {
c = digits[i]
}
num = b - (c * 2);
}
return num;
}
document.write(div(1234))
My code is supposed to take the number and get the last digit and subtract it from the rest of the number, and repeat the process until the answer is lesser or equal to 9. I made a while loop but I keep on getting the number I started with. What is wrong and how do I fix it?
Edit :
It is supposed to multiply the last digit by 2 and then subtract it from the rest of the number and then repeat the process until the answer is less than or equal to 9
while (num<=9)
This should be
while (num>=9)
As of now, the while loop will be executed only for a number that is less than 9.
Use while(num >= 9) instead of while(num <= 9).
function div(num) {
while (num >= 9) {
var a = (num - (num % 10));
var b = (a / 10)
var c = 0
var digits = num.toString().split("")
for (i = digits.length - 1; i > 2; i--) {
c = digits[i]
}
num = b - (c * 2);
}
return num;
}
document.write(div(1234));
So I have a random javascript array of names...
[#larry,#nicholas,#notch] etc.
They all start with the # symbol. I'd like to sort them by the Levenshtein Distance so that the the ones at the top of the list are closest to the search term. At the moment, I have some javascript that uses jQuery's .grep() on it using javascript .match() method around the entered search term on key press:
(code edited since first publish)
limitArr = $.grep(imTheCallback, function(n){
return n.match(searchy.toLowerCase())
});
modArr = limitArr.sort(levenshtein(searchy.toLowerCase(), 50))
if (modArr[0].substr(0, 1) == '#') {
if (atRes.childred('div').length < 6) {
modArr.forEach(function(i){
atRes.append('<div class="oneResult">' + i + '</div>');
});
}
} else if (modArr[0].substr(0, 1) == '#') {
if (tagRes.children('div').length < 6) {
modArr.forEach(function(i){
tagRes.append('<div class="oneResult">' + i + '</div>');
});
}
}
$('.oneResult:first-child').addClass('active');
$('.oneResult').click(function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
It also has some if statements detecting if the array contains hashtags (#) or mentions (#). Ignore that. The imTheCallback is the array of names, either hashtags or mentions, then modArr is the array sorted. Then the .atResults and .tagResults elements are the elements that it appends each time in the array to, this forms a list of names based on the entered search terms.
I also have the Levenshtein Distance algorithm:
var levenshtein = function(min, split) {
// Levenshtein Algorithm Revisited - WebReflection
try {
split = !("0")[0]
} catch(i) {
split = true
};
return function(a, b) {
if (a == b)
return 0;
if (!a.length || !b.length)
return b.length || a.length;
if (split) {
a = a.split("");
b = b.split("")
};
var len1 = a.length + 1,
len2 = b.length + 1,
I = 0,
i = 0,
d = [[0]],
c, j, J;
while (++i < len2)
d[0][i] = i;
i = 0;
while (++i < len1) {
J = j = 0;
c = a[I];
d[i] = [i];
while(++j < len2) {
d[i][j] = min(d[I][j] + 1, d[i][J] + 1, d[I][J] + (c != b[J]));
++J;
};
++I;
};
return d[len1 - 1][len2 - 1];
}
}(Math.min, false);
How can I work with algorithm (or a similar one) into my current code to sort it without bad performance?
UPDATE:
So I'm now using James Westgate's Lev Dist function. Works WAYYYY fast. So performance is solved, the issue now is using it with source...
modArr = limitArr.sort(function(a, b){
levDist(a, searchy)
levDist(b, searchy)
});
My problem now is general understanding on using the .sort() method. Help is appreciated, thanks.
Thanks!
I wrote an inline spell checker a few years ago and implemented a Levenshtein algorithm - since it was inline and for IE8 I did quite a lot of performance optimisation.
var levDist = function(s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (var i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (var i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (var j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
}
I came to this solution:
var levenshtein = (function() {
var row2 = [];
return function(s1, s2) {
if (s1 === s2) {
return 0;
} else {
var s1_len = s1.length, s2_len = s2.length;
if (s1_len && s2_len) {
var i1 = 0, i2 = 0, a, b, c, c2, row = row2;
while (i1 < s1_len)
row[i1] = ++i1;
while (i2 < s2_len) {
c2 = s2.charCodeAt(i2);
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1.charCodeAt(i1) === c2 ? 0 : 1);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
return b;
} else {
return s1_len + s2_len;
}
}
};
})();
See also http://jsperf.com/levenshtein-distance/12
Most speed was gained by eliminating some array usages.
Updated: http://jsperf.com/levenshtein-distance/5
The new Revision annihilates all other benchmarks. I was specifically chasing Chromium/Firefox performance as I don't have an IE8/9/10 test environment, but the optimisations made should apply in general to most browsers.
Levenshtein Distance
The matrix to perform Levenshtein Distance can be reused again and again. This was an obvious target for optimisation (but be careful, this now imposes a limit on string length (unless you were to resize the matrix dynamically)).
The only option for optimisation not pursued in jsPerf Revision 5 is memoisation. Depending on your use of Levenshtein Distance, this could help drastically but was omitted due to its implementation specific nature.
// Cache the matrix. Note this implementation is limited to
// strings of 64 char or less. This could be altered to update
// dynamically, or a larger value could be used.
var matrix = [];
for (var i = 0; i < 64; i++) {
matrix[i] = [i];
matrix[i].length = 64;
}
for (var i = 0; i < 64; i++) {
matrix[0][i] = i;
}
// Functional implementation of Levenshtein Distance.
String.levenshteinDistance = function(__this, that, limit) {
var thisLength = __this.length, thatLength = that.length;
if (Math.abs(thisLength - thatLength) > (limit || 32)) return limit || 32;
if (thisLength === 0) return thatLength;
if (thatLength === 0) return thisLength;
// Calculate matrix.
var this_i, that_j, cost, min, t;
for (i = 1; i <= thisLength; ++i) {
this_i = __this[i-1];
for (j = 1; j <= thatLength; ++j) {
// Check the jagged ld total so far
if (i === j && matrix[i][j] > 4) return thisLength;
that_j = that[j-1];
cost = (this_i === that_j) ? 0 : 1; // Chars already match, no ++op to count.
// Calculate the minimum (much faster than Math.min(...)).
min = matrix[i - 1][j ] + 1; // Deletion.
if ((t = matrix[i ][j - 1] + 1 ) < min) min = t; // Insertion.
if ((t = matrix[i - 1][j - 1] + cost) < min) min = t; // Substitution.
matrix[i][j] = min; // Update matrix.
}
}
return matrix[thisLength][thatLength];
};
Damerau-Levenshtein Distance
jsperf.com/damerau-levenshtein-distance
Damerau-Levenshtein Distance is a small modification to Levenshtein Distance to include transpositions. There is very little to optimise.
// Damerau transposition.
if (i > 1 && j > 1 && this_i === that[j-2] && this[i-2] === that_j
&& (t = matrix[i-2][j-2]+cost) < matrix[i][j]) matrix[i][j] = t;
Sorting Algorithm
The second part of this answer is to choose an appropriate sort function. I will upload optimised sort functions to http://jsperf.com/sort soon.
I implemented a very performant implementation of levenshtein distance calculation if you still need this.
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
It was my answer to a similar SO question
Fastest general purpose Levenshtein Javascript implementation
Update
A improved version of the above is now on github/npm see
https://github.com/gustf/js-levenshtein
The obvious way of doing this is to map each string to a (distance, string) pair, then sort this list, then drop the distances again. This way you ensure the levenstein distance only has to be computed once. Maybe merge duplicates first, too.
I would definitely suggest using a better Levenshtein method like the one in #James Westgate's answer.
That said, DOM manipulations are often a great expense. You can certainly improve your jQuery usage.
Your loops are rather small in the example above, but concatenating the generated html for each oneResult into a single string and doing one append at the end of the loop will be much more efficient.
Your selectors are slow. $('.oneResult') will search all elements in the DOM and test their className in older IE browsers. You may want to consider something like atRes.find('.oneResult') to scope the search.
In the case of adding the click handlers, we may want to do one better avoid setting handlers on every keyup. You could leverage event delegation by setting a single handler on atRest for all results in the same block you are setting the keyup handler:
atRest.on('click', '.oneResult', function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
See http://api.jquery.com/on/ for more info.
I just wrote an new revision: http://jsperf.com/levenshtein-algorithms/16
function levenshtein(a, b) {
if (a === b) return 0;
var aLen = a.length;
var bLen = b.length;
if (0 === aLen) return bLen;
if (0 === bLen) return aLen;
var len = aLen + 1;
var v0 = new Array(len);
var v1 = new Array(len);
var i = 0;
var j = 0;
var c2, min, tmp;
while (i < len) v0[i] = i++;
while (j < bLen) {
c2 = b.charAt(j++);
v1[0] = j;
i = 0;
while (i < aLen) {
min = v0[i] - (a.charAt(i) === c2 ? 1 : 0);
if (v1[i] < min) min = v1[i];
if (v0[++i] < min) min = v0[i];
v1[i] = min + 1;
}
tmp = v0;
v0 = v1;
v1 = tmp;
}
return v0[aLen];
}
This revision is faster than the other ones. Works even on IE =)
Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.
Here it is:
function LogLog(max_error, max_count)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
var m = 1.30 / max_error;
var k = Math.ceil(log2(m * m));
m = Math.pow(2, k);
var k_comp = 32 - k;
var l = log2(log2(max_count / m));
if (isNaN(l)) l = 1; else l = Math.ceil(l);
var l_mask = ((1 << l) - 1) >>> 0;
var M = [];
for (var i = 0; i < m; ++i) M[i] = 0;
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
var rank = 0;
for (var i = 0; i < k_comp; ++i)
{
if ((hash >>> i) & 1)
{
rank = i + 1;
break;
}
}
M[j] = Math.max(M[j], rank & l_mask);
}
else
{
var c = 0;
for (var i = 0; i < m; ++i) c += M[i];
return 0.79402 * m * Math.pow(2, c / m);
}
}
return {count: count};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words
var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());
For unknown reason implementation is very sensitive to max_error parameter, it is the main factor that determines the magnitude of the result. I'm sure, there is some stupid mistake :)
UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.
Here it is the updated version of the algorithm based on the newer paper:
var pow_2_32 = 0xFFFFFFFF + 1;
function HyperLogLog(std_error)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}
var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
m = Math.pow(2, k);
var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);
var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;
// -- make corrections
if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);
// --
return E;
}
}
return {count: count};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words
var seed = Math.floor(Math.random() * pow_2_32); // make more fun
var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
(count - words.length) / (words.length / 100.0) + '%');
Here is a slightly modified version which adds the merge operation.
Merge allows you to take the counters from several instances of HyperLogLog,
and determine the unique counters overall.
For example, if you have unique visitors collected on Monday, Tuesday and Wednesday,
then you can merge the buckets together and count the number of unique visitors
over the three day span:
var pow_2_32 = 0xFFFFFFFF + 1;
function HyperLogLog(std_error)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}
var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
m = Math.pow(2, k);
var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);
var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
function merge(other)
{
for (var i = 0; i < m; i++)
M[i] = Math.max(M[i], other.buckets[i]);
}
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;
// -- make corrections
if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);
// --
return E;
}
}
return {count: count, merge: merge, buckets: M};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
Then you can do something like this:
// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);
// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));
// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));}
// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);
// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));
We've open sourced a project called Stream-Lib that has a LogLog implementation. The work was based on this paper.
Using the js version #actual provided, I tried to implement the same in C#, which seems close enough. Just changed fnv1a function a little bit and renamed it to getHashCode. (Credit goes to Jenkins hash function, http://en.wikipedia.org/wiki/Jenkins_hash_function)
public class HyperLogLog
{
private double mapSize, alpha_m, k;
private int kComplement;
private Dictionary<int, int> Lookup = new Dictionary<int, int>();
private const double pow_2_32 = 4294967297;
public HyperLogLog(double stdError)
{
mapSize = (double)1.04 / stdError;
k = (long)Math.Ceiling(log2(mapSize * mapSize));
kComplement = 32 - (int)k;
mapSize = (long)Math.Pow(2, k);
alpha_m = mapSize == 16 ? (double)0.673
: mapSize == 32 ? (double)0.697
: mapSize == 64 ? (double)0.709
: (double)0.7213 / (double)(1 + 1.079 / mapSize);
for (int i = 0; i < mapSize; i++)
Lookup[i] = 0;
}
private static double log2(double x)
{
return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
}
private static int getRank(uint hash, int max)
{
int r = 1;
uint one = 1;
while ((hash & one) == 0 && r <= max)
{
++r;
hash >>= 1;
}
return r;
}
public static uint getHashCode(string text)
{
uint hash = 0;
for (int i = 0, l = text.Length; i < l; i++)
{
hash += (uint)text[i];
hash += hash << 10;
hash ^= hash >> 6;
}
hash += hash << 3;
hash ^= hash >> 6;
hash += hash << 16;
return hash;
}
public int Count()
{
double c = 0, E;
for (var i = 0; i < mapSize; i++)
c += 1d / Math.Pow(2, (double)Lookup[i]);
E = alpha_m * mapSize * mapSize / c;
// Make corrections & smoothen things.
if (E <= (5 / 2) * mapSize)
{
double V = 0;
for (var i = 0; i < mapSize; i++)
if (Lookup[i] == 0) V++;
if (V > 0)
E = mapSize * Math.Log(mapSize / V);
}
else
if (E > (1 / 30) * pow_2_32)
E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
// Made corrections & smoothen things, or not.
return (int)E;
}
public void Add(object val)
{
uint hashCode = getHashCode(val.ToString());
int j = (int)(hashCode >> kComplement);
Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
}
}
I know this is an old post but the #buryat implementation has moved, and is in any case incomplete, and a bit on the slow side (sorry o_o ).
I've taken the implementation used by the new Redis release which can be found here and ported it to PHP. The repo is here https://github.com/joegreen0991/HyperLogLog
<?php
class HyperLogLog {
private $HLL_P_MASK;
private $HLL_REGISTERS;
private $ALPHA;
private $registers;
public function __construct($HLL_P = 14)
{
$this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */
$this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */
$this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);
$this->registers = new SplFixedArray($this->HLL_REGISTERS);
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = 0;
}
}
public function add($v)
{
$h = crc32(md5($v));
$h |= 1 << 63; /* Make sure the loop terminates. */
$bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
$count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
while(($h & $bit) == 0) {
$count++;
$bit <<= 1;
}
/* Update the register if this element produced a longer run of zeroes. */
$index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */
if ($this->registers[$index] < $count) {
$this->registers[$index] = $count;
}
}
public function export()
{
$str = '';
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$str .= chr($this->registers[$i]);
}
return $str;
}
public function import($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
}
}
public function merge($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if(isset($str[$i]))
{
$ord = ord($str[$i]);
if ($this->registers[$i] < $ord) {
$this->registers[$i] = $ord;
}
}
}
}
/**
* #static
* #param $arr
* #return int Number of unique items in $arr
*/
public function count() {
$E = 0;
$ez = 0;
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if ($this->registers[$i] !== 0) {
$E += (1.0 / pow(2, $this->registers[$i]));
} else {
$ez++;
$E += 1.0;
}
}
$E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;
/* Use the LINEARCOUNTING algorithm for small cardinalities.
* For larger values but up to 72000 HyperLogLog raw approximation is
* used since linear counting error starts to increase. However HyperLogLog
* shows a strong bias in the range 2.5*16384 - 72000, so we try to
* compensate for it. */
if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
$E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
}
else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
// We did polynomial regression of the bias for this range, this
// way we can compute the bias for a given cardinality and correct
// according to it. Only apply the correction for P=14 that's what
// we use and the value the correction was verified with.
$bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
-1.4253 * 1.0e-12 * ($E*$E*$E)+
1.2940 * 1.0e-7 * ($E*$E)
-5.2921 * 1.0e-3 * $E+
83.3216;
$E -= $E * ($bias/100);
}
return floor($E);
}
}
I implemented loglog and hyperloglog in JS and PHP and well-commented code https://github.com/buryat/loglog