Bringing in a Function from Javascript to Big Query StandardSQL - javascript

I have a JavaScript function (getBuckets) that assigns a bucket to a customer based on an algorithm number and a test number. However, I am having trouble porting this function from JavaScript and defining it in BigQuery Standard SQL so that I can use it on specific datasets.
Here is the Input Schema from Big Query that I am trying to get a bucket number for:
7354430 AS customerId,
4 AS algorithmIndex,
5947 AS testId
Here is what the output should be after running my sql:
customerId,
bucketNumber
Does anyone know how to bring the getBuckets function from the JavaScript into BigQuery Standard SQL in order to get the bucketNumber?
Javascript is below:
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* SHA-1 implementation in JavaScript (c) Chris Veness 2002-2014 / MIT Licence */
/* */
/* - see http://csrc.nist.gov/groups/ST/toolkit/secure_hashing.html */
/* http://csrc.nist.gov/groups/ST/toolkit/examples.html */
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* jshint node:true *//* global define, escape, unescape */
'use strict';
/**
* SHA-1 hash function reference implementation.
*
* #namespace
*/
var Sha1 = {};

/**
 * Computes the SHA-1 digest of a string.
 *
 * @param {string} msg - Text to hash; it is UTF-8 encoded before hashing.
 * @returns {string} The digest as 40 lowercase hexadecimal characters.
 */
Sha1.hash = function(msg) {
  // Round constants, one per group of 20 rounds.
  const roundConstants = [0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6];

  // One character per byte: the UTF-8 encoded message plus the 0x80 padding byte.
  const padded = msg.utf8Encode() + String.fromCharCode(0x80);

  // Words needed = message words + two words for the bit length, in 16-word blocks.
  const blockCount = Math.ceil((padded.length / 4 + 2) / 16);
  const blocks = [];
  for (let b = 0; b < blockCount; b++) {
    const words = [];
    for (let w = 0; w < 16; w++) {
      const at = b * 64 + w * 4;
      // Big-endian packing; charCodeAt past the end yields NaN, which the
      // bitwise operators turn into 0 (this provides the zero padding).
      words.push(
        (padded.charCodeAt(at) << 24) |
        (padded.charCodeAt(at + 1) << 16) |
        (padded.charCodeAt(at + 2) << 8) |
        padded.charCodeAt(at + 3)
      );
    }
    blocks.push(words);
  }

  // Message length in bits (excluding the 0x80 byte) goes in the last two words.
  // The high word is computed arithmetically because bitwise operators
  // truncate their operands to 32 bits.
  const bitLength = (padded.length - 1) * 8;
  const lastBlock = blocks[blockCount - 1];
  lastBlock[14] = Math.floor(bitLength / Math.pow(2, 32));
  lastBlock[15] = bitLength & 0xffffffff;

  // Initial hash value.
  const state = [0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0];
  const schedule = new Array(80);

  for (let b = 0; b < blockCount; b++) {
    // 1 - message schedule: 16 block words expanded to 80.
    for (let t = 0; t < 80; t++) {
      schedule[t] = t < 16
        ? blocks[b][t]
        : Sha1.ROTL(schedule[t - 3] ^ schedule[t - 8] ^ schedule[t - 14] ^ schedule[t - 16], 1);
    }

    // 2 - working variables start from the running hash value.
    let a = state[0];
    let bb = state[1];
    let c = state[2];
    let d = state[3];
    let e = state[4];

    // 3 - 80 rounds.
    for (let t = 0; t < 80; t++) {
      const stage = Math.floor(t / 20);
      const next = (Sha1.ROTL(a, 5) + Sha1.f(stage, bb, c, d) + e +
        roundConstants[stage] + schedule[t]) & 0xffffffff;
      e = d;
      d = c;
      c = Sha1.ROTL(bb, 30);
      bb = a;
      a = next;
    }

    // 4 - fold the round output back in (addition modulo 2^32).
    state[0] = (state[0] + a) & 0xffffffff;
    state[1] = (state[1] + bb) & 0xffffffff;
    state[2] = (state[2] + c) & 0xffffffff;
    state[3] = (state[3] + d) & 0xffffffff;
    state[4] = (state[4] + e) & 0xffffffff;
  }

  let hex = '';
  for (let k = 0; k < 5; k++) {
    hex += Sha1.toHexStr(state[k]);
  }
  return hex;
};

/**
 * Round function: selects Ch, Parity or Maj depending on the stage.
 *
 * @param {number} s - Stage number (0..3); any other value yields undefined.
 * @param {number} x - First 32-bit word.
 * @param {number} y - Second 32-bit word.
 * @param {number} z - Third 32-bit word.
 * @returns {number|undefined} Combined word.
 * @private
 */
Sha1.f = function(s, x, y, z) {
  if (s === 0) {
    return (x & y) ^ (~x & z); // Ch
  }
  if (s === 1 || s === 3) {
    return x ^ y ^ z; // Parity
  }
  if (s === 2) {
    return (x & y) ^ (x & z) ^ (y & z); // Maj
  }
};

/**
 * Circular left shift of a 32-bit word.
 *
 * @param {number} x - Word to rotate.
 * @param {number} n - Number of bit positions.
 * @returns {number} Rotated word as a signed 32-bit integer.
 * @private
 */
Sha1.ROTL = function(x, n) {
  const shiftedOut = x >>> (32 - n);
  const shiftedUp = x << n;
  return shiftedUp | shiftedOut;
};

/**
 * Formats a 32-bit word as exactly eight hexadecimal digits.
 *
 * @param {number} n - Word to format.
 * @returns {string} Eight lowercase hex digits.
 * @private
 */
Sha1.toHexStr = function(n) {
  // Emit one nibble at a time so that words with the top bit set do not
  // come out as negative numbers, as n.toString(16) would produce.
  let digits = '';
  for (let shift = 28; shift >= 0; shift -= 4) {
    digits += ((n >>> shift) & 0xf).toString(16);
  }
  return digits;
};

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/** Adds String#utf8Encode (one output character per UTF-8 byte) unless already present. */
if (typeof String.prototype.utf8Encode === 'undefined') {
  String.prototype.utf8Encode = function() {
    const percentEncoded = encodeURIComponent(this);
    return unescape(percentEncoded);
  };
}

/** Adds String#utf8Decode (inverse of utf8Encode) unless already present. */
if (typeof String.prototype.utf8Decode === 'undefined') {
  String.prototype.utf8Decode = function() {
    try {
      const percentEncoded = escape(this);
      return decodeURIComponent(percentEncoded);
    } catch (err) {
      return this; // not valid UTF-8: hand the input back unchanged
    }
  };
}

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// CommonJS export, when a module object is available.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Sha1;
}
// AMD registration, when an AMD loader is available.
if (typeof define === 'function' && define.amd) {
  define([], function() { return Sha1; });
}

/**
 * Deterministically assigns a customer to a test bucket.
 *
 * The last four hex digits of SHA-1(customerId + testNumber) are scaled to a
 * value c in [0, 1]; c is then compared against the cut points of the chosen
 * algorithm.
 *
 * @param {*} customerId - Customer identifier; added to testNumber with `+` to form the seed.
 * @param {*} algorithmIndex - Split algorithm: 1..27 are built in, others are looked up in testDescriptors.
 * @param {*} testNumber - Test identifier.
 * @returns {number|string} Bucket number; -1 or 0 when an input is null/undefined
 *   (see below); the empty string for a non-null algorithmIndex that is unknown.
 */
Sha1.getDeterministicBuckets = function(customerId, algorithmIndex, testNumber) {
  // Ascending cut points for algorithms 1..27. The first cut that exceeds c
  // selects the bucket; when none does, the bucket is 0. Buckets count down
  // from the number of cuts to 1, except for algorithm 26 which counts up.
  const cutPoints = {
    1: [0.25, 0.50], /* 25/25/50 */
    2: [0.01], /* 1/99 */
    3: [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90], /* 10/10/10/... */
    4: [0.25, 0.50, 0.75], /* 25/25/25/25 */
    5: [0.50], /* 50/50 */
    6: [0.10], /* 10/90 */
    7: [0.10, 0.20, 0.30, 0.40, 0.50], /* 10/10/10/10/10/50 */
    8: [0.20, 0.40, 0.60, 0.80], /* 20/20/20/20/20 */
    9: [0.02, 0.04], /* 96/2/2 */
    10: [0.20], /* 80/20 */
    11: [0.125, 0.250, 0.375, 0.500, 0.625, 0.750, 0.875], /* 12.5 x 8 */
    12: [0.20, 0.40, 0.50], /* 50/10/20/20 */
    13: [], /* 100 */
    14: [0.02, 0.04, 0.06, 0.08, 0.10, 0.12, 0.14, 0.16, 0.18, 0.20], /* 80/2 x 10 */
    15: [0.03, 0.06, 0.09, 0.12, 0.15, 0.18, 0.21, 0.24, 0.27,
      0.30, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.51, 0.54,
      0.57, 0.60, 0.63, 0.66, 0.69, 0.72, 0.75, 0.78, 0.89], /* 11/11/3/3/3/... */
    16: [0.08, 0.31, 0.54, 0.77], /* 23/23/23/23/8 */
    17: [0.005, 0.010, 0.015, 0.020, 0.025, 0.030], /* 97/0.5 x 6 */
    18: [0.10, 0.20], /* 80/10/10 */
    19: [0.10, 0.20, 0.30], /* 70/10/10/10 */
    20: [0.05, 0.10], /* 90/5/5 */
    21: [0.05, 0.10, 0.15, 0.20], /* 80/5/5/5/5 */
    22: [0.10, 0.55], /* 45/45/10 */
    23: [0.0588, 0.1176, 0.1764, 0.2352, 0.2940, 0.3528, 0.4116, 0.4704,
      0.5292, 0.5880, 0.6468, 0.7056, 0.7644, 0.8232, 0.8820, 0.9401], /* 5.88 x 17 */
    24: [0.025], /* 97.5/2.5 */
    25: [0.075], /* 92.5/7.5 */
    26: [0.025, 0.050, 0.075, 0.100, 0.125, 0.150, 0.175, 0.200, 0.225, 0.250,
      0.275, 0.300, 0.325, 0.350, 0.375, 0.400, 0.425, 0.450, 0.475, 0.500], /* 50/2.5 x 20 */
    27: [0.333, 0.666] /* 33.3/33.3/33.3 */
  };
  const lastBuiltIn = 27;
  const ascendingAlgorithm = 26;

  // Descriptor-driven algorithms: d lists percentage shares separated by '/'.
  const testDescriptors = {
    28:{s:100, d:'1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1'},
    29:{s:21, d:'40/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3'}
  };

  // The hash is computed before any validation, exactly as the inputs arrive.
  const digest = Sha1.hash((customerId + testNumber).toString());
  const c = parseInt(digest.slice(36), 16) / 65535.0;

  // Missing inputs: -1, unless algorithmIndex is outside 1..27, which gives 0.
  // NOTE(review): a null algorithmIndex compares as 0 here, so it yields 0
  // rather than -1 - confirm whether that is intended.
  if (customerId == null || algorithmIndex == null || testNumber == null) {
    return (algorithmIndex < 1 || algorithmIndex > lastBuiltIn) ? 0 : -1;
  }

  // Built-in algorithms. Loose equality is deliberate so that numeric strings
  // such as '4' select the same algorithm as the number 4.
  for (let index = 1; index <= lastBuiltIn; index++) {
    if (algorithmIndex == index) {
      const cuts = cutPoints[index];
      for (let i = 0; i < cuts.length; i++) {
        if (c < cuts[i]) {
          return index === ascendingAlgorithm ? i + 1 : cuts.length - i;
        }
      }
      return 0;
    }
  }

  // Unknown algorithm: the empty string is returned.
  if (!(algorithmIndex in testDescriptors)) {
    return '';
  }

  // Descriptor-driven split: the first share is skipped, so bucket 0 is
  // whatever remains after buckets 1..n have taken their shares.
  const shares = testDescriptors[algorithmIndex].d.split('/');
  let threshold = 0.0;
  for (let slot = 1; slot < shares.length; slot++) {
    threshold += parseFloat(shares[slot]) / 100.0;
    if (c < threshold) {
      return slot;
    }
  }
  return 0;
};
/**
 * Row handler for the UDF: computes the bucket for one input row and emits
 * a single output record.
 *
 * @param {Object} row - Input row; reads inCustomerId, inAlgorithmIndex and inTestNumber.
 * @param {function(Object)} emit - Callback that receives the output record.
 */
function getBuckets(row, emit) {
  const bucketNumber = Sha1.getDeterministicBuckets(
    row.inCustomerId,
    row.inAlgorithmIndex,
    row.inTestNumber
  );
  const record = {bucketNumber: bucketNumber, CustomerId: row.inCustomerId};
  emit(record);
}
// Registers getBuckets with BigQuery under the SQL name 'getBuckets',
// declaring the three input columns it reads and the two integer columns
// carried by each emitted record.
bigquery.defineFunction(
'getBuckets', // Name of the function exported to SQL
['inCustomerId','inAlgorithmIndex','inTestNumber'], // Names of input columns
[{'name': 'bucketNumber', 'type': 'integer'}, // Output schema
{'name': 'CustomerId', 'type': 'integer'}],
getBuckets // Reference to JavaScript UDF
);

I fixed it for #standardSQL.
First, add these 3 lines at the beginning, to define a JS UDF:
CREATE TEMP FUNCTION getDeterministicBuckets(customerId INT64, algorithmIndex INT64, testId INT64)
RETURNS STRUCT<bucketNumber INT64, CustomerId INT64>
LANGUAGE js AS """
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* SHA-1 implementation in JavaScript (c) Chris Veness 2002-2014 / MIT Licence */
/* */
/* - see http://csrc.nist.gov/groups/ST/toolkit/secure_hashing.html */
/* http://csrc.nist.gov/groups/ST/toolkit/examples.html */
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
At the end, instead of registering the function with bigquery.defineFunction():
Don't use emit().
Return the result directly instead.
For example:
return {bucketNumber: Sha1.getDeterministicBuckets(customerId, algorithmIndex, testId) , CustomerId: customerId};
""";
That will allow you to call the function like this:
SELECT getDeterministicBuckets(7354430,4,5947) x
Complete working code:
CREATE TEMP FUNCTION getDeterministicBuckets(customerId INT64, algorithmIndex INT64, testId INT64)
RETURNS STRUCT<bucketNumber INT64, CustomerId INT64>
LANGUAGE js AS """
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* SHA-1 implementation in JavaScript (c) Chris Veness 2002-2014 / MIT Licence */
/* */
/* - see http://csrc.nist.gov/groups/ST/toolkit/secure_hashing.html */
/* http://csrc.nist.gov/groups/ST/toolkit/examples.html */
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* jshint node:true *//* global define, escape, unescape */
'use strict';
/**
* SHA-1 hash function reference implementation.
*
* #namespace
*/
var Sha1 = {};

/**
 * Computes the SHA-1 digest of a string.
 *
 * @param {string} msg - Text to hash; it is UTF-8 encoded before hashing.
 * @returns {string} The digest as 40 lowercase hexadecimal characters.
 */
Sha1.hash = function(msg) {
  // Round constants, one per group of 20 rounds.
  const roundConstants = [0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6];

  // One character per byte: the UTF-8 encoded message plus the 0x80 padding byte.
  const padded = msg.utf8Encode() + String.fromCharCode(0x80);

  // Words needed = message words + two words for the bit length, in 16-word blocks.
  const blockCount = Math.ceil((padded.length / 4 + 2) / 16);
  const blocks = [];
  for (let b = 0; b < blockCount; b++) {
    const words = [];
    for (let w = 0; w < 16; w++) {
      const at = b * 64 + w * 4;
      // Big-endian packing; charCodeAt past the end yields NaN, which the
      // bitwise operators turn into 0 (this provides the zero padding).
      words.push(
        (padded.charCodeAt(at) << 24) |
        (padded.charCodeAt(at + 1) << 16) |
        (padded.charCodeAt(at + 2) << 8) |
        padded.charCodeAt(at + 3)
      );
    }
    blocks.push(words);
  }

  // Message length in bits (excluding the 0x80 byte) goes in the last two words.
  // The high word is computed arithmetically because bitwise operators
  // truncate their operands to 32 bits.
  const bitLength = (padded.length - 1) * 8;
  const lastBlock = blocks[blockCount - 1];
  lastBlock[14] = Math.floor(bitLength / Math.pow(2, 32));
  lastBlock[15] = bitLength & 0xffffffff;

  // Initial hash value.
  const state = [0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0];
  const schedule = new Array(80);

  for (let b = 0; b < blockCount; b++) {
    // 1 - message schedule: 16 block words expanded to 80.
    for (let t = 0; t < 80; t++) {
      schedule[t] = t < 16
        ? blocks[b][t]
        : Sha1.ROTL(schedule[t - 3] ^ schedule[t - 8] ^ schedule[t - 14] ^ schedule[t - 16], 1);
    }

    // 2 - working variables start from the running hash value.
    let a = state[0];
    let bb = state[1];
    let c = state[2];
    let d = state[3];
    let e = state[4];

    // 3 - 80 rounds.
    for (let t = 0; t < 80; t++) {
      const stage = Math.floor(t / 20);
      const next = (Sha1.ROTL(a, 5) + Sha1.f(stage, bb, c, d) + e +
        roundConstants[stage] + schedule[t]) & 0xffffffff;
      e = d;
      d = c;
      c = Sha1.ROTL(bb, 30);
      bb = a;
      a = next;
    }

    // 4 - fold the round output back in (addition modulo 2^32).
    state[0] = (state[0] + a) & 0xffffffff;
    state[1] = (state[1] + bb) & 0xffffffff;
    state[2] = (state[2] + c) & 0xffffffff;
    state[3] = (state[3] + d) & 0xffffffff;
    state[4] = (state[4] + e) & 0xffffffff;
  }

  let hex = '';
  for (let k = 0; k < 5; k++) {
    hex += Sha1.toHexStr(state[k]);
  }
  return hex;
};

/**
 * Round function: selects Ch, Parity or Maj depending on the stage.
 *
 * @param {number} s - Stage number (0..3); any other value yields undefined.
 * @param {number} x - First 32-bit word.
 * @param {number} y - Second 32-bit word.
 * @param {number} z - Third 32-bit word.
 * @returns {number|undefined} Combined word.
 * @private
 */
Sha1.f = function(s, x, y, z) {
  if (s === 0) {
    return (x & y) ^ (~x & z); // Ch
  }
  if (s === 1 || s === 3) {
    return x ^ y ^ z; // Parity
  }
  if (s === 2) {
    return (x & y) ^ (x & z) ^ (y & z); // Maj
  }
};

/**
 * Circular left shift of a 32-bit word.
 *
 * @param {number} x - Word to rotate.
 * @param {number} n - Number of bit positions.
 * @returns {number} Rotated word as a signed 32-bit integer.
 * @private
 */
Sha1.ROTL = function(x, n) {
  const shiftedOut = x >>> (32 - n);
  const shiftedUp = x << n;
  return shiftedUp | shiftedOut;
};

/**
 * Formats a 32-bit word as exactly eight hexadecimal digits.
 *
 * @param {number} n - Word to format.
 * @returns {string} Eight lowercase hex digits.
 * @private
 */
Sha1.toHexStr = function(n) {
  // Emit one nibble at a time so that words with the top bit set do not
  // come out as negative numbers, as n.toString(16) would produce.
  let digits = '';
  for (let shift = 28; shift >= 0; shift -= 4) {
    digits += ((n >>> shift) & 0xf).toString(16);
  }
  return digits;
};

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/** Adds String#utf8Encode (one output character per UTF-8 byte) unless already present. */
if (typeof String.prototype.utf8Encode === 'undefined') {
  String.prototype.utf8Encode = function() {
    const percentEncoded = encodeURIComponent(this);
    return unescape(percentEncoded);
  };
}

/** Adds String#utf8Decode (inverse of utf8Encode) unless already present. */
if (typeof String.prototype.utf8Decode === 'undefined') {
  String.prototype.utf8Decode = function() {
    try {
      const percentEncoded = escape(this);
      return decodeURIComponent(percentEncoded);
    } catch (err) {
      return this; // not valid UTF-8: hand the input back unchanged
    }
  };
}

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
// CommonJS export, when a module object is available.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Sha1;
}
// AMD registration, when an AMD loader is available.
if (typeof define === 'function' && define.amd) {
  define([], function() { return Sha1; });
}

/**
 * Deterministically assigns a customer to a test bucket.
 *
 * The last four hex digits of SHA-1(customerId + testNumber) are scaled to a
 * value c in [0, 1]; c is then compared against the cut points of the chosen
 * algorithm.
 *
 * @param {*} customerId - Customer identifier; added to testNumber with `+` to form the seed.
 * @param {*} algorithmIndex - Split algorithm: 1..27 are built in, others are looked up in testDescriptors.
 * @param {*} testNumber - Test identifier.
 * @returns {number|string} Bucket number; -1 or 0 when an input is null/undefined
 *   (see below); the empty string for a non-null algorithmIndex that is unknown.
 */
Sha1.getDeterministicBuckets = function(customerId, algorithmIndex, testNumber) {
  // Ascending cut points for algorithms 1..27. The first cut that exceeds c
  // selects the bucket; when none does, the bucket is 0. Buckets count down
  // from the number of cuts to 1, except for algorithm 26 which counts up.
  const cutPoints = {
    1: [0.25, 0.50], /* 25/25/50 */
    2: [0.01], /* 1/99 */
    3: [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90], /* 10/10/10/... */
    4: [0.25, 0.50, 0.75], /* 25/25/25/25 */
    5: [0.50], /* 50/50 */
    6: [0.10], /* 10/90 */
    7: [0.10, 0.20, 0.30, 0.40, 0.50], /* 10/10/10/10/10/50 */
    8: [0.20, 0.40, 0.60, 0.80], /* 20/20/20/20/20 */
    9: [0.02, 0.04], /* 96/2/2 */
    10: [0.20], /* 80/20 */
    11: [0.125, 0.250, 0.375, 0.500, 0.625, 0.750, 0.875], /* 12.5 x 8 */
    12: [0.20, 0.40, 0.50], /* 50/10/20/20 */
    13: [], /* 100 */
    14: [0.02, 0.04, 0.06, 0.08, 0.10, 0.12, 0.14, 0.16, 0.18, 0.20], /* 80/2 x 10 */
    15: [0.03, 0.06, 0.09, 0.12, 0.15, 0.18, 0.21, 0.24, 0.27,
      0.30, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.51, 0.54,
      0.57, 0.60, 0.63, 0.66, 0.69, 0.72, 0.75, 0.78, 0.89], /* 11/11/3/3/3/... */
    16: [0.08, 0.31, 0.54, 0.77], /* 23/23/23/23/8 */
    17: [0.005, 0.010, 0.015, 0.020, 0.025, 0.030], /* 97/0.5 x 6 */
    18: [0.10, 0.20], /* 80/10/10 */
    19: [0.10, 0.20, 0.30], /* 70/10/10/10 */
    20: [0.05, 0.10], /* 90/5/5 */
    21: [0.05, 0.10, 0.15, 0.20], /* 80/5/5/5/5 */
    22: [0.10, 0.55], /* 45/45/10 */
    23: [0.0588, 0.1176, 0.1764, 0.2352, 0.2940, 0.3528, 0.4116, 0.4704,
      0.5292, 0.5880, 0.6468, 0.7056, 0.7644, 0.8232, 0.8820, 0.9401], /* 5.88 x 17 */
    24: [0.025], /* 97.5/2.5 */
    25: [0.075], /* 92.5/7.5 */
    26: [0.025, 0.050, 0.075, 0.100, 0.125, 0.150, 0.175, 0.200, 0.225, 0.250,
      0.275, 0.300, 0.325, 0.350, 0.375, 0.400, 0.425, 0.450, 0.475, 0.500], /* 50/2.5 x 20 */
    27: [0.333, 0.666] /* 33.3/33.3/33.3 */
  };
  const lastBuiltIn = 27;
  const ascendingAlgorithm = 26;

  // Descriptor-driven algorithms: d lists percentage shares separated by '/'.
  const testDescriptors = {
    28:{s:100, d:'1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1/1'},
    29:{s:21, d:'40/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3/3'}
  };

  // The hash is computed before any validation, exactly as the inputs arrive.
  const digest = Sha1.hash((customerId + testNumber).toString());
  const c = parseInt(digest.slice(36), 16) / 65535.0;

  // Missing inputs: -1, unless algorithmIndex is outside 1..27, which gives 0.
  // NOTE(review): a null algorithmIndex compares as 0 here, so it yields 0
  // rather than -1 - confirm whether that is intended.
  if (customerId == null || algorithmIndex == null || testNumber == null) {
    return (algorithmIndex < 1 || algorithmIndex > lastBuiltIn) ? 0 : -1;
  }

  // Built-in algorithms. Loose equality is deliberate so that numeric strings
  // such as '4' select the same algorithm as the number 4.
  for (let index = 1; index <= lastBuiltIn; index++) {
    if (algorithmIndex == index) {
      const cuts = cutPoints[index];
      for (let i = 0; i < cuts.length; i++) {
        if (c < cuts[i]) {
          return index === ascendingAlgorithm ? i + 1 : cuts.length - i;
        }
      }
      return 0;
    }
  }

  // Unknown algorithm: the empty string is returned.
  if (!(algorithmIndex in testDescriptors)) {
    return '';
  }

  // Descriptor-driven split: the first share is skipped, so bucket 0 is
  // whatever remains after buckets 1..n have taken their shares.
  const shares = testDescriptors[algorithmIndex].d.split('/');
  let threshold = 0.0;
  for (let slot = 1; slot < shares.length; slot++) {
    threshold += parseFloat(shares[slot]) / 100.0;
    if (c < threshold) {
      return slot;
    }
  }
  return 0;
};
return {bucketNumber: Sha1.getDeterministicBuckets(customerId, algorithmIndex, testId) , CustomerId: customerId};
""";
SELECT getDeterministicBuckets(7354430,4,5947) x

If your question is how to call the JS function from your example, you can follow https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions#including-javascript-libraries, which comes down to two steps:
Upload the JS file to GCS bucket
Create a function to call the JS function, which will be something like
-- Template for a persistent JavaScript UDF whose body delegates to code loaded
-- from the Cloud Storage files listed in OPTIONS(library=...).
-- yourDataset, the gs:// paths and yourJS are placeholders, and the `...` in
-- the body stands for the arguments to forward.
CREATE OR REPLACE FUNCTION yourDataset.getBuckets(customerId STRING, algorithmIndex FLOAT64, testId STRING)
RETURNS STRING
LANGUAGE js
OPTIONS (
library=["gs://my-bucket/path/to/lib1.js", "gs://my-bucket/path/to/lib2.js"]
)
AS """
return yourJS.getBuckets(...);
"""

It looks like most of your JavaScript is just a SHA-1 implementation.
So how about simply using BigQuery's built-in SHA1 function instead?

Related

Choropleth Pretty Breaks

I'm trying to implement a "pretty breaks" function in JavaScript, but I can only find examples in R.
Do you know where I can find an explanation or implementation for JavaScript?
My issue is that I don't understand how the breaks are generated. I understand how to round a break to a nice value, but how are the break ranges calculated when, for example, 4 breaks are requested?
I use geojson file working with density.
Thank you for the help
PS : I apologize for my english, I am not a native speaker.
I've solved using this steps.
I hope it serves someone else.
Thanks a lot :D
function prettyBreaks(minimum, maximum, classes) {
let breaks = [];
if ( classes < 1 ) {
breaks.push( maximum );
return breaks;
}
let minimumCount = parseInt(classes / 3 );
let shrink = 0.75;
let highBias = 1.5;
let adjustBias = 0.5 + 1.5 * highBias;
let divisions = parseInt( classes );
let h = highBias;
let cell;
let small = false;
let dx = maximum - minimum;
if (NearTo( dx, 0.0 ) && NearTo( maximum, 0.0 )) {
cell = 1.0;
small = true;
} else {
let U = 1;
cell = (Math.abs(maximum) >= Math.abs(minimum)) ? Math.abs(maximum) : Math.abs(minimum);
if ( adjustBias >= 1.5 * h + 0.5) {
U = parseInt( 1 + (1.0 / (1 + h)) );
} else {
U = parseInt( 1 + ( 1.5 / ( 1 + adjustBias ) ) );
}
let maxBetweenDivisions = (1 >= divisions) ? 1 : divisions;
small = dx < ( cell * U * maxBetweenDivisions * 1e-07 * 3.0 );
}
if (small) {
if (cell > 10) {
cell = 9 + cell / 10;
cell = cell * shrink;
}
if (minimumCount > 1) {
cell = cell / minimumCount;
}
} else {
cell = dx;
if (divisions > 1) {
cell = cell / divisions;
}
}
if (cell < 20 * 1e-07) {
cell = 20 * 1e-07;
}
let base = Math.pow(10.0, Math.floor( Math.log10( cell )));
let unit = base;
if ( ( 2 * base ) - cell < h * ( cell - unit ) ) {
unit = 2.0 * base;
if ( ( 5 * base ) - cell < adjustBias * ( cell - unit ) )
{
unit = 5.0 * base;
if ( ( 10.0 * base ) - cell < h * ( cell - unit ) )
{
unit = 10.0 * base;
}
}
}
let start = parseInt( Math.floor( minimum / unit + 1e-07 ) );
let end = parseInt( Math.ceil( maximum / unit - 1e-07 ) );
// Extend the range out beyond the data. Does this ever happen??
while ( start * unit > minimum + ( 1e-07 * unit ) ) {
start = start - 1;
}
while ( end * unit < maximum - ( 1e-07 * unit ) ) {
end = end + 1;
}
let k = parseInt( Math.floor( 0.5 + end - start ) );
if ( k < minimumCount ) {
k = minimumCount - k;
if ( start >= 0 ) {
end = end + k / 2;
start = start - k / 2 + k % 2;
}
else {
start = start - k / 2;
end = end + k / 2 + k % 2;
}
}
let minimumBreak = start * unit;
let count = parseInt( end - start );
breaks = [];
for ( let i = 1; i < count + 1; i++ ) {
breaks.push( minimumBreak + i * unit );
}
if ( breaks.length === 0 ) return breaks;
if ( breaks[0] < minimum ) {
breaks.splice(0, 1, minimum);
}
if ( breaks[breaks.length-1] > maximum ) {
breaks.splice(breaks.length-1, 1, maximum);
}
if ( minimum < 0.0 && maximum > 0.0 ) { //then there should be a zero somewhere
let breaksMinusZero = []; // compute difference "each break - 0"
for ( let i = 0; i <= breaks.length; i++ ) {
breaksMinusZero.push( breaks[i] - 0.0 );
}
let posOfMin = 0;
for ( let i = 1; i <= breaks.length; i++ ) { // find position of minimal difference
if ( Math.abs( breaksMinusZero[i] ) < Math.abs( breaksMinusZero[i - 1] ) )
posOfMin = i;
}
breaks[posOfMin] = 0.0;
}
return breaks;
};
/**
 * Range test used by the break computation above.
 *
 * Despite the name this does not measure the distance between the two
 * arguments: it reports whether BOTH of them lie in the half-open
 * interval [0, 1).
 *
 * @param {number} value1 - first value to test.
 * @param {number} value2 - second value to test.
 * @returns {boolean} true only when 0 <= value1 < 1 and 0 <= value2 < 1.
 */
function NearTo(value1, value2) {
  var inUnitRange = function (candidate) {
    return candidate >= 0 && candidate < 1;
  };
  // Anything negative, >= 1 or NaN fails the range test.
  return inUnitRange(value1) && inUnitRange(value2);
}

Python and Javascript Pseudo Random Number Generator PRNG

I am looking for a way to generate the same sequence of pseudo random integer numbers from both python and javascript.
When I seed in python like this I get the below results:
random.seed(3909461935)
random.randint(0, 2147483647) = 162048056
random.randint(0, 2147483647) = 489743869
random.randint(0, 2147483647) = 1561110296
I need the same sequence in javascript.
Note: I used 2147483647 as the range in the randint method because I am assuming javascript can only handle 32 bit INTs.
Are there any libraries on both sides I can use to generate the same set of pseudo random numbers given the same seed?
I have found two implementations of Mersenne Twister that generate the same 32 bit integer values given the same seed.
This way you can generate a server side sequence in Python, and have the browser independently generate the same sequence in javascript.
Python:
from mt_random import *
r = mersenne_rng(seed = 12345)
r.get_random_number() # Prints 3992670690
r.get_random_number() # Prints 3823185381
r.get_random_number() # Prints 1358822685
Javascript:
// Seed the generator once, then draw three 32-bit unsigned integers.
var r = new MersenneTwister();
r.init_genrand(12345);
r.genrand_int32(); // Returns 3992670690
r.genrand_int32(); // Returns 3823185381
r.genrand_int32(); // Returns 1358822685
The JS is here:
/*
* Pseudo-random number generator (port)
*
* Mersenne Twister with improved initialization (2002)
* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/mt.html
* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/mt19937ar.html
*/
// = License of the original (ported) source ============================
// ======================================================================
/*
A C-program for MT19937, with initialization improved 2002/2/10.
Coded by Takuji Nishimura and Makoto Matsumoto.
This is a faster version by taking Shawn Cokus's optimization,
Matthe Bellew's simplification, Isaku Wada's real version.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or another materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat # math.sci.hiroshima-u.ac.jp (remove space)
*/
// ======================================================================
/**
 * MT19937 Mersenne Twister pseudo-random number generator.
 *
 * Must be invoked with `new`. All state words are kept as unsigned 32-bit
 * JavaScript numbers; every arithmetic step is reduced modulo 2^32 with
 * `>>> 0`, and 32-bit products are taken with `Math.imul`, so no
 * byte-array integer emulation is needed.
 *
 * Public methods:
 *   init_genrand(seed)               - seed from a single integer
 *   init_by_array(key[, key_length]) - seed from an array of integers
 *   genrand_int32()                  - integer on [0, 0xffffffff]
 *   genrand_int31()                  - integer on [0, 0x7fffffff]
 *   genrand_real1()                  - real on [0, 1]
 *   genrand_real2()                  - real on [0, 1)
 *   genrand_real3()                  - real on (0, 1)
 *   genrand_res53()                  - real on [0, 1) with 53-bit resolution
 *
 * If no init function is called before the first draw, the default seed
 * 5489 is used.
 */
function MersenneTwister() {
  /* Period parameters */
  const N = 624;
  const M = 397;
  const MATRIX_A = 0x9908b0df; /* constant vector a */
  const UMASK = 0x80000000; /* most significant w-r bits */
  const LMASK = 0x7fffffff; /* least significant r bits */

  const state = new Array(N).fill(0); /* the array for the state vector */
  let left = 1; /* draws remaining before the state must be regenerated */
  let initf = 0; /* becomes 1 once the state has been seeded */
  let next = 0; /* index of the next state word to temper */

  /* Combines the top bit of u with the low 31 bits of v, shifts right by
     one and conditionally applies MATRIX_A (the MT "twist" step). */
  const twist = function (u, v) {
    const mixed = ((u & UMASK) | (v & LMASK)) >>> 1;
    return (v & 1) === 0 ? mixed : (mixed ^ MATRIX_A) >>> 0;
  };

  /* initializes state[N] with a seed (taken modulo 2^32) */
  const initGenrand = function (s) {
    state[0] = s >>> 0;
    for (let j = 1; j < N; ++j) {
      const prev = state[j - 1];
      /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
      state[j] = (Math.imul(1812433253, prev ^ (prev >>> 30)) + j) >>> 0;
    }
    left = 1;
    initf = 1;
  };
  this.init_genrand = initGenrand;

  /**
   * Initialize by an array of integer keys.
   *
   * @param {number[]} init_key - the array of initializing keys.
   * @param {number} [key_length] - number of keys to use; defaults to
   *   init_key.length when omitted.
   */
  this.init_by_array = function (init_key, key_length) {
    const keyLength = key_length == null ? init_key.length : key_length;
    initGenrand(19650218);
    let i = 1;
    let j = 0;
    let k = N > keyLength ? N : keyLength;
    for (; k; --k) {
      const prev = state[i - 1];
      /* non linear */
      state[i] = ((state[i] ^ Math.imul(prev ^ (prev >>> 30), 1664525)) +
        (init_key[j] >>> 0) + j) >>> 0;
      i++;
      j++;
      if (i >= N) {
        state[0] = state[N - 1];
        i = 1;
      }
      if (j >= keyLength) {
        j = 0;
      }
    }
    for (k = N - 1; k; --k) {
      const prev = state[i - 1];
      /* non linear */
      state[i] = ((state[i] ^ Math.imul(prev ^ (prev >>> 30), 1566083941)) - i) >>> 0;
      i++;
      if (i >= N) {
        state[0] = state[N - 1];
        i = 1;
      }
    }
    state[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
    left = 1;
    initf = 1;
  };

  /* Regenerates all N state words in place. */
  const nextState = function () {
    /* if init_genrand() has not been called, */
    /* a default initial seed is used */
    if (initf === 0) {
      initGenrand(5489);
    }
    left = N;
    next = 0;
    let p = 0;
    for (; p < N - M; ++p) {
      state[p] = (state[p + M] ^ twist(state[p], state[p + 1])) >>> 0;
    }
    for (; p < N - 1; ++p) {
      state[p] = (state[p + M - N] ^ twist(state[p], state[p + 1])) >>> 0;
    }
    state[p] = (state[p + M - N] ^ twist(state[p], state[0])) >>> 0;
  };

  /* generates a random number on [0,0xffffffff]-interval */
  const genrandInt32 = function () {
    if (--left === 0) {
      nextState();
    }
    let y = state[next];
    ++next;
    /* Tempering */
    y ^= y >>> 11;
    y ^= (y << 7) & 0x9d2c5680;
    y ^= (y << 15) & 0xefc60000;
    y ^= y >>> 18;
    return y >>> 0;
  };
  this.genrand_int32 = genrandInt32;

  /* generates a random number on [0,0x7fffffff]-interval */
  this.genrand_int31 = function () {
    return genrandInt32() >>> 1;
  };

  /* generates a random number on [0,1]-real-interval */
  this.genrand_real1 = function () {
    return genrandInt32() * (1.0 / 4294967295.0); /* divided by 2^32-1 */
  };

  /* generates a random number on [0,1)-real-interval */
  this.genrand_real2 = function () {
    return genrandInt32() * (1.0 / 4294967296.0); /* divided by 2^32 */
  };

  /* generates a random number on (0,1)-real-interval */
  this.genrand_real3 = function () {
    return (genrandInt32() + 0.5) * (1.0 / 4294967296.0); /* divided by 2^32 */
  };

  /* generates a random number on [0,1) with 53-bit resolution */
  this.genrand_res53 = function () {
    const a = genrandInt32() >>> 5;
    const b = genrandInt32() >>> 6;
    return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
  };
  /* These real versions are due to Isaku Wada, 2002/01/09 added */
}
The corresponding Python implementation is here:
class mersenne_rng(object):
    """MT19937 Mersenne Twister producing unsigned 32-bit integers.

    The state is a list of 624 words, each kept in the range [0, 2**32).
    ``get_random_number`` returns the tempered words one after another and
    regenerates the whole state (``twist``) once all 624 have been used.
    """

    def __init__(self, seed = 5489):
        """Seed the generator; ``seed`` is reduced modulo 2**32."""
        self.state = [0]*624
        self.f = 1812433253          # initialisation multiplier
        self.m = 397                 # offset used by twist()
        self.u = 11                  # tempering shifts and masks
        self.s = 7
        self.b = 0x9D2C5680
        self.t = 15
        self.c = 0xEFC60000
        self.l = 18
        self.index = 624             # forces a twist on the first draw
        self.lower_mask = (1<<31)-1
        self.upper_mask = 1<<31
        # update state; the seed is masked so that values outside
        # [0, 2**32) cannot leak extra bits into the recurrence
        self.state[0] = self.int_32(seed)
        for i in range(1,624):
            self.state[i] = self.int_32(self.f*(self.state[i-1]^(self.state[i-1]>>30)) + i)

    def twist(self):
        """Regenerate all 624 state words in place and reset the index."""
        for i in range(624):
            temp = self.int_32((self.state[i]&self.upper_mask)+(self.state[(i+1)%624]&self.lower_mask))
            temp_shift = temp>>1
            if temp%2 != 0:
                temp_shift = temp_shift^0x9908b0df
            self.state[i] = self.state[(i+self.m)%624]^temp_shift
        self.index = 0

    def get_random_number(self):
        """Return the next tempered output as an int in [0, 2**32)."""
        if self.index >= 624:
            self.twist()
        y = self.state[self.index]
        y = y^(y>>self.u)
        y = y^((y<<self.s)&self.b)
        y = y^((y<<self.t)&self.c)
        y = y^(y>>self.l)
        self.index+=1
        return self.int_32(y)

    def int_32(self, number):
        """Keep only the low 32 bits of ``number``."""
        return int(0xFFFFFFFF & number)
if __name__ == "__main__":
    # Demo: print the first ten outputs for a fixed seed.
    rng = mersenne_rng(1131464071)
    for i in range(10):
        print(rng.get_random_number())

How to convert json objectId to its string representation in javascript

I would like to convert json representation of bson ObjectId returned from REST mongodb API to string
from: {"inc":1365419770,"machine":-856505582,"timeSecond":1375343587,"time":1375343587000,"new":false};
to: 51fa13e3ccf2c3125162a6fa
in the client side, so it will call other API using path params.
I just developed it, in case some one else is looking for the same functionality.
/**
 * Parses the 24-character hex form of an ObjectId into its three fields.
 *
 * May be called with or without `new`.
 *
 * @param {string} hexstr - 24 hexadecimal characters (upper or lower case).
 * @throws {Error} when hexstr is null/undefined or not 24 hex characters.
 *
 * Resulting fields:
 *   timestamp - characters 0-7, as a signed 32-bit integer
 *   machine   - characters 8-15, as a signed 32-bit integer
 *   increment - characters 16-23, as an unsigned integer
 */
var ObjectIdStr = function (hexstr) {
    // Guard first: without `new`, `this` is not an ObjectIdStr (and is
    // undefined in strict mode), so nothing may be assigned to it yet.
    if (!(this instanceof ObjectIdStr)) {
        return new ObjectIdStr(hexstr);
    }
    var isValid = function (s) {
        return s != null && /^[0-9a-fA-F]{24}$/.test(s);
    };
    // `| 0` reinterprets the 8-digit value as a signed 32-bit integer, so
    // 0x80000000 itself maps to -2147483648 like every larger value.
    var fromHex = function (hex) {
        return parseInt(hex, 16) | 0;
    };
    if (!isValid(hexstr)) {
        throw new Error("invalid ObjectId [" + hexstr + "]");
    }
    this.timestamp = fromHex(hexstr.substring(0, 8));
    this.machine = fromHex(hexstr.substring(8, 16));
    this.increment = parseInt(hexstr.substring(16, 24), 16);
};
/**
 * Wraps the JSON form of an ObjectId and renders it as a hex string.
 *
 * May be called with or without `new`.
 *
 * @param {{timeSecond: number, machine: number, inc: number}} json -
 *   the three integer fields; negative values are treated as signed
 *   32-bit integers when converted to hex.
 */
var ObjectId = function (json) {
    // Guard first: without `new`, `this` is not an ObjectId (and is
    // undefined in strict mode), so nothing may be assigned to it yet.
    if (!(this instanceof ObjectId)) {
        return new ObjectId(json);
    }
    this.timestamp = json.timeSecond;
    this.machine = json.machine;
    this.increment = json.inc;
    // Negative numbers are shifted into the unsigned 32-bit range first.
    var hex = function (number) {
        if (number < 0) {
            number = 0xFFFFFFFF + number + 1;
        }
        return number.toString(16).toLowerCase();
    };
    // Each field occupies 8 hex digits, so shorter values are
    // left-padded with zeros up to that width.
    var pad8 = function (digits) {
        var missing = 8 - digits.length;
        return missing > 0 ? '00000000'.slice(0, missing) + digits : digits;
    };
    /** @returns {string} timestamp, machine and increment as 8 hex digits each. */
    this.toString = function () {
        return pad8(hex(this.timestamp)) +
            pad8(hex(this.machine)) +
            pad8(hex(this.increment));
    };
};
/**
 * Demo: converts a sample JSON ObjectId to its hex string and back, and
 * writes both results into the #ObjIdStr and #out elements via jQuery.
 */
function testme(){
    var objJson = {"inc":1365419770,"machine":-856505582,"timeSecond":1375343587,"time":1375343587000,"new":false};
    $("#ObjIdStr").html(ObjectId(objJson).toString());
    // Declared locally so the demo does not create a global `obj`.
    var obj = ObjectIdStr("51fa13e3ccf2c3125162a6fa");
    $("#out").html( obj.increment + " " + obj.machine + " " + obj.timestamp);
}

Is it possible to call sub-method in javascript?

As an experiment, I am trying to build the following hierarchy of methods around Number:
Number
|
|
pad()
/\
/ \
/ \
/ \
left() right()
Precisely, I want left() and right() to be the sub-methods of pad() so that I am able to pad the number by making a call something like:
var i = 100;
i.pad().left();
Here's an experimental script that I was working on:
/*
JavaScript to pad zeros onto a number at the left or right.

NOTE: this is the experimental script from the question and does not
work as written:
- left() and right() are only attached to the pad function object when
  pad() itself is invoked, so `i.pad.left` is undefined until then;
- when called as `i.pad.left(9)`, `this` inside left() is the pad
  function (`i.pad`), not the number `i`;
- `p` is assigned without a declaration, so it is not a local variable.
*/
Number.prototype.pad = function()
{
// Attaches left() to the shared pad function object on every pad() call.
Number.prototype.pad.left = function(l)
{
// Missing width means "no padding".
if(typeof l=='undefined')
l = 0;
// String form of the receiver; undeclared, so `p` leaks out of this scope.
p = this+'';
// Number of zeros still needed to reach width l.
var r = l - p.length;
while((r--)>0)
p = '0'+p;
return p;
}
// Same as left(), but appends the zeros instead of prepending them.
Number.prototype.pad.right = function(l)
{
if(typeof l=='undefined')
l = 0;
p = this+'';
var r = l - p.length;
while((r--)>0)
p = p+'0';
return p;
}
// pad() itself returns nothing.
}
i = 646;
// pad is referenced here, not called, so left() has not been attached yet.
padded = i.pad.left(9); /* Possible calling format*/
alert(padded);
How is this possible in Javascript?
Issues fixed:
1) calling pad() instead of just pad
2) Pad() returns object with member left and right.
3) Most importantly, 'this' will not work inside the new functions as they are not part of number. Extracting the number outside the function.
4) p was inadvertently becoming public (window.p). Put a var in front.
See it working
/**
 * Returns an object whose left(l) / right(l) methods zero-pad this number.
 *
 * The numeric value is captured once, because `this` inside the returned
 * methods refers to the returned object rather than to the number.
 *
 * @returns {{left: function(number=): string, right: function(number=): string}}
 *   left(l) prepends zeros and right(l) appends zeros until the decimal
 *   string is at least l characters long; l defaults to 0 (no padding).
 */
Number.prototype.pad = function() {
    var number = this.valueOf();
    // Shared padding loop; `atStart` selects which side receives the zeros.
    var fill = function(l, atStart) {
        if(typeof l=='undefined')
            l = 0;
        var p = number + '';
        var r = l - p.length;
        while((r--) > 0)
            p = atStart ? '0' + p : p + '0';
        return p;
    };
    return {
        left: function(l)
        {
            return fill(l, true);
        },
        right: function(l)
        {
            return fill(l, false);
        }
    };
};
i = 646;
padded = i.pad().left(9); /* Possible calling format*/
alert(padded);
Consider something simpler. You could just pass left and right as parameters to pad. For example:
/**
 * Adds a fixed number of zeros on each side of this number.
 *
 * @param {number} [l] - how many zeros to prepend (falsy means none).
 * @param {number} [r] - how many zeros to append (falsy means none).
 * @returns {string} the zero-padded decimal string.
 */
Number.prototype.pad = function( l,r ) {
    var zerosBefore = l || 0;
    var zerosAfter = r || 0;
    var text = String(this);
    var n;
    for ( n = 0; n < zerosBefore; ++n ) text = '0' + text;
    for ( n = 0; n < zerosAfter; ++n ) text = text + '0';
    return text;
};
console.log( (12).pad(2,0) ); //=> 0012
console.log( (12).pad(0,1) ); //=> 120
console.log( (12).pad(3,2) ); //=> 0001200
Here's a shorter more functional and terse piece of code that works as well:
/**
 * Regex-based variant: wraps the first run of digits of this number with
 * l leading and r trailing zeros.
 *
 * @param {number} l - count of zeros placed before the first digit run.
 * @param {number} [r] - count of zeros placed after it; omitted means none.
 * @returns {string} the padded string.
 */
Number.prototype.pad = function( l,r ) {
    // Joining an array of n + 1 empty slots with '0' yields n zeros.
    var zerosBefore = new Array( ++l ).join('0');
    var zerosAfter = new Array( ++r||0 ).join('0');
    // '$&' re-inserts the matched digits between the two zero runs.
    var replacement = zerosBefore + '$&' + zerosAfter;
    return this.toString().replace( /\d+/, replacement );
};
My version
/**
 * Left-pads the decimal form of this number with zeros.
 *
 * A number that already has `num` or more characters is returned
 * unchanged; digits are never cut off.
 *
 * @param {number} num - minimum length of the result.
 * @returns {string} the decimal string, at least `num` characters long.
 */
Number.prototype.lpad = function( num ) {
    var text = this.toString( 10 );
    var missing = Math.floor( num ) - text.length;
    // Also covers a missing/NaN width: nothing to add.
    if ( !( missing > 0 ) ) {
        return text;
    }
    return new Array( missing + 1 ).join("0") + text;
};
/**
 * Right-pads the decimal form of this number with zeros.
 *
 * A number that already has `num` or more characters is returned
 * unchanged; digits are never cut off.
 *
 * @param {number} num - minimum length of the result.
 * @returns {string} the decimal string, at least `num` characters long.
 */
Number.prototype.rpad = function( num ) {
    var text = this.toString( 10 );
    var missing = Math.floor( num ) - text.length;
    // Also covers a missing/NaN width: nothing to add.
    if ( !( missing > 0 ) ) {
        return text;
    }
    return text + new Array( missing + 1 ).join("0");
};

LogLog and HyperLogLog algorithms for counting of large cardinalities

Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.
Here it is:
/**
 * Draft LogLog cardinality counter (the asker's original attempt).
 *
 * @param {number} max_error - target relative error; the bucket count is
 *   derived from it and rounded up to a power of two.
 * @param {number} max_count - expected maximum cardinality; used only to
 *   derive the bit mask applied to each stored rank.
 * @returns {{count: function}} count(hash) records a 32-bit hash and
 *   returns nothing; count() with no argument returns the estimate.
 */
function LogLog(max_error, max_count)
{
    var toLog2 = function (x) { return Math.log(x) / Math.LN2; };

    var rough = 1.30 / max_error;
    var indexBits = Math.ceil(toLog2(rough * rough));
    var bucketCount = Math.pow(2, indexBits);
    var rankBits = 32 - indexBits;

    var width = toLog2(toLog2(max_count / bucketCount));
    width = isNaN(width) ? 1 : Math.ceil(width);
    var rankMask = ((1 << width) - 1) >>> 0;

    var buckets = [];
    for (var b = 0; b < bucketCount; ++b) buckets[b] = 0;

    // The top indexBits of the hash pick the bucket; the rank is the
    // 1-based position of the lowest set bit among the low rankBits
    // bits, or 0 when none of them is set.
    function record(hash)
    {
        var slot = hash >>> rankBits;
        var position = 0;
        while (position < rankBits && !((hash >>> position) & 1)) ++position;
        var rank = position < rankBits ? position + 1 : 0;
        buckets[slot] = Math.max(buckets[slot], rank & rankMask);
    }

    function estimate()
    {
        var total = 0;
        for (var b = 0; b < bucketCount; ++b) total += buckets[b];
        return 0.79402 * bucketCount * Math.pow(2, total / bucketCount);
    }

    function count(hash)
    {
        if (hash === undefined) return estimate();
        record(hash);
    }
    return {count: count};
}
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of a string.
 *
 * @param {string} text - input string.
 * @returns {number} unsigned 32-bit hash value.
 */
function fnv1a(text)
{
    // 16777619 = 1 + 2 + 16 + 128 + 256 + 2^24, i.e. the FNV prime that
    // the shift-and-add form multiplies by.
    var FNV_PRIME = 16777619;
    var digest = 2166136261;
    var position = 0;
    while (position < text.length)
    {
        digest = Math.imul(digest ^ text.charCodeAt(position), FNV_PRIME);
        ++position;
    }
    return digest >>> 0;
}
var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words
var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());
For some unknown reason the implementation is very sensitive to the max_error parameter; it is the main factor that determines the magnitude of the result. I'm sure there is some stupid mistake :)
UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.
Here it is the updated version of the algorithm based on the newer paper:
var pow_2_32 = 0xFFFFFFFF + 1;
/**
 * HyperLogLog cardinality counter for 32-bit hashes.
 *
 * @param {number} std_error - target standard error; the register count
 *   is derived from it and rounded up to a power of two.
 * @returns {{count: function}} count(hash) records a hash and returns
 *   nothing; count() with no argument returns the cardinality estimate.
 */
function HyperLogLog(std_error)
{
    var toLog2 = function (x) { return Math.log(x) / Math.LN2; };

    // 1 + number of trailing zero bits of hash, capped at max + 1.
    var rankOf = function (hash, max)
    {
        var r;
        for (r = 1; (hash & 1) == 0 && r <= max; ++r) hash >>>= 1;
        return r;
    };

    var rough = 1.04 / std_error;
    var k = Math.ceil(toLog2(rough * rough));
    var k_comp = 32 - k;
    var m = Math.pow(2, k);

    var alpha_m;
    if (m == 16) alpha_m = 0.673;
    else if (m == 32) alpha_m = 0.697;
    else if (m == 64) alpha_m = 0.709;
    else alpha_m = 0.7213 / (1 + 1.079 / m);

    var registers = [];
    for (var r = 0; r < m; ++r) registers[r] = 0;

    // The top k bits of the hash select the register.
    function record(hash)
    {
        var slot = hash >>> k_comp;
        registers[slot] = Math.max(registers[slot], rankOf(hash, k_comp));
    }

    function estimate()
    {
        var harmonic = 0.0;
        for (var r = 0; r < m; ++r) harmonic += 1 / Math.pow(2, registers[r]);
        var E = alpha_m * m * m / harmonic;
        // -- make corrections
        if (E <= 5/2 * m)
        {
            // small range: fall back to counting the empty registers
            var empty = 0;
            for (var z = 0; z < m; ++z) if (registers[z] == 0) ++empty;
            if (empty > 0) E = m * Math.log(m / empty);
            return E;
        }
        if (E > 1/30 * pow_2_32)
        {
            // large range: compensate for 32-bit hash collisions
            E = -pow_2_32 * Math.log(1 - E / pow_2_32);
        }
        return E;
    }

    function count(hash)
    {
        if (hash === undefined) return estimate();
        record(hash);
    }
    return {count: count};
}
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of a string.
 *
 * @param {string} text - input string.
 * @returns {number} unsigned 32-bit hash value.
 */
function fnv1a(text)
{
    // 16777619 = 1 + 2 + 16 + 128 + 256 + 2^24, i.e. the FNV prime that
    // the shift-and-add form multiplies by.
    var FNV_PRIME = 16777619;
    var digest = 2166136261;
    var position = 0;
    while (position < text.length)
    {
        digest = Math.imul(digest ^ text.charCodeAt(position), FNV_PRIME);
        ++position;
    }
    return digest >>> 0;
}
var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words
var seed = Math.floor(Math.random() * pow_2_32); // make more fun
var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
(count - words.length) / (words.length / 100.0) + '%');
Here is a slightly modified version which adds the merge operation.
Merge allows you to take the counters from several instances of HyperLogLog,
and determine the unique counters overall.
For example, if you have unique visitors collected on Monday, Tuesday and Wednesday,
then you can merge the buckets together and count the number of unique visitors
over the three day span:
var pow_2_32 = 0xFFFFFFFF + 1;
/**
 * HyperLogLog cardinality counter for 32-bit hashes, with merge support.
 *
 * @param {number} std_error - target standard error; the register count
 *   is derived from it and rounded up to a power of two.
 * @returns {{count: function, merge: function, buckets: number[]}}
 *   count(hash) records a hash; count() returns the estimate;
 *   merge(other) folds another counter's registers into this one;
 *   buckets is the live register array.
 */
function HyperLogLog(std_error)
{
    function log2(x)
    {
        return Math.log(x) / Math.LN2;
    }
    // 1 + number of trailing zero bits of hash, capped at max + 1.
    function rank(hash, max)
    {
        var r = 1;
        while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
        return r;
    }
    var m = 1.04 / std_error;
    var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
    m = Math.pow(2, k);
    var alpha_m = m == 16 ? 0.673
        : m == 32 ? 0.697
        : m == 64 ? 0.709
        : 0.7213 / (1 + 1.079 / m);
    var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
    /**
     * Takes the register-wise maximum with another counter.
     *
     * @param {{buckets: number[]}} other - a counter created with the
     *   same std_error (same number of registers).
     * @throws {Error} when other has no buckets or a different register
     *   count; merging it would otherwise store NaN in the registers.
     */
    function merge(other)
    {
        if (!other || !other.buckets || other.buckets.length !== m)
            throw new Error('HyperLogLog.merge: bucket count mismatch, expected ' + m);
        for (var i = 0; i < m; i++)
            M[i] = Math.max(M[i], other.buckets[i]);
    }
    function count(hash)
    {
        if (hash !== undefined)
        {
            // The top k bits of the hash select the register.
            var j = hash >>> k_comp;
            M[j] = Math.max(M[j], rank(hash, k_comp));
        }
        else
        {
            var c = 0.0;
            for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
            var E = alpha_m * m * m / c;
            // -- make corrections
            if (E <= 5/2 * m)
            {
                // small range: fall back to counting the empty registers
                var V = 0;
                for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                if (V > 0) E = m * Math.log(m / V);
            }
            else if (E > 1/30 * pow_2_32)
                // large range: compensate for 32-bit hash collisions
                E = -pow_2_32 * Math.log(1 - E / pow_2_32);
            // --
            return E;
        }
    }
    return {count: count, merge: merge, buckets: M};
}
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of a string.
 *
 * @param {string} text - input string.
 * @returns {number} unsigned 32-bit hash value.
 */
function fnv1a(text)
{
    // 16777619 = 1 + 2 + 16 + 128 + 256 + 2^24, i.e. the FNV prime that
    // the shift-and-add form multiplies by.
    var FNV_PRIME = 16777619;
    var digest = 2166136261;
    var position = 0;
    while (position < text.length)
    {
        digest = Math.imul(digest ^ text.charCodeAt(position), FNV_PRIME);
        ++position;
    }
    return digest >>> 0;
}
Then you can do something like this:
// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);
// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));
// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));}
// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);
// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));
We've open sourced a project called Stream-Lib that has a LogLog implementation. The work was based on this paper.
Using the js version #actual provided, I tried to implement the same in C#, which seems close enough. Just changed fnv1a function a little bit and renamed it to getHashCode. (Credit goes to Jenkins hash function, http://en.wikipedia.org/wiki/Jenkins_hash_function)
/// <summary>
/// HyperLogLog cardinality counter over 32-bit hashes of
/// <c>val.ToString()</c>.
/// </summary>
public class HyperLogLog
{
    private double mapSize, alpha_m, k;
    private int kComplement;
    private Dictionary<int, int> Lookup = new Dictionary<int, int>();
    // 2^32: the size of the hash space used by the large-range correction.
    private const double pow_2_32 = 4294967296;

    /// <summary>Creates a counter sized for the given standard error.</summary>
    /// <param name="stdError">Target standard error; the register count is
    /// derived from it and rounded up to a power of two.</param>
    public HyperLogLog(double stdError)
    {
        mapSize = (double)1.04 / stdError;
        k = (long)Math.Ceiling(log2(mapSize * mapSize));
        kComplement = 32 - (int)k;
        mapSize = (long)Math.Pow(2, k);
        alpha_m = mapSize == 16 ? (double)0.673
            : mapSize == 32 ? (double)0.697
            : mapSize == 64 ? (double)0.709
            : (double)0.7213 / (double)(1 + 1.079 / mapSize);
        for (int i = 0; i < mapSize; i++)
            Lookup[i] = 0;
    }

    private static double log2(double x)
    {
        return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
    }

    // 1 + number of trailing zero bits of hash, capped at max + 1.
    private static int getRank(uint hash, int max)
    {
        int r = 1;
        uint one = 1;
        while ((hash & one) == 0 && r <= max)
        {
            ++r;
            hash >>= 1;
        }
        return r;
    }

    /// <summary>
    /// 32-bit string hash modelled on Jenkins' one-at-a-time function
    /// (the finalisation shift amounts are the ones chosen by this port).
    /// </summary>
    public static uint getHashCode(string text)
    {
        uint hash = 0;
        for (int i = 0, l = text.Length; i < l; i++)
        {
            hash += (uint)text[i];
            hash += hash << 10;
            hash ^= hash >> 6;
        }
        hash += hash << 3;
        hash ^= hash >> 6;
        hash += hash << 16;
        return hash;
    }

    /// <summary>Returns the estimated number of distinct values added.</summary>
    public int Count()
    {
        double c = 0, E;
        for (var i = 0; i < mapSize; i++)
            c += 1d / Math.Pow(2, (double)Lookup[i]);
        E = alpha_m * mapSize * mapSize / c;
        // Make corrections & smoothen things.
        // The thresholds are written as floating-point literals: with
        // integer operands, 5 / 2 evaluates to 2 and 1 / 30 to 0.
        if (E <= 2.5 * mapSize)
        {
            // small range: fall back to counting the empty registers
            double V = 0;
            for (var i = 0; i < mapSize; i++)
                if (Lookup[i] == 0) V++;
            if (V > 0)
                E = mapSize * Math.Log(mapSize / V);
        }
        else
            if (E > (1.0 / 30.0) * pow_2_32)
                // large range: compensate for 32-bit hash collisions
                E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
        // Made corrections & smoothen things, or not.
        return (int)E;
    }

    /// <summary>Records one value, identified by the hash of its string form.</summary>
    public void Add(object val)
    {
        uint hashCode = getHashCode(val.ToString());
        // The top k bits of the hash select the register.
        int j = (int)(hashCode >> kComplement);
        Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
    }
}
I know this is an old post but the #buryat implementation has moved, and is in any case incomplete, and a bit on the slow side (sorry o_o ).
I've taken the implementation used by the new Redis release which can be found here and ported it to PHP. The repo is here https://github.com/joegreen0991/HyperLogLog
<?php
/**
 * HyperLogLog cardinality counter; per the accompanying text, a PHP port
 * of the implementation used by Redis.
 *
 * Each added value is hashed, the low $HLL_P bits of the hash select a
 * register, and the register keeps the largest bit-run count seen.
 */
class HyperLogLog {
// Bit mask (register count - 1) used to extract the register index.
private $HLL_P_MASK;
// Number of registers, 2^$HLL_P.
private $HLL_REGISTERS;
// Bias-correction constant used by count().
private $ALPHA;
// SplFixedArray holding one integer per register.
private $registers;
/**
 * @param int $HLL_P number of hash bits used to index a register
 */
public function __construct($HLL_P = 14)
{
$this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */
$this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */
$this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);
$this->registers = new SplFixedArray($this->HLL_REGISTERS);
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = 0;
}
}
/**
 * Records one value.
 *
 * @param string $v value to add
 */
public function add($v)
{
// Hash: crc32 of the hex md5 digest of the value.
$h = crc32(md5($v));
// NOTE(review): relies on 64-bit integers for bit 63 to exist - confirm
// the target platform.
$h |= 1 << 63; /* Make sure the loop terminates. */
$bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
$count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
// Walk upwards from the first non-index bit until a set bit is found.
while(($h & $bit) == 0) {
$count++;
$bit <<= 1;
}
/* Update the register if this element produced a longer run of zeroes. */
$index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */
if ($this->registers[$index] < $count) {
$this->registers[$index] = $count;
}
}
/**
 * Serialises the registers as a string with one character per register.
 *
 * @return string
 */
public function export()
{
$str = '';
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$str .= chr($this->registers[$i]);
}
return $str;
}
/**
 * Replaces the registers with the values from an export() string;
 * positions missing from $str are reset to 0.
 *
 * @param string $str
 */
public function import($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
}
}
/**
 * Merges an export() string into this counter by keeping, per register,
 * the larger of the two values.
 *
 * @param string $str
 */
public function merge($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if(isset($str[$i]))
{
$ord = ord($str[$i]);
if ($this->registers[$i] < $ord) {
$this->registers[$i] = $ord;
}
}
}
}
/**
 * Estimates the number of distinct values added so far.
 *
 * @return float the estimate rounded down with floor()
 */
public function count() {
// $E accumulates the sum of 2^-register; $ez counts empty registers.
$E = 0;
$ez = 0;
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if ($this->registers[$i] !== 0) {
$E += (1.0 / pow(2, $this->registers[$i]));
} else {
$ez++;
$E += 1.0;
}
}
// Raw estimate: ALPHA * m^2 / sum.
$E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;
/* Use the LINEARCOUNTING algorithm for small cardinalities.
* For larger values but up to 72000 HyperLogLog raw approximation is
* used since linear counting error starts to increase. However HyperLogLog
* shows a strong bias in the range 2.5*16384 - 72000, so we try to
* compensate for it. */
if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
$E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
}
else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
// We did polynomial regression of the bias for this range, this
// way we can compute the bias for a given cardinality and correct
// according to it. Only apply the correction for P=14 that's what
// we use and the value the correction was verified with.
$bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
-1.4253 * 1.0e-12 * ($E*$E*$E)+
1.2940 * 1.0e-7 * ($E*$E)
-5.2921 * 1.0e-3 * $E+
83.3216;
// $bias is a percentage of the raw estimate.
$E -= $E * ($bias/100);
}
return floor($E);
}
}
I implemented loglog and hyperloglog in JS and PHP and well-commented code https://github.com/buryat/loglog

Categories

Resources