LogLog and HyperLogLog algorithms for counting of large cardinalities - javascript

Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.
Here it is:
function LogLog(max_error, max_count)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
var m = 1.30 / max_error;
var k = Math.ceil(log2(m * m));
m = Math.pow(2, k);
var k_comp = 32 - k;
var l = log2(log2(max_count / m));
if (isNaN(l)) l = 1; else l = Math.ceil(l);
var l_mask = ((1 << l) - 1) >>> 0;
var M = [];
for (var i = 0; i < m; ++i) M[i] = 0;
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
var rank = 0;
for (var i = 0; i < k_comp; ++i)
{
if ((hash >>> i) & 1)
{
rank = i + 1;
break;
}
}
M[j] = Math.max(M[j], rank & l_mask);
}
else
{
var c = 0;
for (var i = 0; i < m; ++i) c += M[i];
return 0.79402 * m * Math.pow(2, c / m);
}
}
return {count: count};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words
var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());
For unknown reason implementation is very sensitive to max_error parameter, it is the main factor that determines the magnitude of the result. I'm sure, there is some stupid mistake :)
UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.

Here it is the updated version of the algorithm based on the newer paper:
var pow_2_32 = 0xFFFFFFFF + 1;
function HyperLogLog(std_error)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}
var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
m = Math.pow(2, k);
var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);
var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;
// -- make corrections
if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);
// --
return E;
}
}
return {count: count};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words
var seed = Math.floor(Math.random() * pow_2_32); // make more fun
var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
(count - words.length) / (words.length / 100.0) + '%');

Here is a slightly modified version which adds the merge operation.
Merge allows you to take the counters from several instances of HyperLogLog,
and determine the unique counters overall.
For example, if you have unique visitors collected on Monday, Tuesday and Wednesday,
then you can merge the buckets together and count the number of unique visitors
over the three day span:
var pow_2_32 = 0xFFFFFFFF + 1;
function HyperLogLog(std_error)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}
function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}
var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
m = Math.pow(2, k);
var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);
var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
function merge(other)
{
for (var i = 0; i < m; i++)
M[i] = Math.max(M[i], other.buckets[i]);
}
function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;
// -- make corrections
if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);
// --
return E;
}
}
return {count: count, merge: merge, buckets: M};
}
function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
Then you can do something like this:
// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);
// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));
// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));}
// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);
// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));

We've open sourced a project called Stream-Lib that has a LogLog implementation. The work was based on this paper.

Using the js version #actual provided, I tried to implement the same in C#, which seems close enough. Just changed fnv1a function a little bit and renamed it to getHashCode. (Credit goes to Jenkins hash function, http://en.wikipedia.org/wiki/Jenkins_hash_function)
public class HyperLogLog
{
private double mapSize, alpha_m, k;
private int kComplement;
private Dictionary<int, int> Lookup = new Dictionary<int, int>();
private const double pow_2_32 = 4294967297;
public HyperLogLog(double stdError)
{
mapSize = (double)1.04 / stdError;
k = (long)Math.Ceiling(log2(mapSize * mapSize));
kComplement = 32 - (int)k;
mapSize = (long)Math.Pow(2, k);
alpha_m = mapSize == 16 ? (double)0.673
: mapSize == 32 ? (double)0.697
: mapSize == 64 ? (double)0.709
: (double)0.7213 / (double)(1 + 1.079 / mapSize);
for (int i = 0; i < mapSize; i++)
Lookup[i] = 0;
}
private static double log2(double x)
{
return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
}
private static int getRank(uint hash, int max)
{
int r = 1;
uint one = 1;
while ((hash & one) == 0 && r <= max)
{
++r;
hash >>= 1;
}
return r;
}
public static uint getHashCode(string text)
{
uint hash = 0;
for (int i = 0, l = text.Length; i < l; i++)
{
hash += (uint)text[i];
hash += hash << 10;
hash ^= hash >> 6;
}
hash += hash << 3;
hash ^= hash >> 6;
hash += hash << 16;
return hash;
}
public int Count()
{
double c = 0, E;
for (var i = 0; i < mapSize; i++)
c += 1d / Math.Pow(2, (double)Lookup[i]);
E = alpha_m * mapSize * mapSize / c;
// Make corrections & smoothen things.
if (E <= (5 / 2) * mapSize)
{
double V = 0;
for (var i = 0; i < mapSize; i++)
if (Lookup[i] == 0) V++;
if (V > 0)
E = mapSize * Math.Log(mapSize / V);
}
else
if (E > (1 / 30) * pow_2_32)
E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
// Made corrections & smoothen things, or not.
return (int)E;
}
public void Add(object val)
{
uint hashCode = getHashCode(val.ToString());
int j = (int)(hashCode >> kComplement);
Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
}
}

I know this is an old post but the #buryat implementation has moved, and is in any case incomplete, and a bit on the slow side (sorry o_o ).
I've taken the implementation used by the new Redis release which can be found here and ported it to PHP. The repo is here https://github.com/joegreen0991/HyperLogLog
<?php
class HyperLogLog {
private $HLL_P_MASK;
private $HLL_REGISTERS;
private $ALPHA;
private $registers;
public function __construct($HLL_P = 14)
{
$this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */
$this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */
$this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);
$this->registers = new SplFixedArray($this->HLL_REGISTERS);
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = 0;
}
}
public function add($v)
{
$h = crc32(md5($v));
$h |= 1 << 63; /* Make sure the loop terminates. */
$bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
$count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
while(($h & $bit) == 0) {
$count++;
$bit <<= 1;
}
/* Update the register if this element produced a longer run of zeroes. */
$index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */
if ($this->registers[$index] < $count) {
$this->registers[$index] = $count;
}
}
public function export()
{
$str = '';
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$str .= chr($this->registers[$i]);
}
return $str;
}
public function import($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
$this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
}
}
public function merge($str)
{
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if(isset($str[$i]))
{
$ord = ord($str[$i]);
if ($this->registers[$i] < $ord) {
$this->registers[$i] = $ord;
}
}
}
}
/**
* #static
* #param $arr
* #return int Number of unique items in $arr
*/
public function count() {
$E = 0;
$ez = 0;
for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
if ($this->registers[$i] !== 0) {
$E += (1.0 / pow(2, $this->registers[$i]));
} else {
$ez++;
$E += 1.0;
}
}
$E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;
/* Use the LINEARCOUNTING algorithm for small cardinalities.
* For larger values but up to 72000 HyperLogLog raw approximation is
* used since linear counting error starts to increase. However HyperLogLog
* shows a strong bias in the range 2.5*16384 - 72000, so we try to
* compensate for it. */
if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
$E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
}
else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
// We did polynomial regression of the bias for this range, this
// way we can compute the bias for a given cardinality and correct
// according to it. Only apply the correction for P=14 that's what
// we use and the value the correction was verified with.
$bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
-1.4253 * 1.0e-12 * ($E*$E*$E)+
1.2940 * 1.0e-7 * ($E*$E)
-5.2921 * 1.0e-3 * $E+
83.3216;
$E -= $E * ($bias/100);
}
return floor($E);
}
}

I implemented loglog and hyperloglog in JS and PHP and well-commented code https://github.com/buryat/loglog

Related

using Damerau-Levenshtein distance to compare sets of text in code.org

Not very knowledgeable with coding, I usually use block coding and not typing.
I've used many different Levenshtein distance codes I've found online and most of them didn't work for one reason or another
var levDist = function (s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
};
This is all the code I’ve written (including the most recent attempt of getting the levenshtein distance)
var levDist = function (s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
};
var S = "Hello World";
var grossWPM;
var Transparency = 1;
var Timer = 60;
var InitialTime = Timer;
var Texts = getColumn("Texts", "Texts");
var TextLength = getColumn("Texts", "Number of Characters");
var Title = getColumn("Texts", "Titles");
var Author = getColumn("Texts", "Authors");
var TextSelector = randomNumber(0, 19);
console.log("Article #" + (TextSelector + 1));
console.log(TextLength[TextSelector] + " Characters in total");
console.log(Title[TextSelector]);
console.log("By: " + Author[TextSelector]);
var Countdown;
var Countdown = 6;
//Texts are obtained from
//https://data.typeracer.com/pit/texts
onEvent("button1", "click", function( ) {
timedLoop(1000, function() {
Countdown = Countdown - 1;
setText("button1", Countdown - 0);
timedLoop(100, function() {
setText("text_area2", "");
});
if (Countdown <= 1) {
stopTimedLoop();
setTimeout(function() {
setText("button1", "GO!");
setText("text_area1", Texts[TextSelector]);
if (getText("button1") == "GO!") {
var TransparentLoop = timedLoop(100, function() {
Transparency = Transparency - 0.1;
setProperty("Warning", "text-color", rgb(77,87,95, Transparency));
if (Transparency <= 0) {
deleteElement("Warning");
showElement("label2");
stopTimedLoop(TransparentLoop);
}
});
var TimerLoop = timedLoop(1000, function() {
Timer = Timer - 1;
setText("label2", Timer);
if (Timer <= 0) {
grossWPM = (TextLength[TextSelector] / 5) / ((InitialTime - Timer) / 60);
console.log(grossWPM);
setScreen("screen2");
if (Timer == 1) {
S = " second";
} else {
S = " seconds";
}
setText("label1", "Your typing speed was approximately " + (Math.round(grossWPM) + (" WPM* with " + (Timer + (S + " left")))));
stopTimedLoop(TimerLoop);
}
});
console.log("Timer Started");
timedLoop(10, function() {
var str = getText("text_area2");
if (str.length == TextLength[TextSelector]) {
stopTimedLoop(TimerLoop);
grossWPM = (TextLength[TextSelector] / 5) / ((InitialTime - Timer) / 60);
setScreen("screen2");
levDist(str, Texts[TextSelector]);
if (Timer == 1) {
S = " second";
} else {
S = " seconds";
}
setText("label1", "Your typing speed was approximately " + (Math.round(grossWPM) + (" WPM* with " + (Timer + (S + " left")))));
if (grossWPM == 69) {
setText("label4", "Nice");
}
stopTimedLoop();
}
});
}
}, 1000);
}
});
});
Obviously not that good at this so can anyone help?
I want to compare two sets of text
Something the user types in.
Paragraph that the user was supposed to type.
This is for a WPM test and I want a way to get a measurement for WPM that includes errors the user makes while typing.
If there is a way to check this besides the Levenshtein distance please tell me, I just looked up a way to do that and Levenshtein distance seemed like the way to do so
The error given by code.org says:
ERROR: Line: 50: TypeError: d[n] is undefined
I fixed the issue, I used this code
function levenshtein(s1, s2) {
if (s1 == s2) {
return 0;
}
var s1_len = s1.length;
var s2_len = s2.length;
if (s1_len === 0) {
return s2_len;
}
if (s2_len === 0) {
return s1_len;
}
// BEGIN STATIC
var split = false;
try {
split = !('0')[0];
} catch (e) {
// Earlier IE may not support access by string index
split = true;
}
// END STATIC
if (split) {
s1 = s1.split('');
s2 = s2.split('');
}
var v0 = new Array(s1_len + 1);
var v1 = new Array(s1_len + 1);
var s1_idx = 0,
s2_idx = 0,
cost = 0;
for (s1_idx = 0; s1_idx < s1_len + 1; s1_idx++) {
v0[s1_idx] = s1_idx;
}
var char_s1 = '',
char_s2 = '';
for (s2_idx = 1; s2_idx <= s2_len; s2_idx++) {
v1[0] = s2_idx;
char_s2 = s2[s2_idx - 1];
for (s1_idx = 0; s1_idx < s1_len; s1_idx++) {
char_s1 = s1[s1_idx];
cost = (char_s1 == char_s2) ? 0 : 1;
var m_min = v0[s1_idx + 1] + 1;
var b = v1[s1_idx] + 1;
var c = v0[s1_idx] + cost;
if (b < m_min) {
m_min = b;
}
if (c < m_min) {
m_min = c;
}
v1[s1_idx + 1] = m_min;
}
var v_tmp = v0;
v0 = v1;
v1 = v_tmp;
}
return v0[s1_len];
}
and I got that code from this question
This is levenshtein distance NOT damerau-levenshtein distance

Javascript Memoization in Browser Not Seeing Speed Up

I am attempting to memoize a function in javascript, to be run in browser, client side. Writing this function in R (the language I am most comfortable using). In R, I see significant benefits from using memoization (4 minutes run time to 0.02 seconds for P_n(7/10, 20, 15, 6, 1) ). When I rewrite the function in javascript, I see almost no benefit. What is the problem with my javascript code (or am I going about this the wrong way entirely)?
Below are the memoized functions in R and javascript respectively. The R function (first of the two) runs very fast compared to the naive recursion, while javascript essentially sees no difference. Some amount of memoization is happening, however, because if I run the exact same function call twice, i.e. P_memo(7/10, 20, 15, 6, 1) and then P_memo(7/10, 20, 15, 6, 1) again, the second call takes 0 time. The first call should be dramatically quicker due to re-use of intermediate calls in the recursion.
P_n <- (function() {
# Memoization handled through the use of cache
cache <- NULL
cache_reset <- function() {
cache <<- new.env(TRUE, emptyenv())
}
cache_set <- function(key, value) {
assign(key, value, envir = cache)
}
cache_get <- function(key) {
get(key, envir = cache, inherits = FALSE)
}
cache_has_key <- function(key) {
exists(key, envir = cache, inherits = FALSE)
}
# Initialize the cache
cache_reset()
# This is the function that gets returned by the anonymous function and
# becomes P_n
function(rho, n, i, k, s) {
nc <- paste(rho, n, i, k, s)
# Handle "vectors" by element
if(length(n) > 1){
return(lapply(n, function(n) sapply(n, P_n, rho = rho, i = 1:(n+k), k = k, s = s)))
}
if (length(i) > 1) {
return(sapply(i, P_n, rho = rho, n = n, k = k, s = s))
}
# Cached cases
if (cache_has_key(nc))
return(cache_get(nc))
# Everything else
#proposition 1
if(i == (n+k)){
#print('Proposition 1')
if(k >= s){
return((rho / (rho + 1))^n)
}else if( (k+n) <= s){
product_iter = 1
for(j in 1:n){
product_iter = product_iter * ( rho + (k + j - 1)/s )
}
out = rho^n / product_iter
cache_set(nc, out)
return(out)
}else if( k < s & s < (k + n)){
product_iter2 = 1
for(j in 1:(s-k)){
product_iter2 = product_iter2 * ( rho + (k + j - 1)/s )
}
product_denom = ((rho + 1)^(n-s+k)) * product_iter2
out = rho^n / product_denom
cache_set(nc, out)
return(out)
}
}
#proposition 2
else if(k == 0 & n == i){
#print('Proposition 2')
if(n <= s){
product_iter11 = 1
for(j in 1:n){
product_iter11 = product_iter11 * (rho + (j - 1)/s)
}
return(rho^n / product_iter11)
}else if(n > s){
product_iter12 = 1
for(j in 1:s){
product_iter12 = product_iter12 * ( rho + (j - 1)/s )
}
product_denom12 = ((rho + 1)^(n-s)) * product_iter12
out = rho^n / product_denom12
cache_set(nc, out)
return(out)
}
}
#if i = 1
else if(i == 1){
upsum = 0
for(j in 2:(n + k)){
upsum = upsum + P_n(rho, n, j, k, s)
}
out = 1 - upsum
cache_set(nc, out)
return(out)
}
#proposition 3
else if(n == 1 & 2 <= i & i <= k){
#print('Proposition 3')
if(k <= s){
begin = rho / (rho + (i - 1)/s)
product_iter13 = 1
for(j in 1:(k-i+1)){
product_iter13 = product_iter13 * (1 - rho / (rho + (k - j + 1)/s) )
}
out = begin * product_iter13
cache_set(nc, out)
return(out)
}else if(k > s & i > s){
out = rho / (rho+1)^(k-i+2)
cache_set(nc, out)
return(out)
}else if(i <= s & s <= k){
begin2 = rho / (( rho + 1 )^(k - s + 1) * ( rho + (i - 1)/s))
product_iter14 = 1
for(j in 1:(s-i)){
product_iter14 = product_iter14 * (1 - rho / (rho + (s - j)/s) )
}
out = begin2 * product_iter14
cache_set(nc, out)
return(out)
}
}
#proposition 4
else if( n >= 2 & 2 <= i & i <= (k + n - 1)){
#print('Proposition 4')
if(i>s){
begin11 = rho/(rho+1)
product_iter21 = 0
for(j in (i-1):(k+n-1)){
product_iter21 = product_iter21 + (1 / (rho+1))^(j-i+1) * P_n(rho, n-1, j, k, s)
}
out = begin11 * product_iter21
cache_set(nc, out)
return(out)
}else if(i <= s){
begin12 = rho / (rho + (i-1)/s)
summer1 = 0
for(j in (i-1):(s-1)){
product_iter22 = 1
for(h in 1:(j-i+1)){
product_iter22 = product_iter22 * (1 - rho / (rho + (j - h + 1) / s))
}
summer1 = summer1 + product_iter22 * P_n(rho, n-1, j, k, s)
}
product_iter23 = 1
for(h in 1:(s-i)){
product_iter23 = product_iter23 * (1 - rho / (rho + (s-h) / s))
}
summer2 = 0
for(j in s:(k+n-1)){
summer2 = summer2 + ((1 / (rho + 1))^(j-s+1) * P_n(rho, n-1, j, k, s))
}
bottom = product_iter23 * summer2
inner = summer1 + bottom
out = begin12 * inner
cache_set(nc, out)
return(out)
}
}
#check if missed all propositions
else{
stop("No proposition detected")
}
}
})()
var P_memo = (function() {
var memo = {};
var slice = Array.prototype.slice;
function f(rho, n, i, k, s){
var args = slice.call(arguments);
var value;
if (args in memo) {
return(memo[args]);
}else{
// NOTES ON THE UNITS OF INPUTS
//rho: ratio of lambda / tau
// n: arrival of nth customer
// i: are i customers in queue
// k : number of customers at t = 0
// s: machines in use
//proposition 1
if(i == (n+k)){
//print('Proposition 1')
if(k >= s){
return(Math.pow(rho / (rho + 1), n));
}else if( (k+n) <= s){
var product_iter = 1;
for(var j=1; j<= n; j++){
product_iter = product_iter * ( rho + (k + j - 1)/s );
}
return(Math.pow(rho, n) / product_iter);
}else if( k < s && s < (k + n)){
var product_iter2 = 1;
for(var j=1; j<= s-k; j++){
product_iter2 = product_iter2 * ( rho + (k + j - 1)/s );
}
product_denom = Math.pow((rho + 1), (n-s+k)) * product_iter2;
return(Math.pow(rho, n) / product_denom);
}
}
//proposition 2
else if(k == 0 && n == i){
if(n <= s){
var product_iter11 = 1;
for(var j=1; j<= n; j++){
product_iter11 = product_iter11 * (rho + (j - 1)/s);
}
return(Math.pow(rho, n) / product_iter11);
}else if(n > s){
var product_iter12 = 1;
for(var j=1; j<= s; j++){
product_iter12 = product_iter12 * ( rho + (j - 1)/s );
}
product_denom12 = Math.pow((rho + 1), (n-s)) * product_iter12;
return(Math.pow(rho, n) / product_denom12);
}
}
//if i = 1
else if(i == 1){
var upsum = 0;
for(var j=2; j<= (n+k); j++){
upsum = upsum + f(rho, n, j, k, s);
}
return(1 - upsum);
}
//proposition 3
else if(n == 1 && 2 <= i && i <= k){
if(k <= s){
begin = rho / (rho + (i - 1)/s);
var product_iter13 = 1;
for(var j=1; j<= (k-i+1); j++){
product_iter13 = product_iter13 * (1 - rho / (rho + (k - j + 1)/s) );
}
return(begin * product_iter13);
}else if(k > s && i > s){
return(rho / Math.pow((rho+1), (k-i+2)));
}else if(i <= s && s <= k){
begin2 = rho / (Math.pow( (rho + 1), (k - s + 1)) * ( rho + (i - 1)/s));
var product_iter14 = 1;
for(var j=1; j<= (s-i); j++){
product_iter14 = product_iter14 * (1 - rho / (rho + (s - j)/s) );
}
return(begin2 * product_iter14);
}
}
//proposition 4
else if( n >= 2 && 2 <= i && i <= (k + n - 1)){
if(i>s){
begin11 = rho/(rho+1);
var product_iter21 = 0;
for(var j=(i-1); j<= (k+n-1); j++){
product_iter21 = product_iter21 + Math.pow((1 / (rho+1)),(j-i+1)) * f(rho, n-1, j, k, s);
}
return(begin11 * product_iter21);
}else if(i <= s){
begin12 = rho / (rho + (i-1)/s);
var summer1 = 0;
for(var j=(i-1); j<= (s-1); j++){
var product_iter22 = 1;
for(var h=1; h<=(j-1+1); h++){
product_iter22 = product_iter22 * (1 - rho / (rho + (j - h + 1) / s));
}
summer1 = summer1 + product_iter22 * f(rho, n-1, j, k, s);
}
var product_iter23 = 1;
for(var h=1; h<=(s-i); h++){
product_iter23 = product_iter23 * (1 - rho / (rho + (s-h) / s));
}
var summer2 = 0;
for(var j=s; j<= (k+n-1); j++){
summer2 = summer2 + (Math.pow((1 / (rho + 1)), (j-s+1)) * f(rho, n-1, j, k, s)) ;
}
bottom = product_iter23 * summer2;
inner = summer1 + bottom;
return(begin12 * inner);
}
}
}
}
//Closure of f(), self-executing anonymous function
return f;
})();
Your memoization has two flaws:
(1) You never add results to memo.
(2) args in memo casts args to a string. That will work for an array of numbers, but it might fail for other inputs.
I'd write a generic version of memo like this:
const memo = fn => {
const cache = {};
return (...args) => {
const key = JSON.stringify(args);
if(key in memo) return memo[key];
return memo[key] = fn(...args);
};
};
const memoizedF = memo(f);

Python and Javascript Pseudo Random Number Generator PRNG

I am looking for a way to generate the same sequence of pseudo random integer numbers from both python and javascript.
When I seed in python like this I get the below results:
random.seed(3909461935)
random.randint(0, 2147483647) = 162048056
random.randint(0, 2147483647) = 489743869
random.randint(0, 2147483647) = 1561110296
I need the same sequence in javascript.
Note: I used 2147483647 as the range in the randint method because I am assuming javascript can only handle 32 bit INTs.
Are there any libraries on both sides I can use to generate the same set of pseudo random numbers given the same seed?
I have found two implementations of Mersenne Twister that generate the same 32 bit integer values given the same seed.
This way you can generate a server side sequence in Python, and have the browser independently generate the same sequence in javascript.
Python:
from mt_random import *
r = mersenne_rng(seed = 12345)
r.get_random_number() # Prints 3992670690
r.get_random_number() # Prints 3823185381
r.get_random_number() # Prints 1358822685
Javascript:
r = new MersenneTwister();
r.init_genrand(12345);
r = mersenne_rng(seed = 12345)
r.genrand_int32(); # Prints 3992670690
r.genrand_int32(); # Prints 3823185381
r.genrand_int32(); # Prints 1358822685
The JS is here:
/*
* 疑似乱数生成機 移植
*
* Mersenne Twister with improved initialization (2002)
* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/mt.html
* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/mt19937ar.html
*/
// = 移植元ラインセンス =======================================================
// ======================================================================
/*
A C-program for MT19937, with initialization improved 2002/2/10.
Coded by Takuji Nishimura and Makoto Matsumoto.
This is a faster version by taking Shawn Cokus's optimization,
Matthe Bellew's simplification, Isaku Wada's real version.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or another materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat # math.sci.hiroshima-u.ac.jp (remove space)
*/
// ======================================================================
function MersenneTwister() {
// 整数を扱うクラス
function Int32(value) {
var bits = new Array(0, 0, 0, 0);
var i;
var v = value;
if (v != 0) {
for (i = 0; i < 4; ++i) {
bits[i] = v & 0xff;
v = v >> 8;
}
}
this.getValue = function () {
return (bits[0] | (bits[1] << 8) | (bits[2] << 16)) + ((bits[3] << 16) * 0x100);
};
this.getBits = function (i) { return bits[i & 3]; };
this.setBits = function (i, val) { return (bits[i & 3] = val & 0xff); };
this.add = function (another) {
var tmp = new Int32(0);
var i, fl = 0, b;
for (i = 0; i < 4; ++i) {
b = bits[i] + another.getBits(i) + fl;
tmp.setBits(i, b);
fl = b >> 8;
}
return tmp;
};
this.sub = function (another) {
var tmp = new Int32(0);
var bb = new Array(0, 0, 0, 0);
var i;
for (i = 0; i < 4; ++i) {
bb[i] = bits[i] - another.getBits(i);
if ((i > 0) && (bb[i - 1] < 0)) {
--bb[i];
}
}
for (i = 0; i < 4; ++i) {
tmp.setBits(i, bb[i]);
}
return tmp;
};
this.mul = function (another) {
var tmp = new Int32(0);
var bb = new Array(0, 0, 0, 0, 0);
var i, j;
for (i = 0; i < 4; ++i) {
for (j = 0; i + j < 4; ++j) {
bb[i + j] += bits[i] * another.getBits(j);
}
tmp.setBits(i, bb[i]);
bb[i + 1] += bb[i] >> 8;
}
return tmp;
};
this.and = function (another) {
var tmp = new Int32(0);
var i;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, bits[i] & another.getBits(i));
}
return tmp;
};
this.or = function (another) {
var tmp = new Int32(0);
var i;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, bits[i] | another.getBits(i));
}
return tmp;
};
this.xor = function (another) {
var tmp = new Int32(0);
var i;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, bits[i] ^ another.getBits(i));
}
return tmp;
};
this.rshifta = function (s) {
var tmp = new Int32(0);
var bb = new Array(0, 0, 0, 0, 0);
var p = s >> 3;
var i, sg = 0;
if ((bits[3] & 0x80) > 0) {
bb[4] = sg = 0xff;
}
for (i = 0; i + p < 4; ++i) {
bb[i] = bits[i + p];
}
for (; i < 4; ++i) {
bb[i] = sg;
}
p = s & 0x7;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, ((bb[i] | (bb[i + 1] << 8)) >> p) & 0xff);
}
return tmp;
};
this.rshiftl = function (s) {
var tmp = new Int32(0);
var bb = new Array(0, 0, 0, 0, 0);
var p = s >> 3;
var i;
for (i = 0; i + p < 4; ++i) {
bb[i] = bits[i + p];
}
p = s & 0x7;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, ((bb[i] | (bb[i + 1] << 8)) >> p) & 0xff);
}
return tmp;
};
this.lshift = function (s) {
var tmp = new Int32(0);
var bb = new Array(0, 0, 0, 0, 0);
var p = s >> 3;
var i;
for (i = 0; i + p < 4; ++i) {
bb[i + p + 1] = bits[i];
}
p = s & 0x7;
for (i = 0; i < 4; ++i) {
tmp.setBits(i, (((bb[i] | (bb[i + 1] << 8)) << p) >> 8) & 0xff);
}
return tmp;
};
this.equals = function (another) {
var i;
for (i = 0; i < 4; ++i) {
if (bits[i] != another.getBits(i)) {
return false;
}
}
return true;
};
this.compare = function (another) {
var i;
for (i = 3; i >= 0; --i) {
if (bits[i] > another.getBits(i)) {
return 1;
} else if (bits[i] < another.getBits(i)) {
return -1;
}
}
return 0;
};
}
// End of Int32
/* Period parameters */
var N = 624;
var M = 397;
var MATRIX_A = new Int32(0x9908b0df); /* constant vector a */
var UMASK = new Int32(0x80000000); /* most significant w-r bits */
var LMASK = new Int32(0x7fffffff); /* least significant r bits */
var INT32_ZERO = new Int32(0);
var INT32_ONE = new Int32(1);
var MIXBITS = function (u, v) {
return (u.and(UMASK)).or(v.and(LMASK));
};
var TWIST = function (u, v) {
return ((MIXBITS(u, v).rshiftl(1)).xor((v.and(INT32_ONE)).equals(INT32_ZERO) ? INT32_ZERO : MATRIX_A));
};
var state = new Array(); /* the array for the state vector */
var left = 1;
var initf = 0;
var next = 0;
var i;
for (i = 0; i < N; ++i) {
state[i] = INT32_ZERO;
}
/* initializes state[N] with a seed */
var _init_genrand = function (s) {
var lt1812433253 = new Int32(1812433253);
var j;
state[0]= new Int32(s);
for (j = 1; j < N; ++j) {
state[j] = ((lt1812433253.mul(state[j - 1].xor(state[j - 1].rshiftl(30)))).add(new Int32(j)));
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
/* In the previous versions, MSBs of the seed affect */
/* only MSBs of the array state[]. */
/* 2002/01/09 modified by Makoto Matsumoto */
//state[j] &= 0xffffffff; /* for >32 bit machines */
}
left = 1; initf = 1;
};
this.init_genrand = _init_genrand;
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
/* slight change for C++, 2004/2/26 */
this.init_by_array = function (init_key, key_length) {
var lt1664525 = new Int32(1664525);
var lt1566083941 = new Int32(1566083941);
var i, j, k;
_init_genrand(19650218);
i = 1; j = 0;
k = (N > key_length ? N : key_length);
for (; k; --k) {
state[i] = ((state[i].xor((state[i - 1].xor(state[i - 1].rshiftl(30))).mul(lt1664525))).add(
new Int32(init_key[j]))).add(new Int32(j)); /* non linear */
//state[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
i++; j++;
if (i >= N) {
state[0] = state[N - 1];
i = 1;
}
if (j >= key_length) {
j = 0;
}
}
for (k = N - 1; k; --k) {
state[i] = (state[i].xor((state[i-1].xor(state[i - 1].rshiftl(30))).mul(lt1566083941))).sub(
new Int32(i)); /* non linear */
//state[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
i++;
if (i >= N) {
state[0] = state[N - 1];
i = 1;
}
}
state[0] = new Int32(0x80000000); /* MSB is 1; assuring non-zero initial array */
left = 1; initf = 1;
};
var next_state = function () {
var p = 0;
var j;
/* if init_genrand() has not been called, */
/* a default initial seed is used */
if (initf == 0) {
_init_genrand(5489);
}
left = N;
next = 0;
for (j = N - M + 1; --j; ++p) {
state[p] = state[p + M].xor(TWIST(state[p], state[p + 1]));
}
for (j = M; --j; ++p) {
state[p] = state[p + M - N].xor(TWIST(state[p], state[p + 1]));
}
state[p] = state[p + M - N].xor(TWIST(state[p], state[0]));
};
var lt0x9d2c5680 = new Int32(0x9d2c5680);
var lt0xefc60000 = new Int32(0xefc60000);
/* generates a random number on [0,0xffffffff]-interval */
var _genrand_int32 = function () {
var y;
if (--left == 0) {
next_state();
}
y = state[next];
++next;
/* Tempering */
y = y.xor(y.rshiftl(11));
y = y.xor((y.lshift(7)).and(lt0x9d2c5680));
y = y.xor((y.lshift(15)).and(lt0xefc60000));
y = y.xor(y.rshiftl(18));
return y.getValue();
};
this.genrand_int32 = _genrand_int32;
/* generates a random number on [0,0x7fffffff]-interval */
this.genrand_int31 = function () {
var y;
if (--left == 0) {
next_state();
}
y = state[next];
++next;
/* Tempering */
y = y.xor(y.rshiftl(11));
y = y.xor((y.lshift(7)).and(lt0x9d2c5680));
y = y.xor((y.lshift(15)).and(lt0xefc60000));
y = y.xor(y.rshiftl(18));
return (y.rshiftl(1)).getValue();
};
/* generates a random number on [0,1]-real-interval */
this.genrand_real1 = function () {
var y;
if (--left == 0) {
next_state();
}
y = state[next];
++next;
/* Tempering */
y = y.xor(y.rshiftl(11));
y = y.xor((y.lshift(7)).and(lt0x9d2c5680));
y = y.xor((y.lshift(15)).and(lt0xefc60000));
y = y.xor(y.rshiftl(18));
return y.getValue() * (1.0/4294967295.0);
/* divided by 2^32-1 */
};
/* generates a random number on [0,1)-real-interval */
this.genrand_real2 = function () {
var y;
if (--left == 0) {
next_state();
}
y = state[next];
++next;
/* Tempering */
y = y.xor(y.rshiftl(11));
y = y.xor((y.lshift(7)).and(lt0x9d2c5680));
y = y.xor((y.lshift(15)).and(lt0xefc60000));
y = y.xor(y.rshiftl(18));
return y.getValue() * (1.0 / 4294967296.0);
/* divided by 2^32 */
};
/* generates a random number on (0,1)-real-interval */
this.genrand_real3 = function () {
var y;
if (--left == 0) {
next_state();
}
y = state[next];
++next;
/* Tempering */
y = y.xor(y.rshiftl(11));
y = y.xor((y.lshift(7)).and(lt0x9d2c5680));
y = y.xor((y.lshift(15)).and(lt0xefc60000));
y = y.xor(y.rshiftl(18));
return (y.getValue() + 0.5) * (1.0 / 4294967296.0);
/* divided by 2^32 */
};
/* generates a random number on [0,1) with 53-bit resolution*/
this.genrand_res53 = function () {
var a = ((new Int32(_genrand_int32())).rshiftl(5)).getValue();
var b = ((new Int32(_genrand_int32())).rshiftl(6)).getValue();
return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
};
/* These real versions are due to Isaku Wada, 2002/01/09 added */
}
The corresponding Python implementation is here:
class mersenne_rng(object):
def __init__(self, seed = 5489):
self.state = [0]*624
self.f = 1812433253
self.m = 397
self.u = 11
self.s = 7
self.b = 0x9D2C5680
self.t = 15
self.c = 0xEFC60000
self.l = 18
self.index = 624
self.lower_mask = (1<<31)-1
self.upper_mask = 1<<31
# update state
self.state[0] = seed
for i in range(1,624):
self.state[i] = self.int_32(self.f*(self.state[i-1]^(self.state[i-1]>>30)) + i)
def twist(self):
for i in range(624):
temp = self.int_32((self.state[i]&self.upper_mask)+(self.state[(i+1)%624]&self.lower_mask))
temp_shift = temp>>1
if temp%2 != 0:
temp_shift = temp_shift^0x9908b0df
self.state[i] = self.state[(i+self.m)%624]^temp_shift
self.index = 0
def get_random_number(self):
if self.index >= 624:
self.twist()
y = self.state[self.index]
y = y^(y>>self.u)
y = y^((y<<self.s)&self.b)
y = y^((y<<self.t)&self.c)
y = y^(y>>self.l)
self.index+=1
return self.int_32(y)
def int_32(self, number):
return int(0xFFFFFFFF & number)
if __name__ == "__main__":
rng = mersenne_rng(1131464071)
for i in range(10):
print rng.get_random_number()

Speeding up Levenshtein distance calculation in Ionic app

What I'm doing: I'm developing a mobile dictionary app for a number of languages
How I'm doing it: Using ionic framework with combination of some angular and some pure js (imported from a working online dictionary site of the same languages)
The problem: Our search function is an approximate search that uses a Levenstein distance calculator to rank all entries in the dictionary with respect to the query form. When the dictionary has up to 1,500 words, this isn't a problem at all on phones, but when the dictionary has around 10,000 words, there is about a 5-8 second delay before results are shown, despite it being instantaneous on a web browser using "ionic serve". When I run firebug, the javascript that takes the longest time to process are the distance calculations, so my working assumption is that this is where I should start, but I'm open to any suggestions at all.
Here's the distance calculator:
/**
* editDistance.js
*
* A simple Levenshtein distance calculator, except weighted such
* that insertions at the beginning and deletions at the end cost less.
*
* AUTHOR: Pat Littell
* LAST UPDATED: 2015-05-16
*/
var distanceCalculator = {
insertionCost : 1.0,
deletionCost : 1.0,
insertionAtBeginningCost : 0.11,
deletionAtEndCost : 0.1,
substitutionCost : 1.0,
getEditDistance : function(a, b) {
if(a.length === 0) return b.length;
if(b.length === 0) return a.length;
var matrix = [];
// var currentInsertionCost, currentDeletionCost, currentSubstitutionCost = 0;
// increment along the first column of each row
var i;
for(i = 0; i <= b.length; i++){
matrix[i] = [i * this.insertionAtBeginningCost];
}
// increment each column in the first row
var j;
for(j = 0; j <= a.length; j++){
matrix[0][j] = j;
}
// Fill in the rest of the matrix
for(i = 1; i <= b.length; i++){
for(j = 1; j <= a.length; j++){
currentInsertionCost = matrix[i][j-1] + this.insertionCost;
currentSubstitutionCost = matrix[i-1][j-1] + (b.charAt(i-1) != a.charAt(j-1) ? this.substitutionCost : 0);
currentDeletionCost = matrix[i-1][j] + (j==a.length ? this.deletionAtEndCost : this.deletionCost);
matrix[i][j] = Math.min(currentSubstitutionCost, Math.min(currentInsertionCost, currentDeletionCost));
}
}
return matrix[b.length][a.length];
},
// Given a query <a> and a series of targets <bs>, return the least distance to any target
getLeastEditDistance : function(a, bs) {
var that = this;
return Math.min.apply(null, bs.map(function(b) {
return that.getEditDistance(a,b);
}));
}
}
First of all, if you have a known dictionary you will get the fastest solution with something like a Levenshtein Automata, which will solve this in linear time to get all candidates. You can't beat this with a general purpose implementation.
With that said, this implementation of levenshtein distance is a few times faster than yours.
function distance(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, py, a, b, c, d, e, f, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var tx0 = t.charCodeAt(x);
var tx1 = t.charCodeAt(x + 1);
var tx2 = t.charCodeAt(x + 2);
var tx3 = t.charCodeAt(x + 3);
a = x;
b = x + 1;
c = x + 2;
d = x + 3;
e = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
py = p[y];
if (py < a || b < a) {
a = (py > b ? b + 1 : py + 1);
}
else {
if (tx0 !== k) {
a++;
}
}
if (a < b || c < b) {
b = (a > c ? c + 1 : a + 1);
}
else {
if (tx1 !== k) {
b++;
}
}
if (b < c || d < c) {
c = (b > d ? d + 1 : b + 1);
}
else {
if (tx2 !== k) {
c++;
}
}
if (c < d || e < d) {
d = (c > e ? e + 1 : c + 1);
}
else {
if (tx3 !== k) {
d++;
}
}
p[y] = e = d;
d = c;
c = b;
b = a;
a = py;
}
}
for (; x < m;) {
tx0 = t.charCodeAt(x);
a = x;
b = ++x;
for (y = 0; y < n; y++) {
py = p[y];
if (py < a || b < a) {
b = (py > b ? b + 1 : py + 1);
}
else {
if (tx0 !== s.charCodeAt(y)) {
b = a + 1;
}
else {
b = a;
}
}
p[y] = b;
a = py;
}
f = b;
}
return f;
}
I would also not use map in getLeastEditDistance, it is very slow. Just use a normal loop. Also Math.min with many arguments is not very performant.
I am working with Levenstein distances by my self and I have not found a good way to improve performance and will not recommend using it in a non-batch application.
I suggest you use another approach by using a search tree. A binary or ternary search tree can also find near match.
A good place to start is those articles:
http://www.codeproject.com/Articles/5819/Ternary-Search-Tree-Dictionary-in-C-Faster-String
or
http://www.codeproject.com/Articles/68500/Balanced-Binary-Search-Tree-BST-Search-Delete-InOr
The code is relatively simple sp you should not use much time to port it to JavaScript.

Compare strings with 'similar' letters [duplicate]

So I have a random javascript array of names...
[#larry,#nicholas,#notch] etc.
They all start with the # symbol. I'd like to sort them by the Levenshtein Distance so that the the ones at the top of the list are closest to the search term. At the moment, I have some javascript that uses jQuery's .grep() on it using javascript .match() method around the entered search term on key press:
(code edited since first publish)
limitArr = $.grep(imTheCallback, function(n){
return n.match(searchy.toLowerCase())
});
modArr = limitArr.sort(levenshtein(searchy.toLowerCase(), 50))
if (modArr[0].substr(0, 1) == '#') {
if (atRes.childred('div').length < 6) {
modArr.forEach(function(i){
atRes.append('<div class="oneResult">' + i + '</div>');
});
}
} else if (modArr[0].substr(0, 1) == '#') {
if (tagRes.children('div').length < 6) {
modArr.forEach(function(i){
tagRes.append('<div class="oneResult">' + i + '</div>');
});
}
}
$('.oneResult:first-child').addClass('active');
$('.oneResult').click(function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
It also has some if statements detecting if the array contains hashtags (#) or mentions (#). Ignore that. The imTheCallback is the array of names, either hashtags or mentions, then modArr is the array sorted. Then the .atResults and .tagResults elements are the elements that it appends each time in the array to, this forms a list of names based on the entered search terms.
I also have the Levenshtein Distance algorithm:
var levenshtein = function(min, split) {
// Levenshtein Algorithm Revisited - WebReflection
try {
split = !("0")[0]
} catch(i) {
split = true
};
return function(a, b) {
if (a == b)
return 0;
if (!a.length || !b.length)
return b.length || a.length;
if (split) {
a = a.split("");
b = b.split("")
};
var len1 = a.length + 1,
len2 = b.length + 1,
I = 0,
i = 0,
d = [[0]],
c, j, J;
while (++i < len2)
d[0][i] = i;
i = 0;
while (++i < len1) {
J = j = 0;
c = a[I];
d[i] = [i];
while(++j < len2) {
d[i][j] = min(d[I][j] + 1, d[i][J] + 1, d[I][J] + (c != b[J]));
++J;
};
++I;
};
return d[len1 - 1][len2 - 1];
}
}(Math.min, false);
How can I work with algorithm (or a similar one) into my current code to sort it without bad performance?
UPDATE:
So I'm now using James Westgate's Lev Dist function. Works WAYYYY fast. So performance is solved, the issue now is using it with source...
modArr = limitArr.sort(function(a, b){
levDist(a, searchy)
levDist(b, searchy)
});
My problem now is general understanding on using the .sort() method. Help is appreciated, thanks.
Thanks!
I wrote an inline spell checker a few years ago and implemented a Levenshtein algorithm - since it was inline and for IE8 I did quite a lot of performance optimisation.
var levDist = function(s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (var i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (var i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (var j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
}
I came to this solution:
var levenshtein = (function() {
var row2 = [];
return function(s1, s2) {
if (s1 === s2) {
return 0;
} else {
var s1_len = s1.length, s2_len = s2.length;
if (s1_len && s2_len) {
var i1 = 0, i2 = 0, a, b, c, c2, row = row2;
while (i1 < s1_len)
row[i1] = ++i1;
while (i2 < s2_len) {
c2 = s2.charCodeAt(i2);
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1.charCodeAt(i1) === c2 ? 0 : 1);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
return b;
} else {
return s1_len + s2_len;
}
}
};
})();
See also http://jsperf.com/levenshtein-distance/12
Most speed was gained by eliminating some array usages.
Updated: http://jsperf.com/levenshtein-distance/5
The new Revision annihilates all other benchmarks. I was specifically chasing Chromium/Firefox performance as I don't have an IE8/9/10 test environment, but the optimisations made should apply in general to most browsers.
Levenshtein Distance
The matrix to perform Levenshtein Distance can be reused again and again. This was an obvious target for optimisation (but be careful, this now imposes a limit on string length (unless you were to resize the matrix dynamically)).
The only option for optimisation not pursued in jsPerf Revision 5 is memoisation. Depending on your use of Levenshtein Distance, this could help drastically but was omitted due to its implementation specific nature.
// Cache the matrix. Note this implementation is limited to
// strings of 64 char or less. This could be altered to update
// dynamically, or a larger value could be used.
var matrix = [];
for (var i = 0; i < 64; i++) {
matrix[i] = [i];
matrix[i].length = 64;
}
for (var i = 0; i < 64; i++) {
matrix[0][i] = i;
}
// Functional implementation of Levenshtein Distance.
String.levenshteinDistance = function(__this, that, limit) {
var thisLength = __this.length, thatLength = that.length;
if (Math.abs(thisLength - thatLength) > (limit || 32)) return limit || 32;
if (thisLength === 0) return thatLength;
if (thatLength === 0) return thisLength;
// Calculate matrix.
var this_i, that_j, cost, min, t;
for (i = 1; i <= thisLength; ++i) {
this_i = __this[i-1];
for (j = 1; j <= thatLength; ++j) {
// Check the jagged ld total so far
if (i === j && matrix[i][j] > 4) return thisLength;
that_j = that[j-1];
cost = (this_i === that_j) ? 0 : 1; // Chars already match, no ++op to count.
// Calculate the minimum (much faster than Math.min(...)).
min = matrix[i - 1][j ] + 1; // Deletion.
if ((t = matrix[i ][j - 1] + 1 ) < min) min = t; // Insertion.
if ((t = matrix[i - 1][j - 1] + cost) < min) min = t; // Substitution.
matrix[i][j] = min; // Update matrix.
}
}
return matrix[thisLength][thatLength];
};
Damerau-Levenshtein Distance
jsperf.com/damerau-levenshtein-distance
Damerau-Levenshtein Distance is a small modification to Levenshtein Distance to include transpositions. There is very little to optimise.
// Damerau transposition.
if (i > 1 && j > 1 && this_i === that[j-2] && this[i-2] === that_j
&& (t = matrix[i-2][j-2]+cost) < matrix[i][j]) matrix[i][j] = t;
Sorting Algorithm
The second part of this answer is to choose an appropriate sort function. I will upload optimised sort functions to http://jsperf.com/sort soon.
I implemented a very performant implementation of levenshtein distance calculation if you still need this.
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
It was my answer to a similar SO question
Fastest general purpose Levenshtein Javascript implementation
Update
A improved version of the above is now on github/npm see
https://github.com/gustf/js-levenshtein
The obvious way of doing this is to map each string to a (distance, string) pair, then sort this list, then drop the distances again. This way you ensure the levenstein distance only has to be computed once. Maybe merge duplicates first, too.
I would definitely suggest using a better Levenshtein method like the one in #James Westgate's answer.
That said, DOM manipulations are often a great expense. You can certainly improve your jQuery usage.
Your loops are rather small in the example above, but concatenating the generated html for each oneResult into a single string and doing one append at the end of the loop will be much more efficient.
Your selectors are slow. $('.oneResult') will search all elements in the DOM and test their className in older IE browsers. You may want to consider something like atRes.find('.oneResult') to scope the search.
In the case of adding the click handlers, we may want to do one better avoid setting handlers on every keyup. You could leverage event delegation by setting a single handler on atRest for all results in the same block you are setting the keyup handler:
atRest.on('click', '.oneResult', function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
See http://api.jquery.com/on/ for more info.
I just wrote an new revision: http://jsperf.com/levenshtein-algorithms/16
function levenshtein(a, b) {
if (a === b) return 0;
var aLen = a.length;
var bLen = b.length;
if (0 === aLen) return bLen;
if (0 === bLen) return aLen;
var len = aLen + 1;
var v0 = new Array(len);
var v1 = new Array(len);
var i = 0;
var j = 0;
var c2, min, tmp;
while (i < len) v0[i] = i++;
while (j < bLen) {
c2 = b.charAt(j++);
v1[0] = j;
i = 0;
while (i < aLen) {
min = v0[i] - (a.charAt(i) === c2 ? 1 : 0);
if (v1[i] < min) min = v1[i];
if (v0[++i] < min) min = v0[i];
v1[i] = min + 1;
}
tmp = v0;
v0 = v1;
v1 = tmp;
}
return v0[aLen];
}
This revision is faster than the other ones. Works even on IE =)

Categories

Resources