I am looking for a good general purpose Levenshtein implementation in Javascript. It must be fast and be useful for short and long strings. It should also be used many times (hence the caching). The most important thing is that it calculates a plain simple Levenshtein distance. I came up with this:
var levenshtein = (function() {
var row2 = [];
return function(s1, s2) {
if (s1 === s2) {
return 0;
} else {
var s1_len = s1.length, s2_len = s2.length;
if (s1_len && s2_len) {
var i1 = 0, i2 = 0, a, b, c, c2, row = row2;
while (i1 < s1_len)
row[i1] = ++i1;
while (i2 < s2_len) {
c2 = s2.charCodeAt(i2);
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1.charCodeAt(i1) === c2 ? 0 : 1);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
return b;
} else {
return s1_len + s2_len;
}
}
};
})();
Now I have two questions:
Can this be faster? I know by writing out the first iteration of each loop one can gain about 20%.
Is this code well written to serve as general purpose code, to be used in a library for instance?
We had a competition for fun at work about making the fastest levenshtein implementation and I came up with a faster one. First of all, I must say that it was not easy to beat your solution which was the fastest to find "out there". :)
This is tested with node.js and it my benchmarks results indicates that this implementation is ~15% faster on small texts (random words size 2-10 characters) and over twice as fast on longer texts (with lengths 30+ containing random characters)
Note: I removed array caching of all implementations
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
On longer texts it will get almost up to 3 times the speed of your implementation if it initially cache the inner loop's s.charCodeAt(y) in an Uint32Array. Longer texts also seemed to benefit from using a Uint16Array as a the distance cost array. Here is the code for that solution
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h;
var p = new Uint16Array(n);
var u = new Uint32Array(n);
for (y = 0; y < n;) {
u[y] = s.charCodeAt(y);
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== u[y]) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== u[y]) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== u[y]) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== u[y]) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== u[y]) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
All benchmark results is from my tests and test-data might be different with your test-data.
The main 2 difference in this solution than to yours (and some other fast ones) I think is
Not always do compare of the characters in the inner loop if not necessary.
Sort of "Loop unrolling" in the outer loop doing 4 rows at a time in the levenshtein "matrix". This was a major performance win.
http://jsperf.com/levenshtein-distance/24
I will put this solution on github when I find the time :)
Update:
Finally, I put the solution on github
https://github.com/gustf/js-levenshtein. Its a bit modified/optimized but it is the same base algorithm.
Related
Lets say I have the following array of strings, var = array_of_strings["abc","abcd"]
My goal is to run a function and have this return roughly 75% (0.75). Implying that the results are roughly 75% in common. Roughly being defined as within a certain error range, let us say 5% or some settable number.
I'm currently using the the Levenshtein algorithm to compute differences in the strings, however, this is extremely slow and taxing on the CPU in my situation as the strings I'm using are thousands and thousands of lines long.
Levenshtein gives me what the differences are; and while useful in certain situations, my particular use case is simply looking to see what percentage the strings are roughly different from each other and not what each difference actually is necessarily.
The current levenshtein algorithm I'm using is below (which I borrowed from another answer here on stackoverflow). It will return how many differences it found which I can then use to calculate a percentage difference, but it's very slow! Sometimes taking a couple of seconds to run and freezes up the computer as well.
async function levenshtein(s, t) {
return new Promise((resolve, reject) => {
console.log("levenshtein active");
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
resolve(h);
})
}
My question is, is there a way to calculate the difference faster when large string sets are used? In my case accuracy is not too important just as long as a rough difference is known of a certain percentage.
For example, if a research paper was published and I have the original paper and the students paper I want to know if roughly 10% of the students paper is plagiarized.
Maybe if I cut a random parts out of the strings this can help to save on time but this feels very dirty/inefficient.
I'm working on a game where I only need to check if there's a distance of 0 or 1 between two words and return true if that's the case. I found a general purpose levenshtein distance algorithm:
function levenshtein(s, t) {
if (s === t) { return 0; }
var n = s.length, m = t.length;
if (n === 0 || m === 0) { return n + m; }
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) { p[y] = ++y; }
for (;
(x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x; b = x + 1; d = x + 2; g = x + 3; h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) { c = (a > b ? b + 1 : a + 1); }
else { if (e1 !== k) { c++; } }
if (c < b || d < b) { b = (c > d ? d + 1 : c + 1); }
else { if (e2 !== k) { b++; } }
if (b < d || g < d) { d = (b > g ? g + 1 : b + 1); }
else { if (e3 !== k) { d++; } }
if (d < g || h < g) { g = (d > h ? h + 1 : d + 1); }
else { if (e4 !== k) { g++; } }
p[y] = h = g; g = d; d = b; b = c; c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) { d = (a > d ? d + 1 : a + 1); }
else {
if (e !== s.charCodeAt(y)) { d = c + 1; }
else { d = c; }
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
Which works, but this spot is going to be a hotspot and be run potentially hundreds of thousands of times a second and I want to optimize it because I don't need a general purpose algorithm, just one that checks if there's a distance of 0 or 1.
I tried writing it and came up with this:
function closeGuess(guess, word) {
if (Math.abs(word.length - guess.length) > 1) { return false; }
var errors = 0, guessIndex = 0, wordIndex = 0;
while (guessIndex < guess.length || wordIndex < word.length) {
if (errors > 1) { return false; }
if (guess[guessIndex] !== word[wordIndex]) {
if (guess.length < word.length) { wordIndex++; }
else { guessIndex++; }
errors++;
} else {
wordIndex++;
guessIndex++;
}
}
return true;
}
But after profiling it I found that my code was twice as slow, which surprised me because I think the general purpose algorithm is O(n*m) and I think mine is O(n).
I've been testing the performance difference on this fiddle: https://jsfiddle.net/aubtze2L/3/
Are there any better algorithms I can use or any way I can optimize my code to be faster?
I don't see a more elegant way which is at the same time faster than the good old for-loop:
function lev01(a, b) {
let la = a.length;
let lb = b.length;
let d = 0;
switch (la - lb) {
case 0: // mutation
for (let i = 0; i < la; ++i) {
if (a.charAt(i) != b.charAt(i) && ++d > 1) {
return false;
}
}
return true;
case -1: // insertion
for (let i = 0; i < la + d; ++i) {
if (a.charAt(i - d) != b.charAt(i) && ++d > 1) {
return false;
}
}
return true;
case +1: // deletion
for (let i = 0; i < lb + d; ++i) {
if (a.charAt(i) != b.charAt(i - d) && ++d > 1) {
return false;
}
}
return true;
}
return false;
}
console.log(lev01("abc", "abc"));
console.log(lev01("abc", "abd"));
console.log(lev01("abc", "ab"));
console.log(lev01("abc", "abcd"));
console.log(lev01("abc", "cba"));
Performance comparison (Chrome):
80.33ms - lev01 (this answer)
234.84ms - lev
708.12ms - close
Consider the following cases:
If the difference in lengths of the terms is greater than 1, then
the Levenshtein distance between them will be greater than 1.
If the difference in lengths is exactly 1, then the shortest string must be equal to the longest string, with a single deletion (or insertion).
If the strings are the same length then you should
consider a modified version of Hamming distance which returns false
if two, different characters are found:
Here is a sample implementation:
var areSimilar;
areSimilar = function(guess, word) {
var charIndex, foundDiff, guessLength, lengthDiff, substring, wordLength, shortest, longest, shortestLength, offset;
guessLength = guess.length;
wordLength = word.length;
lengthDiff = guessLength - wordLength;
if (lengthDiff < -1 || lengthDiff > 1) {
return false;
}
if (lengthDiff !== 0) {
if (guessLength < wordLength) {
shortest = guess;
longest = word;
shortestLength = guessLength;
} else {
shortest = word;
longest = guess;
shortestLength = wordLength;
}
offset = 0;
for (charIndex = 0; charIndex < shortestLength; charIndex += 1) {
if (shortest[charIndex] !== longest[offset + charIndex]) {
if (offset > 0) {
return false; // second error
}
offset = 1;
if (shortest[charIndex] !== longest[offset + charIndex]) {
return false; // second error
}
}
}
return true; // only one error
}
foundDiff = false;
for (charIndex = 0; charIndex < guessLength; charIndex += 1) {
if (guess[charIndex] !== word[charIndex]) {
if (foundDiff) {
return false;
}
foundDiff = true;
}
}
return true;
};
I've updated your fiddle to include this method. Here are the results on my machine:
close: 154.61
lev: 176.72500000000002
sim: 32.48000000000013
Fiddle: https://jsfiddle.net/dylon/aubtze2L/11/
If you know that you are looking for distance 0 and 1, then the general purpose DP algorithm does not make sense (and by the way the algorithm you showed looks convoluted, take a look at a better explanation here).
To check that the distance is 0, all you need is to check whether 2 strings are the same. Now if the distance is one, it means that either insertion, deletion or substitution should have happened. So generate all possible deletion from the original string and check whether it is equal to second string. So you will get something like this:
for (var i = 0; i < s_1.length; i++) {
if s_2 == s_1.slice(0, i) + s_1.slice(i + 1) {
return true
}
}
For insertions and substitution you will need to know the alphabet of all characters. You can define it as a big string var alphabet = "abcde....". Now you do a similar thing, but when you introduce substitution or insertion, you also iterate over all elements in your alphabet. I am not planning to write the whole code here.
A couple of additional things. You can make a lot of micro-optimizations here. For example if the length of two strings are different by more than 1, they clearly can't have a distance 1. Another one relates to the frequencies of underlying characters in the string.
What I'm doing: I'm developing a mobile dictionary app for a number of languages
How I'm doing it: Using ionic framework with combination of some angular and some pure js (imported from a working online dictionary site of the same languages)
The problem: Our search function is an approximate search that uses a Levenstein distance calculator to rank all entries in the dictionary with respect to the query form. When the dictionary has up to 1,500 words, this isn't a problem at all on phones, but when the dictionary has around 10,000 words, there is about a 5-8 second delay before results are shown, despite it being instantaneous on a web browser using "ionic serve". When I run firebug, the javascript that takes the longest time to process are the distance calculations, so my working assumption is that this is where I should start, but I'm open to any suggestions at all.
Here's the distance calculator:
/**
* editDistance.js
*
* A simple Levenshtein distance calculator, except weighted such
* that insertions at the beginning and deletions at the end cost less.
*
* AUTHOR: Pat Littell
* LAST UPDATED: 2015-05-16
*/
var distanceCalculator = {
insertionCost : 1.0,
deletionCost : 1.0,
insertionAtBeginningCost : 0.11,
deletionAtEndCost : 0.1,
substitutionCost : 1.0,
getEditDistance : function(a, b) {
if(a.length === 0) return b.length;
if(b.length === 0) return a.length;
var matrix = [];
// var currentInsertionCost, currentDeletionCost, currentSubstitutionCost = 0;
// increment along the first column of each row
var i;
for(i = 0; i <= b.length; i++){
matrix[i] = [i * this.insertionAtBeginningCost];
}
// increment each column in the first row
var j;
for(j = 0; j <= a.length; j++){
matrix[0][j] = j;
}
// Fill in the rest of the matrix
for(i = 1; i <= b.length; i++){
for(j = 1; j <= a.length; j++){
currentInsertionCost = matrix[i][j-1] + this.insertionCost;
currentSubstitutionCost = matrix[i-1][j-1] + (b.charAt(i-1) != a.charAt(j-1) ? this.substitutionCost : 0);
currentDeletionCost = matrix[i-1][j] + (j==a.length ? this.deletionAtEndCost : this.deletionCost);
matrix[i][j] = Math.min(currentSubstitutionCost, Math.min(currentInsertionCost, currentDeletionCost));
}
}
return matrix[b.length][a.length];
},
// Given a query <a> and a series of targets <bs>, return the least distance to any target
getLeastEditDistance : function(a, bs) {
var that = this;
return Math.min.apply(null, bs.map(function(b) {
return that.getEditDistance(a,b);
}));
}
}
First of all, if you have a known dictionary you will get the fastest solution with something like a Levenshtein Automata, which will solve this in linear time to get all candidates. You can't beat this with a general purpose implementation.
With that said, this implementation of levenshtein distance is a few times faster than yours.
function distance(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, py, a, b, c, d, e, f, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var tx0 = t.charCodeAt(x);
var tx1 = t.charCodeAt(x + 1);
var tx2 = t.charCodeAt(x + 2);
var tx3 = t.charCodeAt(x + 3);
a = x;
b = x + 1;
c = x + 2;
d = x + 3;
e = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
py = p[y];
if (py < a || b < a) {
a = (py > b ? b + 1 : py + 1);
}
else {
if (tx0 !== k) {
a++;
}
}
if (a < b || c < b) {
b = (a > c ? c + 1 : a + 1);
}
else {
if (tx1 !== k) {
b++;
}
}
if (b < c || d < c) {
c = (b > d ? d + 1 : b + 1);
}
else {
if (tx2 !== k) {
c++;
}
}
if (c < d || e < d) {
d = (c > e ? e + 1 : c + 1);
}
else {
if (tx3 !== k) {
d++;
}
}
p[y] = e = d;
d = c;
c = b;
b = a;
a = py;
}
}
for (; x < m;) {
tx0 = t.charCodeAt(x);
a = x;
b = ++x;
for (y = 0; y < n; y++) {
py = p[y];
if (py < a || b < a) {
b = (py > b ? b + 1 : py + 1);
}
else {
if (tx0 !== s.charCodeAt(y)) {
b = a + 1;
}
else {
b = a;
}
}
p[y] = b;
a = py;
}
f = b;
}
return f;
}
I would also not use map in getLeastEditDistance, it is very slow. Just use a normal loop. Also Math.min with many arguments is not very performant.
I am working with Levenstein distances by my self and I have not found a good way to improve performance and will not recommend using it in a non-batch application.
I suggest you use another approach by using a search tree. A binary or ternary search tree can also find near match.
A good place to start is those articles:
http://www.codeproject.com/Articles/5819/Ternary-Search-Tree-Dictionary-in-C-Faster-String
or
http://www.codeproject.com/Articles/68500/Balanced-Binary-Search-Tree-BST-Search-Delete-InOr
The code is relatively simple sp you should not use much time to port it to JavaScript.
I have translated the following C++ code:
#include <iostream>
using namespace std;
#define NDIGITS 100
#define LEN (NDIGITS/4+1)*14
long a[LEN];
long b;
long c = LEN;
long d;
long e = 0;
long f = 10000;
long g;
long h = 0;
int main(void) {
cout<<b<<endl;
for(; (b=c-=14) > 0 ;){
for(; --b > 0 ;){
d *= b;
if( h == 0 )
d += 2000*f;
else
d += a[b]*f;
g=b+b-1;
a[b] = d % g;
d /= g;
}
h = printf("%ld",e+d/f);
d = e = d % f;
}
getchar();
return 0;
}
Into JavaScript:
function mod(n, m) {
return ((m % n) + n) % n;
} // mod function to fix javascript modulo bug
function calculate(NDIGITS){
var LEN = (NDIGITS / 4 + 1) * 14,
out = "",
a = [],
b = 0,
c = LEN,
d = 0,
e = 0,
f = 10000,
g = 0,
h = 0;
for( ; a.length != LEN; a.push(0));
for( ; (b=c-=14) > 0 ; ){
for(; --b > 0 ;){
d *= b;
if(h == 0)
d += 2000*f;
else
d += a[b]*f;
g=b+b-1;
a[b] = mod(d, g);
d /= g;
};
h = 4;
out += e + d / f;
d = e = mod(d, f);
};
return out;
};
calculate(100);
The problem is, the C++ (which is correct) output looks like this:
314159265358979323846264338327952884197169399375105820974944592307816406286208998628034825342117067
But the JavaScript (which is wrong) output looks like this:
3141.59265358979345928.3358757688158002.0385670499462603.1996016540431161.44919092773639662.2465149363658988.6127837844255865.38922090756173.61883094848226189.6324225085448150.3443440509899223.2179589088062808.1943642437717982.8973948575671840.86646781354151140.38694447211833938.5632867441137341.458720505086448.7384444661472807.14448220310268936.5521832735086764.9290682040381301.76585926509928223.4135991546457438.115065010927
Where did I mess up in my coding? Thanks for the help.
JavaScript does floating point division.
Arguments exchanged in modulo calculation function.
Here is code that produces the same result as the C++ code provided for the given sample (100) digits:
function mod(m, n) {
return ((m % n) + n) % n;
} // mod function to fix javascript modulo bug
function calculate(NDIGITS) {
var LEN = (NDIGITS / 4 + 1) * 14,
out = "",
a = [],
b = 0,
c = LEN,
d = 0,
e = 0,
f = 10000,
g = 0,
h = 0;
for (; a.length !== LEN; a.push(0));
for (; (b = c -= 14) > 0;) {
for (; --b > 0;) {
d *= b;
if (h === 0) {
d += 2000 * f;
} else {
d += a[b] * f;
}
g = b + b - 1;
a[b] = mod(d, g);
d = Math.floor(d / g);
}
h = Math.floor(e + d / f);
out += h;
h = h.length;
d = e = mod(d, f);
}
return out;
}
console.log(calculate(100));
This question already has answers here:
Sort an array by the "Levenshtein Distance" with best performance in Javascript
(7 answers)
Closed 9 years ago.
For a client side search tool I need to find the Levenshtein distance of a word with millions of other words. A user should be able to compare a short text of about twenty words with a book. The user could do this by finding locations of the most characterizing words of the text in the book. 'Finding locations'does not mean looking for an exact match but nearly match as with levenshtein. I started with already available implementations but I needed more speed. I ended up with this:
var rowA = new Uint16Array(1e6);
var rowB = new Uint16Array(1e6);
function levenshtein(s1, s2) {
var s1_len = s1.length, s2_len = s2.length, i1, i2 = 0, a, b, c, c2, i = 0;
if (s1_len === 0)
return s2_len;
if (s2_len === 0)
return s1_len;
while (i < s1_len)
rowA[i] = ++i;
while (i2 < s2_len) {
c2 = s2[i2];
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1[i1] !== c2 );
a = rowA[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
rowB[i1] = b;
}
if (i2 === s2_len)
return b;
c2 = s2[i2];
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1[i1] !== c2 );
a = rowB[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
rowA[i1] = b;
}
}
return b;
}
As you see I used techniques like placing the objects out of the function in order to re use them. I also repeated myself a bit by linearizing the loop somewhat. Could it be faster? I am curious of your advice.
Update:
After tips from Bergi and some more thinking I came to this solution:
var row = new Uint16Array(1e6);
function levenshtein(s1, s2) {
var s1_len = s1.length, s2_len = s2.length, i2 = 1, a, b = 0, c, c2, i1 = 0;
if (s1_len === 0)
return s2_len;
if (s2_len === 0)
return s1_len;
c2 = s2[0];
if (s1[0] === c2) {
while (i1 < s1_len) {
row[i1] = i1++;
}
b = s1_len - 1;
} else {
row[0] = 1;
++b;
if (s1_len > 1)
for (i1 = 1; i1 < s1_len; ++i1) {
if (s1[i1] === c2) {
row[i1] = b;
for (++i1; i1 < s1_len; ++i1) {
row[i1] = ++b;
}
} else {
row[i1] = ++b;
}
}
}
if (s2_len > 1)
while (i2 < s2_len) {
c2 = s2[i2];
c = i2 + (s1[0] !== c2);
a = row[0];
++i2;
b = i2 < a ? (i2 < c ? i2 + 1 : c) : (a < c ? a + 1 : c);
row[0] = b;
if (s1_len > 1) {
for (i1 = 1; i1 < s1_len; ++i1) {
c = a + (s1[i1] !== c2);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
}
return b;
}
This is again much faster. I cannot squeeze more out of it. I keep looking for other ideas and will try some more.
Since you are comparing against the same word over and over, you can get a little performance improvement by using partial application and caching there:
function levenshtein(s1) {
var row0 = [], row1 = [], s1_len = s1.length;
if (s1_len === 0)
return function(s2) {
return s2.length;
};
return function(s2) {
var s2_len = s2.length, s1_idx, s2_idx = 0, a, b, c, c2, i = 0;
if (s2_len === 0)
return s1_len;
…
return b;
};
}
I also repeated myself a bit by linearizing the loop somewhat.
Not sure whether it gets lot faster, but you can omit one of the arrays - you do not need to read/write them in an alternating manner:
function levenshtein(s1) {
var s1_len = s1.length, row = new Array(s1_len);
if (s1_len === 0)
return function(s2) {
return s2.length;
};
return function(s2) {
var s2_len = s2.length, s1_idx, s2_idx = 0, a, b, c, c2, i = 0;
if (s2_len === 0)
return s1_len;
while (i < s1_len)
row[i] = ++i;
while (s2_idx < s2_len) {
c2 = s2[s2_idx];
a = s2_idx;
++s2_idx;
b = s2_idx;
for (s1_idx = 0; s1_idx < s1_len; ++s1_idx) {
c = a + (s1[s1_idx] === c2 ? 0 : 1);
a = row[s1_idx];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[s1_idx] = b;
}
}
return b;
};
}
I don't think further optimisations can be made without putting your millions of words in a dedicated data structure (e.g. a prefix trie).