I am looking for a JavaScript function that can compare two strings and return the likeliness that they are alike. I have looked at soundex but that's not really great for multi-word strings or non-names. I am looking for a function like:
function compare(strA,strB){
}
compare("Apples","apple") = Some X Percentage.
The function would work with all types of strings, including numbers, multi-word values, and names. Perhaps there's a simple algorithm I could use?
Ultimately none of these served my purpose so I used this:
function compare(c, u) {
var incept = false;
var ca = c.split(",");
u = clean(u);
//ca = correct answer array (Collection of all correct answer)
//caa = a single correct answer word array (collection of words of a single correct answer)
//u = array of user answer words cleaned using custom clean function
for (var z = 0; z < ca.length; z++) {
caa = $.trim(ca[z]).split(" ");
var pc = 0;
for (var x = 0; x < caa.length; x++) {
for (var y = 0; y < u.length; y++) {
if (soundex(u[y]) != null && soundex(caa[x]) != null) {
if (soundex(u[y]) == soundex(caa[x])) {
pc = pc + 1;
}
}
else {
if (u[y].indexOf(caa[x]) > -1) {
pc = pc + 1;
}
}
}
}
if ((pc / caa.length) > 0.5) {
return true;
}
}
return false;
}
// create object listing the SOUNDEX values for each letter
// -1 indicates that the letter is not coded, but is used for coding
// 0 indicates that the letter is omitted for modern census archives
// but acts like -1 for older census archives
// 1 is for BFPV
// 2 is for CGJKQSXZ
// 3 is for DT
// 4 is for L
// 5 is for MN my home state
// 6 is for R
function makesoundex() {
this.a = -1
this.b = 1
this.c = 2
this.d = 3
this.e = -1
this.f = 1
this.g = 2
this.h = 0
this.i = -1
this.j = 2
this.k = 2
this.l = 4
this.m = 5
this.n = 5
this.o = -1
this.p = 1
this.q = 2
this.r = 6
this.s = 2
this.t = 3
this.u = -1
this.v = 1
this.w = 0
this.x = 2
this.y = -1
this.z = 2
}
var sndx = new makesoundex()
// check to see that the input is valid
function isSurname(name) {
if (name == "" || name == null) {
return false
} else {
for (var i = 0; i < name.length; i++) {
var letter = name.charAt(i)
if (!(letter >= 'a' && letter <= 'z' || letter >= 'A' && letter <= 'Z')) {
return false
}
}
}
return true
}
// Collapse out directly adjacent sounds
// 1. Assume that surname.length>=1
// 2. Assume that surname contains only lowercase letters
function collapse(surname) {
if (surname.length == 1) {
return surname
}
var right = collapse(surname.substring(1, surname.length))
if (sndx[surname.charAt(0)] == sndx[right.charAt(0)]) {
return surname.charAt(0) + right.substring(1, right.length)
}
return surname.charAt(0) + right
}
// Collapse out directly adjacent sounds using the new National Archives method
// 1. Assume that surname.length>=1
// 2. Assume that surname contains only lowercase letters
// 3. H and W are completely ignored
function omit(surname) {
if (surname.length == 1) {
return surname
}
var right = omit(surname.substring(1, surname.length))
if (!sndx[right.charAt(0)]) {
return surname.charAt(0) + right.substring(1, right.length)
}
return surname.charAt(0) + right
}
// Output the coded sequence
function output_sequence(seq) {
var output = seq.charAt(0).toUpperCase() // Retain first letter
output += "-" // Separate letter with a dash
var stage2 = seq.substring(1, seq.length)
var count = 0
for (var i = 0; i < stage2.length && count < 3; i++) {
if (sndx[stage2.charAt(i)] > 0) {
output += sndx[stage2.charAt(i)]
count++
}
}
for (; count < 3; count++) {
output += "0"
}
return output
}
// Compute the SOUNDEX code for the surname
function soundex(value) {
if (!isSurname(value)) {
return null
}
var stage1 = collapse(value.toLowerCase())
//form.result.value=output_sequence(stage1);
var stage1 = omit(value.toLowerCase())
var stage2 = collapse(stage1)
return output_sequence(stage2);
}
function clean(u) {
var u = u.replace(/\,/g, "");
u = u.toLowerCase().split(" ");
var cw = ["ARRAY OF WORDS TO BE EXCLUDED FROM COMPARISON"];
var n = [];
for (var y = 0; y < u.length; y++) {
var test = false;
for (var z = 0; z < cw.length; z++) {
if (u[y] != "" && u[y] != cw[z]) {
test = true;
break;
}
}
if (test) {
//Don't use & or $ in comparison
var val = u[y].replace("$", "").replace("&", "");
n.push(val);
}
}
return n;
}
Here's an answer based on Levenshtein distance https://en.wikipedia.org/wiki/Levenshtein_distance
function similarity(s1, s2) {
var longer = s1;
var shorter = s2;
if (s1.length < s2.length) {
longer = s2;
shorter = s1;
}
var longerLength = longer.length;
if (longerLength == 0) {
return 1.0;
}
return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
}
For calculating edit distance
function editDistance(s1, s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
var costs = new Array();
for (var i = 0; i <= s1.length; i++) {
var lastValue = i;
for (var j = 0; j <= s2.length; j++) {
if (i == 0)
costs[j] = j;
else {
if (j > 0) {
var newValue = costs[j - 1];
if (s1.charAt(i - 1) != s2.charAt(j - 1))
newValue = Math.min(Math.min(newValue, lastValue),
costs[j]) + 1;
costs[j - 1] = lastValue;
lastValue = newValue;
}
}
}
if (i > 0)
costs[s2.length] = lastValue;
}
return costs[s2.length];
}
Usage
similarity('Stack Overflow','Stack Ovrflw')
returns 0.8571428571428571
You can play with it below:
function checkSimilarity(){
var str1 = document.getElementById("lhsInput").value;
var str2 = document.getElementById("rhsInput").value;
document.getElementById("output").innerHTML = similarity(str1, str2);
}
function similarity(s1, s2) {
var longer = s1;
var shorter = s2;
if (s1.length < s2.length) {
longer = s2;
shorter = s1;
}
var longerLength = longer.length;
if (longerLength == 0) {
return 1.0;
}
return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
}
function editDistance(s1, s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
var costs = new Array();
for (var i = 0; i <= s1.length; i++) {
var lastValue = i;
for (var j = 0; j <= s2.length; j++) {
if (i == 0)
costs[j] = j;
else {
if (j > 0) {
var newValue = costs[j - 1];
if (s1.charAt(i - 1) != s2.charAt(j - 1))
newValue = Math.min(Math.min(newValue, lastValue),
costs[j]) + 1;
costs[j - 1] = lastValue;
lastValue = newValue;
}
}
}
if (i > 0)
costs[s2.length] = lastValue;
}
return costs[s2.length];
}
<div><label for="lhsInput">String 1:</label> <input type="text" id="lhsInput" oninput="checkSimilarity()" /></div>
<div><label for="rhsInput">String 2:</label> <input type="text" id="rhsInput" oninput="checkSimilarity()" /></div>
<div>Match: <span id="output">No Input</span></div>
Using this library for string similarity worked like a charm for me!
Here's the Example -
var similarity = stringSimilarity.compareTwoStrings("Apples","apple"); // => 0.88
Here is a very simple function that does a comparison and returns a percentage based on equivalency. While it has not been tested for all possible scenarios, it may help you get started.
function similar(a,b) {
var equivalency = 0;
var minLength = (a.length > b.length) ? b.length : a.length;
var maxLength = (a.length < b.length) ? b.length : a.length;
for(var i = 0; i < minLength; i++) {
if(a[i] == b[i]) {
equivalency++;
}
}
var weight = equivalency / maxLength;
return (weight * 100) + "%";
}
alert(similar("test","tes")); // 75%
alert(similar("test","test")); // 100%
alert(similar("test","testt")); // 80%
alert(similar("test","tess")); // 75%
To Find degree of similarity between two strings; we can use more than one or two methods but I am mostly inclined towards the usage of 'Dice's Coefficient' . which is better! well in my knowledge than using 'Levenshtein distance'
Using this 'string-similarity' package from npm you will be able to work on what I said above.
some easy usage examples are
var stringSimilarity = require('string-similarity');
var similarity = stringSimilarity.compareTwoStrings('healed', 'sealed');
var matches = stringSimilarity.findBestMatch('healed', ['edward', 'sealed', 'theatre']);
for more please visit the link given above. Thankyou.
Just one I quickly wrote that might be good enough for your purposes:
function Compare(strA,strB){
for(var result = 0, i = strA.length; i--;){
if(typeof strB[i] == 'undefined' || strA[i] == strB[i]);
else if(strA[i].toLowerCase() == strB[i].toLowerCase())
result++;
else
result += 4;
}
return 1 - (result + 4*Math.abs(strA.length - strB.length))/(2*(strA.length+strB.length));
}
This weighs characters that are the same but different case 1 quarter as heavily as characters that are completely different or missing. It returns a number between 0 and 1, 1 meaning the strings are identical. 0 meaning they have no similarities. Examples:
Compare("Apple", "Apple") // 1
Compare("Apples", "Apple") // 0.8181818181818181
Compare("Apples", "apple") // 0.7727272727272727
Compare("a", "A") // 0.75
Compare("Apples", "appppp") // 0.45833333333333337
Compare("a", "b") // 0
How about function similar_text from PHP.js library?
It is based on a PHP function with the same name.
function similar_text (first, second) {
// Calculates the similarity between two strings
// discuss at: http://phpjs.org/functions/similar_text
if (first === null || second === null || typeof first === 'undefined' || typeof second === 'undefined') {
return 0;
}
first += '';
second += '';
var pos1 = 0,
pos2 = 0,
max = 0,
firstLength = first.length,
secondLength = second.length,
p, q, l, sum;
max = 0;
for (p = 0; p < firstLength; p++) {
for (q = 0; q < secondLength; q++) {
for (l = 0;
(p + l < firstLength) && (q + l < secondLength) && (first.charAt(p + l) === second.charAt(q + l)); l++);
if (l > max) {
max = l;
pos1 = p;
pos2 = q;
}
}
}
sum = max;
if (sum) {
if (pos1 && pos2) {
sum += this.similar_text(first.substr(0, pos2), second.substr(0, pos2));
}
if ((pos1 + max < firstLength) && (pos2 + max < secondLength)) {
sum += this.similar_text(first.substr(pos1 + max, firstLength - pos1 - max), second.substr(pos2 + max, secondLength - pos2 - max));
}
}
return sum;
}
fuzzyset - A fuzzy string set for javascript.
fuzzyset is a data structure that performs something akin to fulltext search against data to determine likely mispellings and approximate string matching. Note that this is a javascript port of a python library.
To some extent, I like the ideas of Dice's coefficient embedded in the string-similarity module. But I feel that considering the bigrams only and not taking into account their multiplicities is missing some important data. Below is a version that also handles multiplicities, and I think is a simpler implementation overall. I don't try to use their API, offering only a function which compares two strings after some manipulation (removing non-alphanumeric characters, lower-casing everything, and compressing but not removing whitespace), built atop one which compares them without that manipulation. It would be easy enough to wrap this back in their API, but I see little need.
const stringSimilarity = (a, b) =>
_stringSimilarity (prep (a), prep (b))
const _stringSimilarity = (a, b) => {
const bg1 = bigrams (a)
const bg2 = bigrams (b)
const c1 = count (bg1)
const c2 = count (bg2)
const combined = uniq ([... bg1, ... bg2])
.reduce ((t, k) => t + (Math .min (c1 [k] || 0, c2 [k] || 0)), 0)
return 2 * combined / (bg1 .length + bg2 .length)
}
const prep = (str) => // TODO: unicode support?
str .toLowerCase () .replace (/[^\w\s]/g, ' ') .replace (/\s+/g, ' ')
const bigrams = (str) =>
[...str] .slice (0, -1) .map ((c, i) => c + str [i + 1])
const count = (xs) =>
xs .reduce ((a, x) => ((a [x] = (a [x] || 0) + 1), a), {})
const uniq = (xs) =>
[... new Set (xs)]
console .log (stringSimilarity (
'foobar',
'Foobar'
)) //=> 1
console .log (stringSimilarity (
"healed",
"sealed"
))//=> 0.8
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"For sale: table in very good condition, olive green in colour."
)) //=> 0.7787610619469026
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"For sale: green Subaru Impreza, 210,000 miles"
)) //=> 0.38636363636363635
console .log (stringSimilarity (
"Olive-green table for sale, in extremely good condition.",
"Wanted: mountain bike with at least 21 gears."
)) //=> 0.1702127659574468
console .log (stringSimilarity (
"The rain in Spain falls mainly on the plain.",
"The run in Spun falls munly on the plun.",
)) //=> 0.7560975609756098
console .log (stringSimilarity (
"Fa la la la la, la la la la",
"Fa la la la la, la la",
)) //=> 0.8636363636363636
console .log (stringSimilarity (
"car crash",
"carcrash",
)) //=> 0.8
console .log (stringSimilarity (
"Now is the time for all good men to come to the aid of their party.",
"Huh?",
)) //=> 0
.as-console-wrapper {max-height: 100% !important; top: 0}
Some of the test cases are from string-similarity, others are my own. They show some significant differences from that package, but nothing untoward. The only one I would call out is the difference between "car crash" and "carcrash", which string-similarity sees as identical and I report with a similarity of 0.8. My version finds more similarity in all the olive-green test-cases than does string-similarity, but as these are in any case fairly arbitrary numbers, I'm not sure how much difference it makes; they certainly position them in the same relative order.
string-similarity lib vs Top answer (by #overloard1234) performance comparation you can find below
Based on #Tushar Walzade's advice to use string-similarity library, you can find, that for example
stringSimilatityLib.findBestMatch('KIA','Kia').bestMatch.rating
will return 0.0
So, looks like better to compare it in lowerCase.
Better base usage (for arrays) :
findBestMatch(str, strArr) {
const lowerCaseArr = strArr.map(element => element.toLowerCase());//creating lower case array
const match = stringSimilatityLib.findBestMatch(str.toLowerCase(), lowerCaseArr).bestMatch; //trying to find bestMatch
if (match.rating > 0) {
const foundIndex = lowerCaseArr.findIndex(x => x === match.target); //finding the index of found best case
return strArr[foundIndex]; //returning initial value from array
}
return null;
},
Performance
Also, i compared top answer here (made by #overloard1234) and string-similarity lib (v4.0.4).
The results you can find here : https://jsbench.me/szkzojoskq/1
Result : string-similarity is ~ twice faster
Just for fun : v2.0 of string-similarity library slower, than latest 4.0.4 about 2.2 times. So update it, if you are still using < 3.0 :)
const str1 = " pARTH PARmar r ";
const str2 = " parmar r par ";
function calculateSimilarity(str1 = "", str2 = "") {
let longer = str1.trim();
let shorter = str2.trim();
let a1 = longer.toLowerCase().split(" ");
let b1 = shorter.toLowerCase().split(" ");
let result = a1.every((aa, i) => aa[0] === b1[i][0]);
if (longer.length < shorter.length) [longer,shorter] = [shorter,longer];
var arr = [];
let count = 0;
for(var i = 0;i<longer.length;i++){
if(shorter && shorter.includes(longer[i])) {
shorter = shorter.replace(longer[i],"")
count++
};
}
return {
score : (count*100)/longer.length,
result
}
}
console.log(calculateSimilarity(str1, str2));
I used #overlord1234 function, but corrected ь: '', cuz English words don't have this letter, and next need return a[char] ?? char instead of return a[char] || char
Related
I am having an issue with the following code that simulates a card deck.
The deck is created properly (1 array containing 4 arrays (suits) containing 13 elements each (face values)) and when I use the G.test(); function it is correctly pulling 13 random cards but then returns 39x "Empty" (A total of 52).
I hate to ask for help, but I have left the problem overnight and then some and I still cannot find the reason that this is happening. I appreciate any and all insight that can be offered.
var G = {};
G.cards = [[], [], [], []];
G.newCard = function(v) { //currently a useless function, tried a few things
return v;
};
G.deck = {
n: function() { //new deck
var x; var list = [];
list.push(G.newCard("A"));
for (x = 2; x <= 10; x += 1) {
list.push(G.newCard(x.toString()));
}
list.push(G.newCard("J"), G.newCard("Q"), G.newCard("K"));
for (x = 0; x < G.cards.length; x += 1) {
G.cards[x] = list;
}
},
d: function() { //random card - returns suit & value
var s; var c; var v; var drawn = false; var n;
s = random(0, G.cards.length);
c = random(0, G.cards[s].length);
n = 0;
while (!drawn) {
if (G.cards[s].length > 0) {
if (G.cards[s][c]) {
v = G.cards[s].splice(c, 1);
drawn = true;
} else {
c = random(0, G.cards[s].length);
}
} else {
s = (s + 1 >= G.cards.length) ? 0 : s + 1;
n += 1;
console.log(s);
if (n >= G.cards.length) {
console.log(n);
return "Empty";
}
}
}
return {s: s, v: v[0]};
},
}; //G.deck
G.test = function() {
var x; var v;
G.deck.n();
for (x = 0; x < 52; x += 1) {
v = G.deck.d();
console.log(v);
}
};
Replace
for (x = 0; x < G.cards.length; x += 1) {
G.cards[x] = list;
}
with
for (x = 0; x < G.cards.length; x += 1) {
G.cards[x] = list.slice();
}
as this prevents all elements of G.cards[x] binding to the same (single) array instance.
If all elements bind to the same instance, mutating one element equals mutating all elements. list.slice() creates a new copy of list and thus a new array instance to prevent the aforementioned issue.
I won't go through your code, but I built a code that will do what you wanted. I only built this for one deck and not multiple deck play. There are two functions, one will generate the deck, and the other will drawn cards from the deck, bases on how many hands you need and how many cards you wanted for each hand. One a card is drawn, it will not be re-drawn. I might publish a short article for how a card dealing program work or similar in the short future at http://kevinhng86.iblog.website.
function random(min, max){
return Math.floor(Math.random() * (max - min)) + min;
}
function deckGenerate(){
var output = [];
var face = {1: "A", 11: "J", 12: "Q", 13: "K"};
// Heart Space Diamond & Club;
var suit = ["H", "S", "D", "C"];
// Delimiter between card identification and suit identification.
var d = "-";
for(i = 0; i < 4; i++){
output[i] = [];
for(ind = 0; ind < 13; ind++ ){
card = (ind + 1);
output[i][ind] = (card > 10) || (card === 1)? face[card] + d + suit[i] : card.toString() + d + suit[i];
}
}
return output;
}
function randomCard(deck, hand, card){
var output = [];
var randS = 0;
var randC = 0;
if( hand * card > 52 ) throw("Too many card, I built this for one deck only");
for(i = 0; i < hand; i++){
output[i] = [];
for(ind = 0; ind < card; ind++){
randS = random(0, deck.length);
randC = random(0, deck[randS].length);
output[i][ind] = deck[randS][randC];
deck[randS].splice(randC,1);
if(deck[randS].length === 0) deck.splice(randS,1);
}
}
document.write( JSON.stringify(deck, null, 2) );
return output;
}
var deck = deckGenerate()
document.write( JSON.stringify(deck, null, 2) );
document.write("<br><br>");
var randomhands = randomCard(deck, 5, 8);
document.write("<br><br>");
document.write("<br><br>");
document.write( JSON.stringify(randomhands, null, 2) );
I'm working on a game where I only need to check if there's a distance of 0 or 1 between two words and return true if that's the case. I found a general purpose levenshtein distance algorithm:
function levenshtein(s, t) {
if (s === t) { return 0; }
var n = s.length, m = t.length;
if (n === 0 || m === 0) { return n + m; }
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) { p[y] = ++y; }
for (;
(x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x; b = x + 1; d = x + 2; g = x + 3; h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) { c = (a > b ? b + 1 : a + 1); }
else { if (e1 !== k) { c++; } }
if (c < b || d < b) { b = (c > d ? d + 1 : c + 1); }
else { if (e2 !== k) { b++; } }
if (b < d || g < d) { d = (b > g ? g + 1 : b + 1); }
else { if (e3 !== k) { d++; } }
if (d < g || h < g) { g = (d > h ? h + 1 : d + 1); }
else { if (e4 !== k) { g++; } }
p[y] = h = g; g = d; d = b; b = c; c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) { d = (a > d ? d + 1 : a + 1); }
else {
if (e !== s.charCodeAt(y)) { d = c + 1; }
else { d = c; }
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
Which works, but this spot is going to be a hotspot and be run potentially hundreds of thousands of times a second and I want to optimize it because I don't need a general purpose algorithm, just one that checks if there's a distance of 0 or 1.
I tried writing it and came up with this:
function closeGuess(guess, word) {
if (Math.abs(word.length - guess.length) > 1) { return false; }
var errors = 0, guessIndex = 0, wordIndex = 0;
while (guessIndex < guess.length || wordIndex < word.length) {
if (errors > 1) { return false; }
if (guess[guessIndex] !== word[wordIndex]) {
if (guess.length < word.length) { wordIndex++; }
else { guessIndex++; }
errors++;
} else {
wordIndex++;
guessIndex++;
}
}
return true;
}
But after profiling it I found that my code was twice as slow, which surprised me because I think the general purpose algorithm is O(n*m) and I think mine is O(n).
I've been testing the performance difference on this fiddle: https://jsfiddle.net/aubtze2L/3/
Are there any better algorithms I can use or any way I can optimize my code to be faster?
I don't see a more elegant way which is at the same time faster than the good old for-loop:
function lev01(a, b) {
let la = a.length;
let lb = b.length;
let d = 0;
switch (la - lb) {
case 0: // mutation
for (let i = 0; i < la; ++i) {
if (a.charAt(i) != b.charAt(i) && ++d > 1) {
return false;
}
}
return true;
case -1: // insertion
for (let i = 0; i < la + d; ++i) {
if (a.charAt(i - d) != b.charAt(i) && ++d > 1) {
return false;
}
}
return true;
case +1: // deletion
for (let i = 0; i < lb + d; ++i) {
if (a.charAt(i) != b.charAt(i - d) && ++d > 1) {
return false;
}
}
return true;
}
return false;
}
console.log(lev01("abc", "abc"));
console.log(lev01("abc", "abd"));
console.log(lev01("abc", "ab"));
console.log(lev01("abc", "abcd"));
console.log(lev01("abc", "cba"));
Performance comparison (Chrome):
80.33ms - lev01 (this answer)
234.84ms - lev
708.12ms - close
Consider the following cases:
If the difference in lengths of the terms is greater than 1, then
the Levenshtein distance between them will be greater than 1.
If the difference in lengths is exactly 1, then the shortest string must be equal to the longest string, with a single deletion (or insertion).
If the strings are the same length then you should
consider a modified version of Hamming distance which returns false
if two, different characters are found:
Here is a sample implementation:
var areSimilar;
areSimilar = function(guess, word) {
var charIndex, foundDiff, guessLength, lengthDiff, substring, wordLength, shortest, longest, shortestLength, offset;
guessLength = guess.length;
wordLength = word.length;
lengthDiff = guessLength - wordLength;
if (lengthDiff < -1 || lengthDiff > 1) {
return false;
}
if (lengthDiff !== 0) {
if (guessLength < wordLength) {
shortest = guess;
longest = word;
shortestLength = guessLength;
} else {
shortest = word;
longest = guess;
shortestLength = wordLength;
}
offset = 0;
for (charIndex = 0; charIndex < shortestLength; charIndex += 1) {
if (shortest[charIndex] !== longest[offset + charIndex]) {
if (offset > 0) {
return false; // second error
}
offset = 1;
if (shortest[charIndex] !== longest[offset + charIndex]) {
return false; // second error
}
}
}
return true; // only one error
}
foundDiff = false;
for (charIndex = 0; charIndex < guessLength; charIndex += 1) {
if (guess[charIndex] !== word[charIndex]) {
if (foundDiff) {
return false;
}
foundDiff = true;
}
}
return true;
};
I've updated your fiddle to include this method. Here are the results on my machine:
close: 154.61
lev: 176.72500000000002
sim: 32.48000000000013
Fiddle: https://jsfiddle.net/dylon/aubtze2L/11/
If you know that you are looking for distance 0 and 1, then the general purpose DP algorithm does not make sense (and by the way the algorithm you showed looks convoluted, take a look at a better explanation here).
To check that the distance is 0, all you need is to check whether 2 strings are the same. Now if the distance is one, it means that either insertion, deletion or substitution should have happened. So generate all possible deletion from the original string and check whether it is equal to second string. So you will get something like this:
for (var i = 0; i < s_1.length; i++) {
if s_2 == s_1.slice(0, i) + s_1.slice(i + 1) {
return true
}
}
For insertions and substitution you will need to know the alphabet of all characters. You can define it as a big string var alphabet = "abcde....". Now you do a similar thing, but when you introduce substitution or insertion, you also iterate over all elements in your alphabet. I am not planning to write the whole code here.
A couple of additional things. You can make a lot of micro-optimizations here. For example if the length of two strings are different by more than 1, they clearly can't have a distance 1. Another one relates to the frequencies of underlying characters in the string.
I have some code that compares two strings and returns the one with the lowest value, then compares that value to the letterGrade to see which one is the lowest and then returns the higher of the two. My issue is that I am having issues getting this code to work using javascript.
Javascript code:
var result1 = (finalexam.getletterGrade().compareTo(midexam.getletterGrade()));
var result2 = " ";
if (result1 == 0)
{
result2 = midexam.getletterGrade();
}
else if (result1 < 0)
{
result2 = midexam.getletterGrade();
}
else if (result1 > 0)
{
result2 = finalexam.getletterGrade();
}
var result3 = (letterGrade.compareTo(result2));
if (result3 < 0)
{
result2 = letterGrade;
}
JavaScript int doesn't have a ternary compareTo like Java. You could write one. Something like,
function compare(a, b) {
if (a < b) {
return -1;
} else if (a > b) {
return 1;
}
return 0;
}
Then change your compareTo calls to invoke it. Something like
var result1 = compare(finalexam.getletterGrade(), midexam.getletterGrade());
and
var result3 = compare(letterGrade, result2);
my issue was that our backend code sorted our content using Java's compareTo, because of a misunderstanding I attempted to rewrite compareTo in JavaScript, whereas the fix ended up something completely different, however this may solve your issue
String.prototype.compareTo = function (anotherString) {
var len1 = this.length;
var len2 = anotherString.length;
var n = Math.min(len1,len2); //If the lengths will always be the same skip n to be assigned to len1 only, get rid of len2
var v1 = this;
var v2= new String(anotherString);
// You could instead set these vars i and j to passed in offsets if you want to start sorting from, in which case else case will be possible, or only the first i == j if statement will always be run
var i = 0;
var j = 0;
if (i == j) {
var k = i;
var lim = n + i;
while (k < lim) {
var c1 = v1[k];
var c2 = v2[k];
if (c1 != c2) {
var returnVal = c1.charCodeAt(0) - c2.charCodeAt(0);
if (returnVal > 0)
return 1;
else
return -1
return 0;
}
k++;
}
} else {
while (n-- != 0) {
var c1 = v1[i++];
var c2 = v2[j++];
if (c1 != c2) {
var returnVal = c1.charCodeAt(0) - c2.charCodeAt(0);
if (returnVal > 0)
return 1;
else
return -1
return 0;
}
}
}
return len1 - len2;
//return 0; //if lengths are always the same, comment the above line, uncomment this line
}
Call it like below
var a = "whatever";
var b = "however";
a.compareTo(b);
Hope this helps
I have an array being built like this:
var entries = ['L','L','L','L','L','L','L','L','L','L','R','R','R','R','R','R','R','R','R','R','M','M','M','M','M']
This means the array is always filled with 10 times L, 10 times R and 5 times M
The output I want to achieve is a randomly generated array, so I came up with the simple solution to just shuffle it with
function shuffle(o){
for(var j, x, i = o.length; i; j = Math.floor(Math.random() * i), x = o[--i], o[i] = o[j], o[j] = x);
return o;
}
The problem I have now is that there is a rule for the outcome, never have one of these letters more than 2 times in a row. So I thought I just use a do/while loop to shuffle until that criteria is met. But in my test runs this totally fails with long loops.
So my question is - what is the best way to build this array without depending on luck. My full program that fails is something like this
function shuffle(o){
for(var j, x, i = o.length; i; j = Math.floor(Math.random() * i), x = o[--i], o[i] = o[j], o[j] = x);
return o;
}
function createProgram(numShooters){
var programs = [];
for(var s = 0; s < numShooters; s++){
//Build array with L/R/M
for( d=0; d < 10; d++ ){
program.push('L');
program.push('R');
if(d < 5){
program.push('M');
}
}
// This will run way too long and is not reliable
//do{
// program = shuffle(program);
//}while(!checkProgram(program))
if(!checkProgram(program)){
console.log('invalid program at ' + s);
}
programs[s] = program;
}
return programs;
}
function checkProgram(program){
var len = program.length;
var last = null;
var dups = 0;
for(var i=0; i<len; len++){
if(program[i] == last){
dups++;
}else{
dups = 0;
}
if(dups == 2){
return false;
}
last = program[i];
}
return true;
}
createProgram(5);
Instead of just shuffle arrays and hope for one without duplicates, you can create them by picking characters by random and specifically avoid to pick the character that was picked previously.
If you keep track of how many there are left to pick of each character, you can control the odds for the character to pick so that the distribution is correct. If for example the first two characters are L, then there are 10 R and 5 M left to pick from (and 8 L, but they are excluded for the next pick), so there should be a 2 in 3 chance to pick an R and a 1 in 3 chance to pick and M.
This approach can run into a dead end, where the array can't be completed, so it has to start over. Running it a few hundred times I have seen something like a 10% overhead, so if you create five arrays you should by average see a retry every other time.
function createProgram(numShooters){
var programs = [];
for(var s = 0; s < numShooters; s++){
var chars = [ 'L', 'R', 'M' ];
var program;
do {
program = [];
var cnt = [ 10, 10, 5, 0 ]; // picks left
var prev = 3; // previous pick
var tot = 25; // total picks left
while (program.length < 25) {
// check for duplicates
var x = program.length >= 2 && program[program.length - 2] == program[program.length - 1] ? prev : 3;
// check if more picks are possible
if (tot - cnt[x] <= 0) {
console.log('invalid program ' + program);
break;
}
// pick from the possible
var r = Math.floor(Math.random() * (tot - cnt[x]));
// determine what character was picked
var c = 0;
while (c == x || r >= cnt[c]) {
if (c != x) r -= cnt[c];
c++;
}
program.push(chars[c]);
cnt[c]--;
tot--;
prev = c;
}
} while (program.length < 25);
programs[s] = program;
}
return programs;
}
console.log(createProgram(1).toString());
So this is the final solution I came up with, as commented Guffas solution also works nice and smooth. But after doing some tests mine is about 30% faster and more readable, but way longer so I'll accept Guffas - thanks to everybody for their input!
function createProgram(numShooters){
var programs = [];
for(var s = 0; s < numShooters; s++){
var program = buildProgram();
programs[s] = program;
}
return programs;
}
function buildProgram(){
var program = [];
var ls = fillArray('L',10);
var rs = fillArray('R',10);
var ms = fillArray('M',5);
//Use either L or R to mix into M - adds variation
var side = Math.random() > 0.5 ? ls : rs;
var otherSide = side == ls ? rs : ls;
var initProg = side.concat(ms);
initProg = shuffle(initProg);
var program = [];
//Correcting invalid positions as suggested
for(var p1 = 0; p1 < initProg.length; p1++){
if(p1 > 1 && initProg[p1-1] == initProg[p1-2] && initProg[p1-1] == initProg[p1]){
if(otherSide.length > 0){
program.push(otherSide.pop());
}else{
return buildProgram(); //impossible state, redo...
}
}
program.push(initProg[p1]);
}
//Fill into remaining other pos
for(var p2 = 0; p2 < otherSide.length; p2++){
program = addAtRandomPos(program,otherSide[p2]);
}
return program;
}
function addAtRandomPos(arr,chr){
var pos = getRandomInt( 0, arr.length - 1 );
var charCur = arr[pos];
var pprev = pos > 1 ? arr[pos-2] : null;
var prev = pos > 0 ? arr[pos-1] : null;
var next = pos < arr.length - 1 ? arr[pos] : null;
var nnext = pos < arr.length - 2 ? arr[pos+1] : null;
var str = pprev + prev + chr + next + nnext;
if(str.indexOf('MMM') !== -1 || str.indexOf('RRR') !== -1 || str.indexOf('LLL') !== -1){
return addAtRandomPos(arr,chr);
}else{
arr.splice(pos,0,chr);
}
return arr;
}
function getRandomInt (min, max) {
return Math.floor(Math.random() * (max - min + 1)) + min;
}
function fillArray(chr,num){
var arr = [];
for(i = 0; i < num; i++){
arr.push(chr);
}
return arr;
}
So I have a random javascript array of names...
[#larry,#nicholas,#notch] etc.
They all start with the # symbol. I'd like to sort them by the Levenshtein Distance so that the the ones at the top of the list are closest to the search term. At the moment, I have some javascript that uses jQuery's .grep() on it using javascript .match() method around the entered search term on key press:
(code edited since first publish)
limitArr = $.grep(imTheCallback, function(n){
return n.match(searchy.toLowerCase())
});
modArr = limitArr.sort(levenshtein(searchy.toLowerCase(), 50))
if (modArr[0].substr(0, 1) == '#') {
if (atRes.childred('div').length < 6) {
modArr.forEach(function(i){
atRes.append('<div class="oneResult">' + i + '</div>');
});
}
} else if (modArr[0].substr(0, 1) == '#') {
if (tagRes.children('div').length < 6) {
modArr.forEach(function(i){
tagRes.append('<div class="oneResult">' + i + '</div>');
});
}
}
$('.oneResult:first-child').addClass('active');
$('.oneResult').click(function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
It also has some if statements detecting if the array contains hashtags (#) or mentions (#). Ignore that. The imTheCallback is the array of names, either hashtags or mentions, then modArr is the array sorted. Then the .atResults and .tagResults elements are the elements that it appends each time in the array to, this forms a list of names based on the entered search terms.
I also have the Levenshtein Distance algorithm:
var levenshtein = function(min, split) {
// Levenshtein Algorithm Revisited - WebReflection
try {
split = !("0")[0]
} catch(i) {
split = true
};
return function(a, b) {
if (a == b)
return 0;
if (!a.length || !b.length)
return b.length || a.length;
if (split) {
a = a.split("");
b = b.split("")
};
var len1 = a.length + 1,
len2 = b.length + 1,
I = 0,
i = 0,
d = [[0]],
c, j, J;
while (++i < len2)
d[0][i] = i;
i = 0;
while (++i < len1) {
J = j = 0;
c = a[I];
d[i] = [i];
while(++j < len2) {
d[i][j] = min(d[I][j] + 1, d[i][J] + 1, d[I][J] + (c != b[J]));
++J;
};
++I;
};
return d[len1 - 1][len2 - 1];
}
}(Math.min, false);
How can I work with algorithm (or a similar one) into my current code to sort it without bad performance?
UPDATE:
So I'm now using James Westgate's Lev Dist function. Works WAYYYY fast. So performance is solved, the issue now is using it with source...
modArr = limitArr.sort(function(a, b){
levDist(a, searchy)
levDist(b, searchy)
});
My problem now is general understanding on using the .sort() method. Help is appreciated, thanks.
Thanks!
I wrote an inline spell checker a few years ago and implemented a Levenshtein algorithm - since it was inline and for IE8 I did quite a lot of performance optimisation.
var levDist = function(s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (var i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (var i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (var j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
}
I came to this solution:
var levenshtein = (function() {
var row2 = [];
return function(s1, s2) {
if (s1 === s2) {
return 0;
} else {
var s1_len = s1.length, s2_len = s2.length;
if (s1_len && s2_len) {
var i1 = 0, i2 = 0, a, b, c, c2, row = row2;
while (i1 < s1_len)
row[i1] = ++i1;
while (i2 < s2_len) {
c2 = s2.charCodeAt(i2);
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1.charCodeAt(i1) === c2 ? 0 : 1);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
return b;
} else {
return s1_len + s2_len;
}
}
};
})();
See also http://jsperf.com/levenshtein-distance/12
Most speed was gained by eliminating some array usages.
Updated: http://jsperf.com/levenshtein-distance/5
The new Revision annihilates all other benchmarks. I was specifically chasing Chromium/Firefox performance as I don't have an IE8/9/10 test environment, but the optimisations made should apply in general to most browsers.
Levenshtein Distance
The matrix to perform Levenshtein Distance can be reused again and again. This was an obvious target for optimisation (but be careful, this now imposes a limit on string length (unless you were to resize the matrix dynamically)).
The only option for optimisation not pursued in jsPerf Revision 5 is memoisation. Depending on your use of Levenshtein Distance, this could help drastically but was omitted due to its implementation specific nature.
// Cache the matrix. Note this implementation is limited to
// strings of 64 char or less. This could be altered to update
// dynamically, or a larger value could be used.
var matrix = [];
for (var i = 0; i < 64; i++) {
matrix[i] = [i];
matrix[i].length = 64;
}
for (var i = 0; i < 64; i++) {
matrix[0][i] = i;
}
// Functional implementation of Levenshtein Distance.
String.levenshteinDistance = function(__this, that, limit) {
var thisLength = __this.length, thatLength = that.length;
if (Math.abs(thisLength - thatLength) > (limit || 32)) return limit || 32;
if (thisLength === 0) return thatLength;
if (thatLength === 0) return thisLength;
// Calculate matrix.
var this_i, that_j, cost, min, t;
for (i = 1; i <= thisLength; ++i) {
this_i = __this[i-1];
for (j = 1; j <= thatLength; ++j) {
// Check the jagged ld total so far
if (i === j && matrix[i][j] > 4) return thisLength;
that_j = that[j-1];
cost = (this_i === that_j) ? 0 : 1; // Chars already match, no ++op to count.
// Calculate the minimum (much faster than Math.min(...)).
min = matrix[i - 1][j ] + 1; // Deletion.
if ((t = matrix[i ][j - 1] + 1 ) < min) min = t; // Insertion.
if ((t = matrix[i - 1][j - 1] + cost) < min) min = t; // Substitution.
matrix[i][j] = min; // Update matrix.
}
}
return matrix[thisLength][thatLength];
};
Damerau-Levenshtein Distance
jsperf.com/damerau-levenshtein-distance
Damerau-Levenshtein Distance is a small modification to Levenshtein Distance to include transpositions. There is very little to optimise.
// Damerau transposition.
if (i > 1 && j > 1 && this_i === that[j-2] && this[i-2] === that_j
&& (t = matrix[i-2][j-2]+cost) < matrix[i][j]) matrix[i][j] = t;
Sorting Algorithm
The second part of this answer is to choose an appropriate sort function. I will upload optimised sort functions to http://jsperf.com/sort soon.
I implemented a very performant implementation of levenshtein distance calculation if you still need this.
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
It was my answer to a similar SO question
Fastest general purpose Levenshtein Javascript implementation
Update
A improved version of the above is now on github/npm see
https://github.com/gustf/js-levenshtein
The obvious way of doing this is to map each string to a (distance, string) pair, then sort this list, then drop the distances again. This way you ensure the levenstein distance only has to be computed once. Maybe merge duplicates first, too.
I would definitely suggest using a better Levenshtein method like the one in #James Westgate's answer.
That said, DOM manipulations are often a great expense. You can certainly improve your jQuery usage.
Your loops are rather small in the example above, but concatenating the generated html for each oneResult into a single string and doing one append at the end of the loop will be much more efficient.
Your selectors are slow. $('.oneResult') will search all elements in the DOM and test their className in older IE browsers. You may want to consider something like atRes.find('.oneResult') to scope the search.
In the case of adding the click handlers, we may want to do one better avoid setting handlers on every keyup. You could leverage event delegation by setting a single handler on atRest for all results in the same block you are setting the keyup handler:
atRest.on('click', '.oneResult', function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
See http://api.jquery.com/on/ for more info.
I just wrote an new revision: http://jsperf.com/levenshtein-algorithms/16
function levenshtein(a, b) {
if (a === b) return 0;
var aLen = a.length;
var bLen = b.length;
if (0 === aLen) return bLen;
if (0 === bLen) return aLen;
var len = aLen + 1;
var v0 = new Array(len);
var v1 = new Array(len);
var i = 0;
var j = 0;
var c2, min, tmp;
while (i < len) v0[i] = i++;
while (j < bLen) {
c2 = b.charAt(j++);
v1[0] = j;
i = 0;
while (i < aLen) {
min = v0[i] - (a.charAt(i) === c2 ? 1 : 0);
if (v1[i] < min) min = v1[i];
if (v0[++i] < min) min = v0[i];
v1[i] = min + 1;
}
tmp = v0;
v0 = v1;
v1 = tmp;
}
return v0[aLen];
}
This revision is faster than the other ones. Works even on IE =)