Multiple specials characters replacement optimization

Multiple specials characters replacement optimization - javascript

I need to replace all the specials characters in a string with javascript or jQuery.
I am sure there is a better way to do this.
But I currently have no clue.
Anyone got an idea?
function Unaccent(str) {
var norm = new Array('À','Á','Â','Ã','Ä','Å','Æ','Ç','È','É','Ê','Ë','Ì','Í','Î','Ï', 'Ð','Ñ','Ò','Ó','Ô','Õ','Ö','Ø','Ù','Ú','Û','Ü','Ý','Þ','ß', 'à','á','â','ã','ä','å','æ','ç','è','é','ê','ë','ì','í','î','ï','ð','ñ', 'ò','ó','ô','õ','ö','ø','ù','ú','û','ü','ý','ý','þ','ÿ');
var spec = new Array('A','A','A','A','A','A','A','C','E','E','E','E','I','I','I','I', 'D','N','O','O','O','0','O','O','U','U','U','U','Y','b','s', 'a','a','a','a','a','a','a','c','e','e','e','e','i','i','i','i','d','n', 'o','o','o','o','o','o','u','u','u','u','y','y','b','y');
for (var i = 0; i < spec.length; i++) {
str = replaceAll(str, norm[i], spec[i]);
}
return str;
}
function replaceAll(str, search, repl) {
while (str.indexOf(search) != -1) {
str = str.replace(search, repl);
}
return str;
}

Here's a version using a lookup map that works a little more efficiently than nested loops:
function Unaccent(str) {
var map = Unaccent.map; // shortcut
var result = "", srcChar, replaceChar;
for (var i = 0, len = str.length; i < len; i++) {
srcChar = str.charAt(i);
// use hasOwnProperty so we never conflict with any
// methods/properties added to the Object prototype
if (map.hasOwnProperty(srcChar)) {
replaceChar = map[srcChar]
} else {
replaceChar = srcChar;
}
result += replaceChar;
}
return(result);
}
// assign this here so it is only created once
Unaccent.map = {'À':'A','Á':'A','Â':'A'}; // you fill in the rest of the map
Working demo: http://jsfiddle.net/jfriend00/rRpcy/
FYI, a Google search for "accent folding" returns many other implementations (many similar, but also some using regex).
Here's a bit higher performance version (2.5x faster) that can do a direct indexed lookup of the accented characters rather than having to do an object lookup:
function Unaccent(str) {
var result = "", code, lookup, replaceChar;
for (var i = 0, len = str.length; i < len; i++) {
replaceChar = str.charAt(i);
code = str.charCodeAt(i);
// see if code is in our map
if (code >= 192 && code <= 255) {
lookup = Unaccent.map.charAt(code - 192);
if (lookup !== ' ') {
replaceChar = lookup;
}
}
result += replaceChar;
}
return(result);
}
// covers chars from 192-255
// blank means no mapping for that char
Unaccent.map = "AAAAAAACEEEEIIIIDNOOOOO OUUUUY aaaaaaaceeeeiiiionooooo uuuuy y";
Working demo: http://jsfiddle.net/jfriend00/Jxr9u/
In this jsperf, the string lookup version (the 2nd example) is about 2.5x faster.

Using an object as a map is a good idea, but given the number of characters you're replacing, it's probably a good idea to pre-initialize the object so that it doesn't have to be re-initialized each time the function gets run (assuming you're running the function more than once):
var Unaccent = (function () {
var charMap = {'À':'A','Á':'A','Â':'A','Ã':'A','Ä':'A' /** etc. **/};
return function (str) {
var i, modified = "", cur;
for(i = 0; i < str.length; i++) {
cur = str.charAt(i);
modified += (charMap[cur] || cur);
}
return modified;
};
}());
This will front-load the heavy lifting of the function to page load time (you can do some modifications to delay it until the first call to the function if you like). But it will take some of the processing time out of the actual function call.
It's possible some browsers will actually optimize this part anyway, so you might not see a benefit. But on older browsers (where performance is of greater concern), you'll probably see some benefit to pre-processing your character map.

You can prepare key value pair type of array and via jquery each traverse that array.
Example :
function Unaccent(str) {
var replaceString = {'À':'A','Á':'A','Â':'A'}; // add more
$.each(replaceString, function(k, v) {
var regX = new RegExp(k, 'g');
str = str.replace(regX,v);
});
}
Working Demo
Good Luck !!

Related

Performance comparison with V8

I'm currently testing multiple cases for parsing lines.
Each line is formatted like that:
"dHdX5jOa7ww9cGsW7jQF=dHdX5jOa7ww9cGsW7jQF=dHdX5jOa7ww9cGsW7jQF=dHdX5jOa7ww9cGsW7jQF"
There are a lot of lines of course, and I need to extract the key, and the value.
The key is delimited by the first "=" found.
There is never a "=" char in the key.
The value is the rest of string next after the first "=" sign.
So for this exemple the result should be:
{
key: "dHdX5jOa7ww9cGsW7jQF",
value: "dHdX5jOa7ww9cGsW7jQF=dHdX5jOa7ww9cGsW7jQF=dHdX5jOa7ww9cGsW7jQF"
}
From here we can iterate on multiple solutions:
// the first one is not very efficient with split splice join method
function first(line) {
const lineSplit = line.split('='),
key = lineSplit[0],
value = lineSplit.splice(1, lineSplit.length).join('=');
return {
key,
value
};
}
// the second one execute only what i want to do
// with built-in String prototype's functions
function optimized(line) {
const index = line.indexOf("="),
key = line.substr(0, index),
value = line.substr(index + 1, line.length);
return {
key,
value
};
}
// i tried to code the logic myself
function homemade(line) {
const len = line.length;
let value = "", key = "", valued = false;
for (let i = 0; i < len; ++i) {
const char = line[i];
if (valued === false) {
if (char !== '=') {
key += char;
} else {
valued = true;
}
} else {
value += char;
}
}
return {
key,
value
};
}
// and next recode substr and foreach built-in to implemant the same
// function but with homemade substr&foreach
String.prototype.substr2 = function(from, to){
let str = "";
for (let i = from; i < to; ++i) {
str += this[i];
}
return str;
};
String.prototype.indexOf2 = function(occ){
const len = this.length;
for (let i = 0; i < len; ++i) {
if (this[i] === occ) {
return i;
}
}
return -1;
};
function overload(line) {
const index = line.indexOf2("="),
key = line.substr2(0, index),
value = line.substr2(index + 1, line.length);
return {
key,
value
};
}
And voila the results with jsBench:
[I'm using Google Chrome Version 59.0.3071.104 (Official Build) (64-bit)]
You can checkout the results of these functions with your browser in this jsBench
I don't understand what is going on. I imagined that cannot be possible since I wrote only the code i needed with native for() and other stuffs like this...
My questions are:
Why the builtin string operations are obviously much faster ?
Why this repeated string concatenation is inneficient ?
Is there an alternative to it ?

Why the builtin string operations are obviously much faster ?
Because they are optimized, and use internal implementation tricks that are not available to JavaScript code. For example, they avoid repeated string concatenation by building the result in one go.
Why this repeated string concatenation is inefficient ?
Because it creates many strings as intermediate results.
Is there an alternative to it ?
Use the builtin string operations :-)

UTF-8 support for regular expression in Javascript

I am trying to create a Javascript function that would find all positions for a set of patterns inside a UTF-8 string. For example:
I have a string "detaj" (it's a transcription written with International Phonetic Alphabet symbols, so I need a full UTF-8 support).
And I have an array of patterns: ["(?!dʒ)d", "(?!tʃ)t"] (each string is also UTF-8 encoded).
I need to find the position of each pattern and obtain the following array:
[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]
0 - is the position of the symbol "d", 2 - is the position of the symbol "t".
I started with this function:
https://stackoverflow.com/a/3410557/2006215
var str = "I learned to play the Ukulele in Lebanon."
var regex = /le/gi, result, indices = [];
while ( (result = regex.exec(str)) ) {
indices.push(result.index);
}
And I changed it to something like this:
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
var allIndices = [];
for (var i = 0; i < sounds_regex_array.length; i++) {
var currentSoundRegex = sounds_regex_array[i];
// straightforward approach doesn't work:
//var pattern = new RegExp(currentSoundRegex, "g");
// hexEncode is taken from here - https://stackoverflow.com/a/21648161/2006215 - doesn't work:
//var pattern = new RegExp(currentSoundRegex.hexEncode, "g");
// I'm trying to use utf8.js plugin from here - https://github.com/mathiasbynens/utf8.js - doesn't work:
var pattern = new RegExp(utf8.encode(currentSoundRegex), "g");
var indices = getIndicesOfRegex (pattern, word_transcription);
for (var j = 0; j < indices.length; j++) {
allIndices.push([indices[j], currentSoundRegex ]);
}
}
return allIndices;
}
function getIndicesOfRegex (regex, str) {
var result, indices = [];
while (result = regex.exec(str)) {
indices.push(result.index);
}
return indices;
}
Anybody has any ideas?
UPDATE: I take both the transcription and the regex patterns from json file that I generate with PHP from UTF-8 strings. I am not sure how to call this, but it's not UTF-8. In any case it doesn't work with my Javascript function.
var questions = [{"word":"sorte","word_transcription":"s\u0254\u0281t","sounds_array":["d","t"],"sounds_regex_array":["(?!d\u0292)d","(?!t\u0283)t"]}];

I found where the problem was.
The error was triggered because I tried to execute lookbehind in Javascript, which is not supported.
The workaround for custom lookbehind functions is proposed here - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
But finally I just did my own modifications of the code. The above functions require XRegExp library, which is pretty heavy.
My solution:
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
var result, indices = [];
while (result = pattern.exec(str)) {
if ((currentSoundRegex === "ʒ") && (result.index > 0) && (str.substring(result.index-1, result.index) === "d")) { continue; }
if ((currentSoundRegex === "ʃ") && (result.index > 0) && (str.substring(result.index-1, result.index) === "t")) { continue; }
indices.push(result.index);
}
return indices;
}
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
var allIndices = [];
for (var i = 0; i < sounds_regex_array.length; i++) {
var currentSoundRegex = sounds_regex_array[i];
// lookbehind doesn't work in Javascript:
// possible workaround - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
if (currentSoundRegex === "(?<!d)ʒ") {
currentSoundRegex = "ʒ";
}
if (currentSoundRegex === "(?<!t)ʃ") {
currentSoundRegex = "ʃ";
}
var pattern = new RegExp(currentSoundRegex, "g");
var indices = getIndicesOfRegex (currentSoundRegex, pattern, word_transcription);
var currentSound = sounds_array[i];
for (var j = 0; j < indices.length; j++) {
allIndices.push([indices[j], currentSound]);
}
}
return allIndices;
}

indexOf() : is there a better way to implement this?

EDIT
Thank you guys, and i apologize for not being more specific in my question.
This code was written to check if a characters in the second string is in the first string. If so, it'll return true, otherwise a false.
So my code works, I know that much, but I am positive there's gotta be a better way to implement this.
Keep in mind this is a coding challenge from Freecodecamp's Javascript tree.
Here's my code:
function mutation(arr) {
var stringOne = arr[0].toLowerCase();
var stringTwo = arr[1].toLowerCase().split("");
var i = 0;
var truthyFalsy = true;
while (i < arr[1].length && truthyFalsy) {
truthyFalsy = stringOne.indexOf(stringTwo[i]) > -1;
i++
}
console.log(truthyFalsy);
}
mutation(["hello", "hey"]);
//mutation(["hello", "yep"]);
THere's gotta be a better way to do this. I recently learned about the map function, but not sure how to use that to implement this, and also just recently learned of an Array.prototype.every() function, which I am going to read tonight.
Suggestions? Thoughts?

the question is very vague. however what i understood from the code is that you need to check for string match between two strings.
Since you know its two strings, i'd just pass them as two parameters. additionally i'd change the while into a for statement and add a break/continue to avoid using variable get and set.
Notice that in the worst case its almost the same, but in the best case its half computation time.
mutation bestCase 14.84499999999997
mutation worstCase 7.694999999999993
bestCase: 5.595000000000027
worstCase: 7.199999999999989
// your function (to check performance difference)
function mutation(arr) {
var stringOne = arr[0].toLowerCase();
var stringTwo = arr[1].toLowerCase().split("");
var i = 0;
var truthyFalsy = true;
while (i < arr[1].length && truthyFalsy) {
truthyFalsy = stringOne.indexOf(stringTwo[i]) > -1;
i++
}
return truthyFalsy;
}
function hasMatch(base, check) {
var strOne = base.toLowerCase();
var strTwo = check.toLowerCase().split("");
var truthyFalsy = false;
// define both variables (i and l) before the loop condition in order to avoid getting the length property of the string multiple times.
for (var i = 0, l = strTwo.length; i < l; i++) {
var hasChar = strOne.indexOf(strTwo[i]) > -1;
if (hasChar) {
//if has Char, set true and break;
truthyFalsy = true;
break;
}
}
return truthyFalsy;
}
var baseCase = "hello";
var bestCaseStr = "hey";
var worstCaseStr = "yap";
//bestCase find match in first iteration
var bestCase = hasMatch("hello", bestCaseStr);
console.log(bestCase);
//worstCase loop over all of them.
var worstCase = hasMatch("hello", worstCaseStr);
console.log(worstCase);
// on your function
console.log('mutation bestCase', checkPerf(mutation, [baseCase, bestCaseStr]));
console.log('mutation worstCase', checkPerf(mutation, [baseCase, worstCaseStr]));
// simple performance check
console.log('bestCase:', checkPerf(hasMatch, baseCase, bestCaseStr));
console.log('worstCase:', checkPerf(hasMatch, baseCase, worstCaseStr));
function checkPerf(fn) {
var t1 = performance.now();
for (var i = 0; i < 10000; i++) {
fn(arguments[1], arguments[2]);
}
var t2 = performance.now();
return t2 - t1;
}

Object Oriented Javascript Chapter 4 Exercise 4

Hi all I'm learning Javascript with the Stoyan Stefanov's book. I'm stuck on Chapter 4 Exercise 4:
Imagine the String()constructor didn't exist. Create a constructor
function MyString()that acts like String()as closely as possible.
You're not allowed to use any built-in string methods or properties,
and remember that String()doesn't exist. You can use this code to
test your constructor:
>>> var s = new MyString('hello');
>>> s[0];
"h"
I can't think on a way to achieve "s[0]", at least not with the knowledge I have now.
Any thoughts?
Thanks

Objects can have properties of themselves defined using array like syntax. String chars can be accessed with array like syntax.
function MyString (str) {
this.length = 0; // string length
var i = 0;
while(str[i] != undefined) {
this.length++;
i++;
}
for (var i=0; i< this.length;i++)
{
this[i]=str[i];
}
}
var s=new MyString('hello');
alert(s[0]); //h

here is my solution for this exercice :
function MyString(msg){
var array_msg = msg.split("");
array_msg.toString = function(){
return array_msg.join("");
};
array_msg.valueOf = function(){
return array_msg.toString();
};
array_msg.charAt = function(i){
if(array_msg[i] === undefined){
return array_msg[0];
}else{return array_msg[i];}
};
array_msg.concat = function(msg2){
return array_msg.join("")+" "+msg2;
};
array_msg.slice = function(d,f){
var res = "";
if(f<0){
f = array_msg.length + f;
}
for(var i=d; i<f; i++){
res += array_msg[i]
}
return res;
};
array_msg.split = function(el){
return array_msg.toString().split(el);
};
return array_msg;
}

A slight variation of the above...more of a tweak than anything
var MyString = function (s) {
for (var i = 0; i < s.length; i++){
this[i] = s[i]
}
this.length = function() .....
You also don't need to assign it to anything as extra as the comment suggests. this[i] will be created for the length of the string passed to s
EDIT:
Part of the question in the book says not to use existing string methods so can't use charAt so I've switched it to s[I]

This is another variation of one of the above solutions but instead of using a for loop I am using a while loop. I don't usually use while loops for these kinds of things but it worked really well here.
Adding the length property is optional.
function MyString(str) {
this.length = 0; // Creating an optional length property
this.value = str;
var i = 0;
while(str[i] != undefined) {
this[i] = str[i];
this.length++;
i++;
}
}
var name = new MyString('billy');
console.log(name.value); // 'billy'
console.log(name[0]); // 'b'
console.log(name.length); // 5

interpolate tags in strings using only text offsets

I've been struggling with javascript string methods and regexes, and I may be overlooking something obvious. I hope I violate no protocol by restating tofutim's question in some more detail. Responses to his question focus upon s.replace(), but for that to work, you have to know which occurrence of a substring to replace, replace all of them, or be able to identify somehow uniquely the string to replace by means of a regex. Like him, I only have an array of text offsets like this:
[[5,9], [23,27]]
and a string like this:
"eggs eggs spam and ham spam"
Given those constraints, is there a straightforward way (javaScript or some shortcut with jQuery) to arrive at a string like this?
"eggs <span>eggs</span> spam and ham <span>spam</span>"
I don't know in advance what the replacement strings are, or how many occurrences of them there might be in the base text. I only know their offsets, and it is only the occurrences identified by their offsets that I want to wrap with tags.
any thoughts?

I found a way to do it with regexp. Not sure about performance, but it's short and sweet:
/**
* replaceOffset
* #param str A string
* #param offs Array of offsets in ascending order [[2,4],[6,8]]
* #param tag HTML tag
*/
function replaceOffset(str, offs, tag) {
tag = tag || 'span';
offs.reverse().forEach(function(v) {
str = str.replace(
new RegExp('(.{'+v[0]+'})(.{'+(v[1]-v[0])+'})'),
'$1<'+tag+'>$2</'+tag+'>'
);
});
return str;
}
Demo: http://jsbin.com/aqowum/3/edit

iquick solution (not tested)
function indexWrap(indexArr,str){
// explode into array of each character
var chars = str.split('');
// loop through the MD array of indexes
for(var i=0; i<indexArr.length;i++){
var indexes = indexArr[i];
// if the two indexes exist in the character array
if(chars[indexes[0]] && chars[indexes[1]]){
// add the tag into each index
chars.splice(indexes[0],0,"<span>");
chars.splice(indexes[1],0,"</span>");
}
}
// return the joined string
return chars.join('');
}
Personally, I like a string replace solution, but if you dont want one, this might work

You can try slice method.
var arr = [[5,9], [23,27]];
arr = arr.reverse()
$.each(arr, function(i, v){
var f = v[0], last = v[1];
$('p').html(function(i, v){
var o = v.slice(0, f);
var a = '<span>' + v.slice(f, last) + '</span>';
var c = v.slice(last, -1);
return o+a+c
})
})
http://jsfiddle.net/rjQt7/

First, you'd want to iterate backwards, in order to make sure you won't eventually overwrite the replacements previously made, however, in my example it is not important because the string is reassembled all at once in the very end.
// > interpolateOnIndices([[5,9], [23,27]], "eggs eggs spam and ham spam");
// < 'eggs <span>eggs</span> spam and ham <span>spam</span>'
function interpolateOnIndices(indices, string) {
"use strict";
var i, pair, position = string.length,
len = indices.length - 1, buffer = [];
for (i = len; i >= 0; i -= 1) {
pair = indices[i];
buffer.unshift("<span>",
string.substring(pair[0], pair[1]),
"</span>",
string.substring(pair[1], position));
position = pair[0];
}
buffer.unshift(string.substr(0, position));
return buffer.join("");
}
This is a little bit better then the example with spliceing, because it doesn't create additional arrays (splice in itself will create additional arrays). Using mapping and creating functions repeatedly inside other functions is a certain memory hog, but it doesn't run very fast either... Although, it is a little bit shorter.
On large strings joining should, theoretically, give you an advantage over multiple concatenations because memory allocation will be made once, instead of subsequently throwing away a half-baked string. Of course, all these need not concern you, unless you are processing large amounts of data.
EDIT:
Because I had too much time on my hands, I decided to make a test, to see how variations will compare on a larger (but fairly realistic) set of data, below is my testing code with some results...
function interpolateOnIndices(indices, string) {
"use strict";
var i, pair, position = string.length,
len = indices.length - 1, buffer = [];
for (i = len; i >= 0; i -= 1) {
pair = indices[i];
buffer.unshift("<span>",
string.substring(pair[0], pair[1]),
"</span>",
string.substring(pair[1], position));
position = pair[0];
}
buffer.unshift(string.substr(0, position));
return buffer.join("");
}
function indexWrap(indexArr, str) {
var chars = str.split("");
for(var i = 0; i < indexArr.length; i++) {
var indexes = indexArr[i];
if(chars[indexes[0]] && chars[indexes[1]]){
chars.splice(indexes[0], 0, "<span>");
chars.splice(indexes[1], 0, "</span>");
}
}
return chars.join("");
}
function replaceOffset(str, offs, tag) {
tag = tag || "span";
offs.reverse().forEach(
function(v) {
str = str.replace(
new RegExp("(.{" + v[0] + "})(.{" + (v[1] - v[0]) + "})"),
"$1<" + tag + ">$2</" + tag + ">"
);
});
return str;
}
function generateLongString(pattern, times) {
"use strict";
var buffer = new Array(times);
while (times >= 0) {
buffer[times] = pattern;
times -= 1;
}
return buffer.join("");
}
function generateIndices(pattern, times, step) {
"use strict";
var buffer = pattern.concat(), block = pattern.concat();
while (times >= 0) {
block = block.concat();
block[0] += step;
block[1] += step;
buffer = buffer.concat(block);
times -= 1;
}
return buffer;
}
var longString = generateLongString("eggs eggs spam and ham spam", 100);
var indices = generateIndices([[5,9], [23,27]], 100,
"eggs eggs spam and ham spam".length);
function speedTest(thunk, times) {
"use strict";
var start = new Date();
while (times >= 0) {
thunk();
times -= 1;
}
return new Date() - start;
}
speedTest(
function() {
replaceOffset(longString, indices, "span"); },
100); // 1926
speedTest(
function() {
indexWrap(indices, longString); },
100); // 559
speedTest(
function() {
interpolateOnIndices(indices, longString); },
100); // 16
Tested against V8 (Node.js) on amd64 Linux (FC-17).
I didn't test the undefined's answer because I didn't want to load that library, especially so it doesn't do anything useful for this test. I would imagine it will lend somewhere between andbeyond's and elclanrs's variants, more towards elclanrs's answer though.

you may use the substring method
String.substring (startIndex, endIndex);
description: return the string between start & end index
usage:
var source="hello world";
var result=source.substring (3,7); //returns 'lo wo'
you already have an array with initial & final index, so you are almost done :)

Develop Reference

JavaScript is the programming language of the Web.

Multiple specials characters replacement optimization - javascript

Related

Performance comparison with V8

UTF-8 support for regular expression in Javascript

indexOf() : is there a better way to implement this?

Object Oriented Javascript Chapter 4 Exercise 4

interpolate tags in strings using only text offsets

Categories

Resources