UTF-8 support for regular expression in Javascript - javascript

I am trying to create a Javascript function that would find all positions for a set of patterns inside a UTF-8 string. For example:
I have a string "detaj" (it's a transcription written with International Phonetic Alphabet symbols, so I need a full UTF-8 support).
And I have an array of patterns: ["(?!dʒ)d", "(?!tʃ)t"] (each string is also UTF-8 encoded).
I need to find the position of each pattern and obtain the following array:
[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]
0 - is the position of the symbol "d", 2 - is the position of the symbol "t".
I started with this function:
https://stackoverflow.com/a/3410557/2006215
// Collect the start index of every case-insensitive "le" in the sentence.
var str = "I learned to play the Ukulele in Lebanon.";
var regex = /le/gi;
var indices = [];
var result;
// exec() on a global regex resumes from lastIndex and yields null once exhausted.
for (result = regex.exec(str); result; result = regex.exec(str)) {
    indices.push(result.index);
}
And I changed it to something like this:
/**
 * Finds every position at which any of the given patterns matches the word.
 *
 * @param {string[]} sounds_regex_array - regular-expression sources, one per sound.
 * @param {string} word_transcription - the string to search.
 * @returns {Array<[number, string]>} [index, patternSource] pairs, grouped by
 *     pattern in the order the patterns were given.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
    var allIndices = [];
    for (var i = 0; i < sounds_regex_array.length; i++) {
        var currentSoundRegex = sounds_regex_array[i];
        // The pattern source is passed to RegExp unchanged: JavaScript strings
        // and regular expressions both operate on UTF-16 code units, so IPA
        // symbols match as-is. Re-encoding the pattern (e.g. to UTF-8 bytes)
        // would replace each non-ASCII symbol with several unrelated
        // characters that can never match the transcription.
        var pattern = new RegExp(currentSoundRegex, "g");
        var indices = getIndicesOfRegex(pattern, word_transcription);
        for (var j = 0; j < indices.length; j++) {
            allIndices.push([indices[j], currentSoundRegex]);
        }
    }
    return allIndices;
}
/**
 * Returns the start index of every match of `regex` in `str`.
 *
 * @param {RegExp} regex - pattern to search for; a non-global pattern is
 *     scanned through a private global copy so the loop always terminates.
 * @param {string} str - text to search (coerced to a string).
 * @returns {number[]} match start indices in ascending order.
 */
function getIndicesOfRegex (regex, str) {
    var text = String(str);
    // exec() only advances lastIndex for global/sticky patterns; on any other
    // pattern it would return the same first match forever.
    var scanner = regex.global ? regex : new RegExp(regex.source, regex.flags + "g");
    var indices = [];
    var result;
    while ((result = scanner.exec(text)) !== null) {
        indices.push(result.index);
        if (result[0] === "") {
            // A zero-length match leaves lastIndex unchanged; step past it
            // manually (by a whole code point in unicode mode).
            var code = text.codePointAt(scanner.lastIndex);
            scanner.lastIndex += (scanner.unicode && code > 0xFFFF) ? 2 : 1;
        }
    }
    return indices;
}
Anybody has any ideas?
UPDATE: I take both the transcription and the regex patterns from json file that I generate with PHP from UTF-8 strings. I am not sure how to call this, but it's not UTF-8. In any case it doesn't work with my Javascript function.
var questions = [{"word":"sorte","word_transcription":"s\u0254\u0281t","sounds_array":["d","t"],"sounds_regex_array":["(?!d\u0292)d","(?!t\u0283)t"]}];

I found where the problem was.
The error was triggered because I tried to execute lookbehind in Javascript, which is not supported.
The workaround for custom lookbehind functions is proposed here - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
But finally I just did my own modifications of the code. The above functions require XRegExp library, which is pretty heavy.
My solution:
/**
 * Collects the start index of each match of `pattern` in `str`, emulating the
 * lookbehinds (?<!d)ʒ and (?<!t)ʃ by hand: a "ʒ" directly after "d", or a "ʃ"
 * directly after "t", is not reported.
 *
 * @param {string} currentSoundRegex - pattern source; only the exact values
 *     "ʒ" and "ʃ" trigger the preceding-character filter.
 * @param {RegExp} pattern - regex that is repeatedly exec()'d against `str`.
 * @param {string} str - text to search.
 * @returns {number[]} accepted match indices.
 */
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
    var indices = [];
    for (var result = pattern.exec(str); result; result = pattern.exec(str)) {
        var at = result.index;
        var insideDZh = (currentSoundRegex === "ʒ") && (at > 0) && (str.substring(at - 1, at) === "d");
        var insideTSh = (currentSoundRegex === "ʃ") && (at > 0) && (str.substring(at - 1, at) === "t");
        if (!insideDZh && !insideTSh) {
            indices.push(at);
        }
    }
    return indices;
}
/**
 * For every pattern in `sounds_regex_array`, locates its matches in
 * `word_transcription` and pairs each match index with the sound found at the
 * same position in `sounds_array`.
 *
 * The two lookbehind patterns "(?<!d)ʒ" and "(?<!t)ʃ" are reduced to the bare
 * symbol before compilation; getIndicesOfRegex receives the reduced source.
 *
 * @param {string[]} sounds_array - labels, parallel to `sounds_regex_array`.
 * @param {string[]} sounds_regex_array - regex sources, one per sound.
 * @param {string} word_transcription - text to search.
 * @returns {Array<[number, string]>} [index, sound] pairs grouped by pattern.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
    var allIndices = [];
    for (var slot = 0; slot < sounds_regex_array.length; slot++) {
        var rawSource = sounds_regex_array[slot];
        var source = rawSource === "(?<!d)ʒ" ? "ʒ"
            : rawSource === "(?<!t)ʃ" ? "ʃ"
            : rawSource;
        var found = getIndicesOfRegex(source, new RegExp(source, "g"), word_transcription);
        var label = sounds_array[slot];
        for (var k = 0; k < found.length; k++) {
            allIndices.push([found[k], label]);
        }
    }
    return allIndices;
}

Related

Check if String has sequential or repeated characters in javascript (underscore)

I have code that I am trying to refactor. I'm new to JavaScript, so I'm trying to write more readable code using functions from libraries like underscore.
The function below can detect when string
contains 3 or more ordered characters such as (234, efg, LmN)
and
when string contains 3 or more repeated (lll, 444, MMm, ###)
// Detects whether `input` contains a run of `n` identical characters
// (isRepeating) or `n` characters with consecutive character codes
// (isSequential). Comparison is case-insensitive.
const input = "Dfdf123125";
const myStr = input.toLowerCase();
const n = 3;
let isRepeating = false;
let isSequential = false;
// Slide a window of n characters over the string; the last full window
// starts at length - n.
for (let i = 0; i + n <= myStr.length; i++) {
    const first = myStr.charCodeAt(i);
    // A window qualifies only if EVERY character satisfies the rule, so start
    // optimistic and clear the flag on the first violation.
    let isRepeatingTemp = true;
    let isSequentialTemp = true;
    for (let j = i + 1; j < i + n; j++) {
        const code = myStr.charCodeAt(j);
        if (code !== first) isRepeatingTemp = false;
        if (code !== first + (j - i)) isSequentialTemp = false;
    }
    if (isRepeatingTemp) isRepeating = true;
    if (isSequentialTemp) isSequential = true;
}
I'm trying to see if I can optimize this and make it more readable with underscore, and/or even improve its time/space complexity. I know this can also be done with a regex, but I'm trying to get it done without one.
Instead of the inner for loop, I chunked the string to n using Array.prototype.slice() to see ahead n characters. I used Array.prototype.indexOf() to find if it's sequential based off the abc and num constants(ref). To see if it's repeating, I used Array.prototype.every() that loops through the chunk and check if they're similar and return a boolean based on the expression.
The result gives the output of each instance found, and if it was sequential or repeating.
const input = "Dfdf123125";
/**
 * Scans `str` with a sliding window of `n` characters and reports windows that
 * are sequential (a run of the alphabet or of the digits) or repeated (all
 * characters equal). Both checks ignore letter case.
 *
 * @param {string} str - text to scan.
 * @param {number} n - window length.
 * @returns {false|{repetition: boolean, sequential: boolean, out: string[]}}
 *     `false` when `str` is shorter than `n`; otherwise the flags plus every
 *     matching window (in its original case) in scan order.
 */
function RepSeq(str, n) {
    var rep = false;
    var seq = false;
    var result = [];
    const num = '0123456789';
    const abc = 'abcdefghijklmnopqrstuvwxyz';
    if (str.length < n) return false;
    for (var i = 0; i < str.length; i++) {
        if (i + n > str.length) break;
        var chunk = str.slice(i, i + n);
        // Fold case so that e.g. "LmN" is recognised as a sequence, matching
        // the case-insensitive repetition test below.
        var folded = chunk.toLowerCase();
        var seqABC = abc.indexOf(folded) > -1;
        var seq123 = num.indexOf(folded) > -1;
        if (seq123 || seqABC) {
            seq = true;
            result.push(chunk);
        }
        if ([...chunk].every(v => v.toLowerCase() === chunk[0].toLowerCase())) {
            rep = true;
            result.push(chunk);
        }
    }
    return {
        repetition: rep,
        sequential: seq,
        out: result
    };
}
console.log(RepSeq(input, 3));
// Output:
// {
// out: ["123"],
// repetition: false,
// sequential: true
// }
With this method, we're peeking at the string one block(i+n) at a time. Ex(n=3):
1. [Dfd]f123125
2. D[fdf]123125
3. Df[df1]23125
4. Dfd[f12]3125
5. Dfdf[123]125 - Sequential!
6. Dfdf1[231]25
7. Dfdf12[312]5
8. Dfdf123[125]

Join string based on startsWith() and endsWith()

I have string var str1 = 'foobarbaz' and var str2 = 'bazfoo'
I want to join them based on overlapping starting and ending characters. The result I am looking for is 'foobarbazfoo'.
I am currently doing it in a following way:
/**
 * Joins two strings on their longest overlap: the longest suffix of `str1`
 * that is also a prefix of `str2` appears only once in the result.
 *
 * @param {string} str1 - left-hand string.
 * @param {string} str2 - right-hand string.
 * @returns {string} the merged string.
 * @throws {string} 'Strings do not overlap' when there is no common
 *     suffix/prefix (a string is thrown to stay compatible with existing
 *     callers that compare the thrown value).
 */
function merge(str1, str2) {
    var size = Math.min(str1.length, str2.length);
    // Try the longest candidate first (including a full-length overlap) so
    // the first hit is the answer.
    for (var overlap = size; overlap > 0; overlap--) {
        if (str1.slice(str1.length - overlap) === str2.slice(0, overlap)) {
            return str1 + str2.slice(overlap);
        }
    }
    throw 'Strings do not overlap';
}
I wonder, if there is more elegant and efficient way of doing it ?
i think it would be a good idea to add the function to the String's prototype and using startsWith() and Conditional (ternary) Operator this what i could come up with :
/**
 * Appends `str` to this string, writing the longest suffix of this string
 * that is also a prefix of `str` only once.
 *
 * @param {string} str - string to append.
 * @returns {string} the merged string; a plain concatenation when the two
 *     strings do not overlap.
 */
String.prototype.merge = function(str) {
    const head = String(this);
    const tail = String(str);
    // Compare real suffixes of `head` (longest first) rather than searching
    // for the overlap text with indexOf, which would find an earlier
    // occurrence of the same text and truncate the result.
    for (let overlap = Math.min(head.length, tail.length); overlap > 0; overlap--) {
        if (tail.startsWith(head.slice(head.length - overlap))) {
            return head + tail.slice(overlap);
        }
    }
    return head + tail;
}
let merged = 'foobarbaz'.merge('bazfoo')
console.log(merged);
in terms of speed, both methods are identical ( tested execution time with Performance.now() )
but less lines and a declarative rather than imperative code.
Feel free to choose between slice and substring (slice vs substring).

Multiple specials characters replacement optimization

I need to replace all the specials characters in a string with javascript or jQuery.
I am sure there is a better way to do this.
But I currently have no clue.
Anyone got an idea?
/**
 * Replaces accented Latin-1 letters (U+00C0 to U+00FF) with unaccented ASCII
 * stand-ins; every other character is copied unchanged.
 *
 * @param {string} str - text to fold (precomposed characters expected).
 * @returns {string} the folded text.
 */
function Unaccent(str) {
    // Parallel tables: norm.charAt(k) is replaced by spec.charAt(k).
    // Segments are kept the same length so the alignment can be checked by eye.
    var norm = 'ÀÁÂÃÄÅÆÇÈÉÊË' + 'ÌÍÎÏÐÑÒÓÔÕÖØ' + 'ÙÚÛÜÝÞß'
             + 'àáâãäåæçèéêë' + 'ìíîïðñòóôõöø' + 'ùúûüýþÿ';
    var spec = 'AAAAAAACEEEE' + 'IIIIDNOOOOOO' + 'UUUUYbs'
             + 'aaaaaaaceeee' + 'iiiidnoooooo' + 'uuuuyby';
    var result = '';
    // Single pass over the input instead of one full rescan per table entry.
    for (var i = 0; i < str.length; i++) {
        var ch = str.charAt(i);
        var at = norm.indexOf(ch);
        result += at === -1 ? ch : spec.charAt(at);
    }
    return result;
}
/**
 * Replaces every occurrence of the literal text `search` in `str` by `repl`.
 *
 * @param {string} str - text to process.
 * @param {string} search - literal text to look for; an empty string matches
 *     nothing and returns `str` unchanged.
 * @param {string} repl - replacement text, inserted literally.
 * @returns {string} the text with all occurrences replaced.
 */
function replaceAll(str, search, repl) {
    if (search === '') {
        return str;
    }
    // split/join replaces all occurrences in one pass and never rescans the
    // inserted text, so a replacement that contains `search` cannot loop.
    return str.split(search).join(repl);
}
Here's a version using a lookup map that works a little more efficiently than nested loops:
/**
 * Folds accented characters using the lookup object stored on Unaccent.map;
 * characters without an own entry in the map are copied through unchanged.
 *
 * @param {string} str - text to fold.
 * @returns {string} the folded text.
 */
function Unaccent(str) {
    var table = Unaccent.map;
    var out = "";
    for (var pos = 0; pos < str.length; pos += 1) {
        var ch = str.charAt(pos);
        // Own-property test: inherited Object.prototype members must never be
        // mistaken for replacements.
        out += table.hasOwnProperty(ch) ? table[ch] : ch;
    }
    return out;
}
// Built once and shared by every call.
Unaccent.map = {'À':'A','Á':'A','Â':'A'}; // you fill in the rest of the map
Working demo: http://jsfiddle.net/jfriend00/rRpcy/
FYI, a Google search for "accent folding" returns many other implementations (many similar, but also some using regex).
Here's a bit higher performance version (2.5x faster) that can do a direct indexed lookup of the accented characters rather than having to do an object lookup:
/**
 * Folds accented Latin-1 letters by indexing a 64-character table with
 * (char code - 192). Characters outside 192..255, and table slots holding a
 * blank, are copied through unchanged.
 *
 * @param {string} str - text to fold (precomposed characters expected).
 * @returns {string} the folded text.
 */
function Unaccent(str) {
    var result = "", code, lookup;
    for (var i = 0, len = str.length; i < len; i++) {
        code = str.charCodeAt(i);
        lookup = (code >= 192 && code <= 255) ? Unaccent.map.charAt(code - 192) : "";
        // " " means "no mapping"; "" means the table is shorter than expected.
        // In both cases keep the source character rather than dropping it.
        result += (lookup === "" || lookup === " ") ? str.charAt(i) : lookup;
    }
    return(result);
}
// Covers chars 192-255, eight per row; a blank means no mapping for that char.
// Built from rows so that the position of every entry (and the total length
// of exactly 64) stays verifiable and cannot be broken by collapsed spaces.
// The eth pair (208/240) maps to D/d.
Unaccent.map = [
    "AAAAAAAC", // 192-199
    "EEEEIIII", // 200-207
    "DNOOOOO ", // 208-215 (215 is the multiplication sign)
    "OUUUUY  ", // 216-223 (222 thorn, 223 sharp s: unmapped)
    "aaaaaaac", // 224-231
    "eeeeiiii", // 232-239
    "dnooooo ", // 240-247 (247 is the division sign)
    "ouuuuy y"  // 248-255 (254 thorn: unmapped)
].join("");
Working demo: http://jsfiddle.net/jfriend00/Jxr9u/
In this jsperf, the string lookup version (the 2nd example) is about 2.5x faster.
Using an object as a map is a good idea, but given the number of characters you're replacing, it's probably a good idea to pre-initialize the object so that it doesn't have to be re-initialized each time the function gets run (assuming you're running the function more than once):
// Accent folder whose lookup table is created once, when this statement runs,
// and then captured by the returned function.
var Unaccent = (function () {
    var charMap = {'À':'A','Á':'A','Â':'A','Ã':'A','Ä':'A' /** etc. **/};
    // Replacement for one character, or the character itself when unmapped.
    function fold(ch) {
        return charMap[ch] || ch;
    }
    return function (str) {
        var modified = "";
        for (var i = 0; i < str.length; i += 1) {
            modified += fold(str.charAt(i));
        }
        return modified;
    };
}());
This will front-load the heavy lifting of the function to page load time (you can do some modifications to delay it until the first call to the function if you like). But it will take some of the processing time out of the actual function call.
It's possible some browsers will actually optimize this part anyway, so you might not see a benefit. But on older browsers (where performance is of greater concern), you'll probably see some benefit to pre-processing your character map.
You can prepare key value pair type of array and via jquery each traverse that array.
Example :
/**
 * Replaces every accented character listed in the local table by its plain
 * counterpart.
 *
 * @param {string} str - text to fold.
 * @returns {string} the folded text.
 */
function Unaccent(str) {
    var replaceString = {'À':'A','Á':'A','Â':'A'}; // add more
    // Literal split/join needs neither jQuery nor a dynamically built RegExp,
    // so keys containing regex metacharacters are safe too.
    Object.keys(replaceString).forEach(function (k) {
        str = str.split(k).join(replaceString[k]);
    });
    // The result must be handed back to the caller.
    return str;
}
Working Demo
Good Luck !!

Javascript Function to split and return a value from a string

I am trying to grab a certain value. I am new to javascript and I can't figure out why this is not working.
If I parse "kid_2" I should get "kostas". Instead of "Kostas" I always get "02-23-2000". So I must have a logic problem in the loop but I am really stuck.
/**
 * Looks up `fieldname` in a string of the form "key::value||key::value||".
 *
 * @param {string} fieldname - exact key to find.
 * @param {string} str - the encoded key/value list.
 * @returns {string|undefined} the value stored under `fieldname`, or
 *     undefined when the key is absent.
 */
function getold_val(fieldname, str) {
    var chunks = str.split("||");
    for (var k = 0; k < chunks.length; k++) {
        // Test each chunk on its own (not the whole input) and compare the
        // key exactly, so "kid_1" cannot match "kid_10" or a value.
        var separatorAt = chunks[k].indexOf("::");
        if (separatorAt !== -1 && chunks[k].substring(0, separatorAt) === fieldname) {
            return chunks[k].substring(separatorAt + 2);
        }
    }
}
var test = getold_val('kid_2','date_1::02-23-2000||date_2::06-06-1990||kid_1::George||kid_2::Kostas||');
alert(test);
A regex may be a little more appealing. Here's a fiddle:
/**
 * Extracts the value stored under `key` in a "key::value||key::value||"
 * string. Keys are matched case-insensitively.
 *
 * @param {string} source - the encoded key/value list.
 * @param {string} key - key to look up; treated as literal text.
 * @returns {string|undefined} the value (everything up to the next "|"), or
 *     undefined when the key is absent.
 */
function getValue(source, key){
    // Escape regex metacharacters so a key such as "a.b" only matches itself.
    var literalKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Inside a character class "$" is an ordinary character, so it must not
    // be listed there: values are only terminated by the "|" delimiter.
    var match = new RegExp("(?:^|\\|)" + literalKey + "::([^|]+)", "i").exec(source);
    return match ? match[1] : undefined;
}
getValue("date_1::02-23-2000||date_2::06-06-1990||kid_1::George||kid_2::Kostas||","kid_2");
But if you want something a little more involved, you can parse that string into a dictionary like so (fiddle):
/**
 * Parses a delimited key/value string into a plain object.
 *
 * @param {string} val - the encoded text, e.g. "a::1||b::2||".
 * @param {string} fieldDelimiter - separator between fields, e.g. "||".
 * @param {string} valueDelimiter - separator between key and value, e.g. "::".
 * @returns {Object} map from each field's first part to its second part;
 *     empty fields are skipped.
 */
function splitToDictionary(val, fieldDelimiter, valueDelimiter){
    var dict = {};
    val.split(fieldDelimiter).forEach(function (field) {
        if (field === "") {
            return; // nothing between two delimiters (e.g. the trailing one)
        }
        var parts = field.split(valueDelimiter);
        dict[parts[0]] = parts[1];
    });
    return dict;
}
// Parse the sample record once, then read individual fields by key.
var dict = splitToDictionary("date_1::02-23-2000||date_2::06-06-1990||kid_1::George||kid_2::Kostas||","||","::");
console.log(dict["date_1"]);
console.log(dict["date_2"]);
console.log(dict["kid_1"]);
console.log(dict["kid_2"]);
This works, here's my fiddle.
/**
 * Looks up `fieldname` in a string of the form "key::value||key::value||".
 *
 * @param {string} fieldname - exact key to find.
 * @param {string} str - the encoded key/value list.
 * @returns {string|undefined} the text after "key::" in the matching chunk,
 *     or undefined when no chunk starts with that key.
 */
function getold_val(fieldname,str) {
    var prefix = fieldname + '::';
    var chunks = str.split('||');
    for(var i = 0; i < chunks.length; i++) {
        // Require the chunk to START with "key::"; a match elsewhere in the
        // chunk (a longer key, or the value) is not this field.
        if(chunks[i].indexOf(prefix) === 0) {
            return(chunks[i].substring(prefix.length));
        }
    }
}
alert(getold_val('kid_2', 'date_1::02-23-2000||date_2::06-06-1990||kid_1::George||kid_2::Kostas||'));
The issue with your code was (as #slebetman noticed as well) the fact that a string index can be 0 because it starts exactly in the first letter.
The code is almost the same as yours, I just didn't use the second .split('::') because I felt a .substring(...) would be easier.
There are two bugs. The first error is in the indexOf call:
var n = str.indexOf(fieldname);
This will always return a value greater than or equal to 0 since the field exists in the string. What you should be doing is:
var n = chunks[k].indexOf(fieldname);
The second error is in your if statement. It should be:
if(n >= 0) {
...
}
or
if(n > -1) {
...
}
The substring you are looking for could very well be the at the beginning of the string, in which case its index is 0. indexOf returns -1 if it cannot find what you're looking for.
That being said, here's a better way to do what you're trying to do:
/**
 * Looks up `fieldName` in a string of the form "key::value||key::value||".
 *
 * @param {string} fieldName - exact key to find.
 * @param {string} str - the encoded key/value list.
 * @returns {string|null} the value of the first field whose key equals
 *     `fieldName`, or null when there is none.
 */
function getold_val(fieldName, str) {
    var keyValuePairs = str.split("||");
    for (var i = 0; i < keyValuePairs.length; i++) {
        // A trailing "||" leaves an empty last element; skip empty chunks
        // instead of trimming the list with a regular expression.
        if (keyValuePairs[i] === "") {
            continue;
        }
        var keyValuePair = keyValuePairs[i].split("::");
        if (fieldName === keyValuePair[0]) {
            return keyValuePair[1];
        }
    }
    return null;
}

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Length of the string in code points: every well-formed surrogate pair is
 * counted once instead of twice.
 *
 * @returns {number} number of code units minus the number of surrogate pairs.
 */
String.prototype.getCodePointLength= function() {
    var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    return this.length - (pairs === null ? 0 : pairs.length);
};
/**
 * Builds a string from code points; any value of 0x10000 or above is emitted
 * as a high/low surrogate pair. All code units are handed to
 * String.fromCharCode in a single call.
 *
 * @param {...number} codePoints - code points to convert.
 * @returns {string} the resulting string.
 */
String.fromCodePoint= function() {
    var units = [];
    for (var k = 0; k < arguments.length; k++) {
        var n = arguments[k] - 0x10000;
        if (n >= 0) {
            units.push(0xD800 + (n >> 10), 0xDC00 + (n & 0x3FF));
        } else {
            units.push(arguments[k]);
        }
    }
    return String.fromCharCode.apply(null, units);
};
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
    /**
     * Length of the string with each well-formed surrogate pair counted as
     * one character.
     * Technique taken from
     * http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
     *
     * @returns {number} code units minus surrogate pairs.
     */
    String.prototype.ucLength = function() {
        // Collapse every pair to a single placeholder unit, then measure.
        return this.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, "_").length;
    };
}
if (!String.prototype.codePointAt) {
    /**
     * Fallback used only when the engine has no codePointAt of its own.
     * Walks the string one code point at a time and returns the code point
     * whose ordinal position (counting a surrogate pair as one) equals ucPos.
     *
     * @param {number} ucPos - code-point position; NaN is treated as 0.
     * @returns {?number} the code point, or null when ucPos is never reached.
     */
    String.prototype.codePointAt = function (ucPos) {
        var wanted = isNaN(ucPos) ? 0 : ucPos;
        var text = String(this);
        var unit = 0;
        for (var seen = 0; unit < text.length; seen++) {
            var hi = text.charCodeAt(unit);
            var lo = text.charCodeAt(unit + 1);
            var isPair = (0xD800 <= hi && hi <= 0xDBFF && 0xDC00 <= lo && lo <= 0xDFFF);
            if (seen == wanted) {
                return isPair ? ((hi - 0xD800) * 0x400) + (lo - 0xDC00) + 0x10000 : hi;
            }
            unit += isPair ? 2 : 1;
        }
        return null;
    };
}
if (!String.fromCodePoint) {
    /**
     * Fallback used only when the engine has no String.fromCodePoint.
     * Converts each argument to one code unit, or to a surrogate pair when it
     * is above 0xFFFF, and concatenates the pieces.
     *
     * @param {...number} codePoints - code points to convert.
     * @returns {string} the resulting string.
     */
    String.fromCodePoint = function () {
        var pieces = [];
        for (var k = 0; k < arguments.length; ++k) {
            var cp = arguments[k];
            var rest = cp - 0x10000;
            pieces.push(cp > 0xFFFF
                ? String.fromCharCode(0xD800 + (rest >> 10), 0xDC00 + (rest & 0x3FF))
                : String.fromCharCode(cp));
        }
        return pieces.join("");
    };
}
if (!String.prototype.ucCharAt) {
    /**
     * Returns the character at code-point position `ucIndex`, treating each
     * well-formed surrogate pair as a single character.
     *
     * The string is scanned here directly instead of delegating to
     * codePointAt: the built-in codePointAt takes a code-UNIT index, so using
     * it with a code-point position gives wrong results after any astral
     * character.
     *
     * @param {number} ucIndex - code-point position; NaN is treated as 0.
     * @returns {string} the character (one or two code units), or "" when the
     *     position is out of range.
     */
    String.prototype.ucCharAt = function (ucIndex) {
        var str = String(this);
        var wanted = isNaN(ucIndex) ? 0 : Number(ucIndex);
        var unit = 0;
        for (var seen = 0; unit < str.length; seen++) {
            var code = str.charCodeAt(unit);
            var next = str.charCodeAt(unit + 1);
            var isPair = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
            var width = isPair ? 2 : 1;
            if (seen === wanted) {
                return str.slice(unit, unit + width);
            }
            unit += width;
        }
        return "";
    };
}
if (!String.prototype.ucIndexOf) {
    /**
     * Code-point-aware indexOf: compares a ucSlice window of the receiver
     * against the search text at each position from ucStart onward.
     *
     * @param {string} searchStr - text to find (coerced to a string).
     * @param {number} [ucStart] - first position to try; NaN or negative
     *     values start at 0.
     * @returns {number} position of the first match as counted by
     *     ucLength/ucSlice, or -1.
     */
    String.prototype.ucIndexOf = function (searchStr, ucStart) {
        var from = (isNaN(ucStart) || ucStart < 0) ? 0 : ucStart;
        var haystack = String(this);
        var total = haystack.ucLength();
        var needle = String(searchStr);
        var needleLength = needle.ucLength();
        for (var pos = from; pos < total; pos++) {
            if (haystack.ucSlice(pos, pos + needleLength) == needle) {
                return pos;
            }
        }
        return -1;
    };
}
if (!String.prototype.ucLastIndexOf) {
    /**
     * Code-point-aware lastIndexOf: compares a ucSlice window of the receiver
     * against the search text, walking backwards from ucStart to 0.
     *
     * @param {string} searchStr - text to find (coerced to a string).
     * @param {number} [ucStart] - first position to try; NaN or values at or
     *     past the end start at the last position.
     * @returns {number} position of the last match as counted by
     *     ucLength/ucSlice, or -1.
     */
    String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
        var haystack = String(this);
        var total = haystack.ucLength();
        var from = (isNaN(ucStart) || ucStart >= total) ? total - 1 : ucStart;
        var needle = String(searchStr);
        var needleLength = needle.ucLength();
        for (var pos = from; pos >= 0; pos--) {
            if (haystack.ucSlice(pos, pos + needleLength) == needle) {
                return pos;
            }
        }
        return -1;
    };
}
if (!String.prototype.ucSlice) {
    /**
     * Code-point-aware slice: positions count each well-formed surrogate pair
     * as one character.
     *
     * @param {number} [ucStart] - first position (negative counts from the
     *     end; NaN/undefined means 0).
     * @param {number} [ucStop] - position to stop before (negative counts
     *     from the end; undefined means "through the last character").
     * @returns {string} the selected characters.
     */
    String.prototype.ucSlice = function (ucStart, ucStop) {
        // Split into code points: a surrogate pair first, otherwise any
        // single code unit (including lone surrogates).
        var ucChars = String(this).match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
        // Array.prototype.slice already implements the required rules:
        // negative offsets, clamping to the bounds, and an omitted end
        // meaning the full length (so the last character is not dropped).
        return ucChars.slice(ucStart, ucStop).join("");
    };
}
if (!String.prototype.ucSplit) {
    /**
     * Code-point-aware split. With an empty delimiter the string is split
     * into characters without breaking surrogate pairs; any other delimiter
     * is delegated to the built-in split.
     *
     * @param {string|RegExp} delimeter - separator; "" splits per character.
     * @param {number} [limit] - maximum number of pieces; all when omitted.
     * @returns {string[]} the pieces.
     */
    String.prototype.ucSplit = function (delimeter, limit) {
        var str = String(this);
        if (delimeter == ''){
            // A surrogate pair first, otherwise any single code unit.
            var ucChars = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
            // Apply the limit only when one was given: slicing with an
            // undefined limit would yield an empty array. ">>> 0" mirrors the
            // unsigned conversion the built-in split applies to its limit.
            return typeof limit === 'undefined' ? ucChars : ucChars.slice(0, limit >>> 0);
        }
        return str.split(delimeter, limit);
    };
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Counts the items produced by the string's own iterator, which walks the
 * text by code point rather than by UTF-16 code unit.
 *
 * @param {string} str - text to measure.
 * @returns {number} number of code points.
 */
function countCodePoints( str )
{
    let count = 0;
    for( const codePoint of str ) count += 1;
    return count;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Length of the string in code points, obtained by expanding the string
 * through its iterator.
 *
 * @returns {number} number of code points.
 */
String.prototype.magicLength = function()
{
    return [...this].length;
}
/**
 * Returns the n-th code point of the string as a string.
 *
 * @param {number} n - zero-based code-point position (compared loosely).
 * @returns {string} the character, or "" when no position equals n.
 */
String.prototype.magicCharAt = function(n)
{
    var chars = [...this];
    for (var idx = 0; idx < chars.length; idx++)
    {
        if (idx == n) return chars[idx] + "";
    }
    return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To handle special emojis such as the one in the comment, one can search for the joining character (char code 8205, i.e. U+200D ZERO WIDTH JOINER) and merge the pieces around it. Here is how:
// Split a string into user-visible emoji, re-joining sequences that are glued
// together with U+200D ZERO WIDTH JOINER (written as an escape below so the
// otherwise invisible joiners can be seen in the source).
let myStr = "👩\u200D👩\u200D👧\u200D👧😃𝌆"
let arr = [...myStr]
const ZERO_WIDTH_JOINER = 0x200D; // decimal 8205
// Walk right-to-left so that splicing never shifts elements still to be
// visited. A joiner needs a neighbour on both sides, hence the bounds
// (length - 2 down to 1).
for (let i = arr.length - 2; i > 0; i--) {
    if (arr[i].codePointAt(0) === ZERO_WIDTH_JOINER) {
        arr[i - 1] += arr[i] + arr[i + 1]; // combine them back to a single emoji
        arr.splice(i, 2)
    }
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
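Note that char code 8205 (U+200D) is the ZERO WIDTH JOINER: an ordinary BMP character that Unicode uses to glue several emoji into one displayed glyph. It is unrelated to how JavaScript's UTF-16 strings represent non-BMP characters, which is done with surrogate pairs.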

Categories

Resources