Javascript - Search unicode string in unicode string - javascript

When I try to search for a Unicode string inside another Unicode string, I cannot find a solution.
Example: check whether the string 'vie' is contained in the string 'Mr. ViỆt has a blue house'.
So I tried the hard way, as below:
// Convert string to Unicode
function toUnicode(theString) {
var unicodeString = '';
for (var i=0; i < theString.length; i++) {
var theUnicode = theString.charCodeAt(i).toString(16).toUpperCase();
while (theUnicode.length < 4) {
theUnicode = '0' + theUnicode;
}
theUnicode = '\\u' + theUnicode;
unicodeString += theUnicode;
}
return unicodeString;
}
// Convert string to be Regex Unicode
function toRegexUnicode(theString) {
var unicodeString = '';
for (var i=0; i < theString.length; i++) {
var theUnicode = theString.charCodeAt(i).toString(16).toUpperCase();
while (theUnicode.length < 4) {
theUnicode = '0' + theUnicode;
}
theUnicode = '\\u' + theUnicode;
unicodeString += theUnicode;
}
return new RegExp('[' + unicodeString + ']')
}
// Search
function searchUnicode() {
var strOriginal = "Mr. ViỆt has a blue house"
var regexUnicode = toRegexUnicode(strOriginal)
var strSearch = toUnicode('vie')
var result = regexUnicode.test(strSearch)
console.log(result)
}
Test at: https://www.w3schools.com/code/tryit.asp?filename=FY3NGXMQRMLA
Are there any better ways?

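First, your regular expression is wrong. Remove the square brackets: they create a character class, which matches any single one of the listed characters rather than the whole sequence.
Second, you're building your regex test the wrong way around.
You're currently building the regex from the full string, when it should be built from the search term.
You're also not converting your strOriginal to Unicode.
This means the body of your searchUnicode function needs to look as follows: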
var strOriginal = "Mr. ViỆt has a blue house"
var strOriginalUnicode = toUnicode(strOriginal)
var strSearch = toUnicode('vie')
var regexUnicode = toRegexUnicode(strSearch)
var result = regexUnicode.test(strOriginalUnicode)
Next, we can simplify your toRegexUnicode function as such:
// Convert string to be Regex Unicode
function toRegexUnicode(theString) {
theString = theString.replace(/\\/g, "\\\\")
return new RegExp(theString)
}
There is no need to repeat your conversion loop here. You will also note the global replacement of every \ with \\. That's because a regular expression treats a backslash as an escape character, so we need to escape our escape character.

I tried another way: convert both strings to ASCII and then search:
function stringToASCII(str) {
try {
return str.replace(/[àáảãạâầấẩẫậăằắẳẵặ]/g, 'a')
.replace(/[èéẻẽẹêềếểễệ]/g, 'e')
.replace(/[đ]/g, 'd')
.replace(/[ìíỉĩị]/g, 'i')
.replace(/[òóỏõọôồốổỗộơờớởỡợ]/g, 'o')
.replace(/[ùúủũụưừứửữự]/g, 'u')
.replace(/[ỳýỷỹỵ]/g, 'y')
} catch {
return ''
}
}
function searchASCII() {
var strOriginal = "Mr. ViỆt lê nguyễn thị tùng á à ạds"
var strSearch = "vie"
var strOriginalToASCII = stringToASCII(strOriginal.toLowerCase())
var strSearchToASCII = stringToASCII(strSearch.toLowerCase())
var result = strOriginalToASCII.includes(strSearchToASCII)
// Results
console.log('strOriginalToASCII: ', strOriginalToASCII)
console.log('strSearchToASCII: ', strSearchToASCII)
console.log('result: ', result)
}
Output:
strOriginalToASCII: mr. viet le nguyen thi tung a a ads
strSearchToASCII: vie
result: true
Test at: https://www.w3schools.com/code/tryit.asp?filename=FY3NGXMQRMLA

Related

Remove space without using string method trim

How can I remove all the spaces on the left of a string without removing the spaces in between or on the right? The result should also hold when I change the value of str: only the left spaces should be removed.
function trimLeftSpace() {
var str = " Angry Bird ";
var splitTrim = str.split('');
var trimStr = "";
for (var index = 0; index < splitTrim.length; index++) { //trim left space
if(splitTrim[index] != " ") {
trimStr += str[index];
}
}
return trimStr;
}
Your current solution creates a new string that contains all the non-space characters of the original string. Instead, you need to stop looking for spaces as soon as you find a non-space character. Here is an example:
function trimLeftSpace(str) {
var doneTrimming = false
var ret = ""
for (var index = 0; index < str.length; index++) {
if(str[index] !== ' '){
doneTrimming = true
}
if(doneTrimming){
ret += str[index]
}
}
return ret;
}
var result = trimLeftSpace(" Angry Bird ");
console.log("|"+result+"|");
To trim the beginning of the string, use a simple regex to replace the whitespaces in the beginning of the string:
var str = " Angry Bird ";
function trimLeftSpace(str) {
return str.replace(/^\s+/, '');
}
console.log('"' + trimLeftSpace(str) + '"');
Or just use .trimStart():
var str = " Angry Bird ";
function trimLeftSpace(str) {
return str.trimStart();
}
console.log('"' + trimLeftSpace(str) + '"');
You could try a regex replacement:
var str = " Angry Bird ";
str = str.replace( new RegExp("^\\s+", "gm"),"");
console.log('"' + str + '"');
This will remove all whitespace on the left of the string:
function trimLeftSpace(str) {
var result = "";
for(var i = 0; i < str.length; i++) {
if(str[i] != " ") {
return str.slice(i);
break;
} else {
result += str[i];
}
}
return result;
}
console.log(trimLeftSpace(" Angry Birds Angry Birds"));
Try this
function trimLeftSpace(str) {
return str.replace(/\s+$/, '')
}
var result = trimLeftSpace(" Angry Bird ");
console.log("|"+result+"|");
If you want to use a function instead of regex solutions from the other answers, then make a function that looks for the first non-space character, then use slice to cut only the part of the string that's after it:
function customTrim(str) {
for(var i = 0; i < str.length; i++) {
if(str.charAt(i) !== " ") {
return str.slice(i);
}
}
}
var res = customTrim(" Snake Shot ");
console.log('"' + res + '"');
Notes:
This only looks for spaces ' '. If you want to look for tabs '\t', newlines '\n', ... then just add them to the if test (separate them with &&).
If an empty or space-only string is passed, then undefined is returned. If you don't want that, just return an empty string at the bottom of the function to make it the default return value.
You could try a regex replacement:
let variable = "hello world";
let removeRegex = /^\s+|\s+$/g;
let removeSpace = variable.replace(removeRegex,"");
console.log(removeSpace);

How to get unmatched keywords?

I am using this as keyword s='young girl jumping'
function selfreplace(s) {
var words = ['man', 'jumping'];
var re = new RegExp('\\b(' + words.join('|') + ')\\b', 'g');
var specials = [];
var match;
var str = "";
while(match = re.exec(s)) {
str += match[0] + '(k)';
}
return str;
}
It is returning jumping(k)
I want the result to be young(s) girl(s) jumping(k)
It would probably be easiest to check if it's in words outside of the regex:
function selfreplace(s) {
var words = ['man','jumping'];
var re = new RegExp('\\b(\\w+)\\b', 'g');
var specials = [];
var match;
var str = "";
while(match = re.exec(s))
{
if (words.indexOf(match[0]) !== -1))
str += match[0] + '(k)';
else
str += match[0] + '(s)';
}
return str;
}
You can use a replace with callback.
function selfreplace(s){
return s.replace(/man|jumping|(\w+)/g, function(word, misc){
return word + (misc? '(s)' : '(k)')
})
}
If the word matched is man or jumping, only the first argument (entire match) is set. If the word matched is any other, the first capturing group is set as well.
If you don't know the set of words ahead, you can still generate the regex on the fly. Assuming words don't contain non-word characters:
function selfreplace(s, words){ //or any other method of passing 'words'
var re = RegExp(words.join("|")+"|(\\w+)",'g');
return s.replace(re, function(word, misc){
return word + (misc? '(s)' : '(k)')
})
}
Just a different approach, probably not the best solution but thought i'd throw it out there.
var str = "young girl jumping";
function replaceStr(s){
var matched = new RegExp("man|jumping", "i");
var newStr = "";
var str = s.split(" ");
for(var i=0; i<str.length;i++){
if(str[i].match(matched)){
newStr += str[i]+"(k) ";
} else {
newStr += str[i]+"(s) ";
}
}
return newStr.substr(0, newStr.length-1);
}
//replaceStr(str) returns "young(s) girl(s) jumping(k)"
DEMO here
If the matched words might change, you can always amend this function so that it accepts an array as the second argument and then creates the regexp dynamically:
replaceStr(s, matchArr){} and
var matched = new RegExp("("+matchArr.join("|")+")", "i");
Something like this might give you a hint:
var s = "young girl jumping",
words = ['man','jumping'],
regex = new RegExp("(" + words.join("|") +")", "g"),
q = s.replace(regex, function( string ) {
return string + "(k)";
});
console.log(q); // "young girl jumping(k)"
If you match words only, you do not really need regexps at all, do you?
What about just looking for the words with ==
function selfreplace(s) {
var words = ['man','jumping'];
var input = s.split(" ");
var str = "";
for(var i=0; i<input.length; i++){
var tmpString = "(s)";
for(var j=0; j<words.length; j++){
if(input[i] == words[j]){
tmpString = "(k)";
}
}
str += input[i]+tmpString;
}
return str;
}
You could use a RegExp for this, but for what you are doing a RegExp is overkill. I would use Array methods instead:
var selfreplace = function selfreplace(s) {
var words = ['man', 'jumping'],
i = 0,
suffix = '(s)';
s = s.split(' ');
for (i = 0; i < s.length; i += 1) {
if (words.indexOf(s[i]) > -1) {
s[i] = s[i] + '(k)';
} else {
s[i] = s[i] + '(s)';
}
}
return s.join(' ');
};
Here's a fiddle in action: http://jsfiddle.net/4KAzw/

Javascript and regex: split string and keep the separator

I have a string:
var string = "aaaaaa<br />† bbbb<br />‡ cccc"
And I would like to split this string with the delimiter <br /> followed by a special character.
To do that, I am using this:
string.split(/<br \/>&#?[a-zA-Z0-9]+;/g);
I am getting what I need, except that I am losing the delimiter.
Here is the example: http://jsfiddle.net/JwrZ6/1/
How can I keep the delimiter?
I was having a similar but slightly different problem. Anyway, here are examples of several different scenarios for where to keep the delimiter.
"1、2、3".split("、") == ["1", "2", "3"]
"1、2、3".split(/(、)/g) == ["1", "、", "2", "、", "3"]
"1、2、3".split(/(?=、)/g) == ["1", "、2", "、3"]
"1、2、3".split(/(?!、)/g) == ["1、", "2、", "3"]
"1、2、3".split(/(.*?、)/g) == ["", "1、", "", "2、", "3"]
Warning: The fourth will only work to split single characters. ConnorsFan presents an alternative:
// Split a path, but keep the slashes that follow directories
var str = 'Animation/rawr/javascript.js';
var tokens = str.match(/[^\/]+\/?|\//g);
Use (positive) lookahead so that the regular expression asserts that the special character exists, but does not actually match it:
string.split(/<br \/>(?=&#?[a-zA-Z0-9]+;)/g);
See it in action:
var string = "aaaaaa<br />† bbbb<br />‡ cccc";
console.log(string.split(/<br \/>(?=&#?[a-zA-Z0-9]+;)/g));
If you wrap the delimiter in parentheses, it will be part of the returned array.
string.split(/(<br \/>&#?[a-zA-Z0-9]+);/g);
// returns ["aaaaaa", "<br />†", "bbbb", "<br />‡", "cccc"]
Depending on which part you want to keep change which subgroup you match
string.split(/(<br \/>)&#?[a-zA-Z0-9]+;/g);
// returns ["aaaaaa", "<br />", "bbbb", "<br />", "cccc"]
You could improve the expression by ignoring the case of letters
string.split(/()&#?[a-z0-9]+;/gi);
And you can match for predefined groups like this: \d equals [0-9] and \w equals [a-zA-Z0-9_]. This means your expression could look like this.
string.split(/<br \/>(&#?[a-z\d]+;)/gi);
There is a good Regular Expression Reference on JavaScriptKit.
If you group the split pattern, its match will be kept in the output and it is by design:
If separator is a regular expression with capturing parentheses, then
each time separator matches, the results (including any undefined
results) of the capturing parentheses are spliced into the output
array.
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/split#description
You don't need a lookahead or global flag unless your search pattern uses one.
const str = `How much wood would a woodchuck chuck, if a woodchuck could chuck wood?`
const result = str.split(/(\s+)/);
console.log(result);
// We can verify the result
const isSame = result.join('') === str;
console.log({ isSame });
You can use multiple groups. You can be as creative as you like and what remains outside the groups will be removed:
const str = `How much wood would a woodchuck chuck, if a woodchuck could chuck wood?`
const result = str.split(/(\s+)(\w{1,2})\w+/);
console.log(result, result.join(''));
I also answered this in "JavaScript Split Regular Expression keep the delimiter".
Use the (?=pattern) lookahead pattern in the regex.
Example:
var string = '500x500-11*90~1+1';
string = string.replace(/(?=[$-/:-?{-~!"^_`\[\]])/gi, ",");
string = string.split(",");
this will give you the following result.
[ '500x500', '-11', '*90', '~1', '+1' ]
Can also be directly split
string = string.split(/(?=[$-/:-?{-~!"^_`\[\]])/gi);
giving the same result
[ '500x500', '-11', '*90', '~1', '+1' ]
I made a modification to jichi's answer, and put it in a function which also supports multiple letters.
String.prototype.splitAndKeep = function(separator, method='seperate'){
var str = this;
if(method == 'seperate'){
str = str.split(new RegExp(`(${separator})`, 'g'));
}else if(method == 'infront'){
str = str.split(new RegExp(`(?=${separator})`, 'g'));
}else if(method == 'behind'){
str = str.split(new RegExp(`(.*?${separator})`, 'g'));
str = str.filter(function(el){return el !== "";});
}
return str;
};
The 3rd method from jichi's answer would not work in this function, so I took the 4th method and removed the empty strings to get the same result.
Edit:
A second version, which also accepts an array, to split on char1 or char2:
String.prototype.splitAndKeep = function(separator, method='seperate'){
var str = this;
function splitAndKeep(str, separator, method='seperate'){
if(method == 'seperate'){
str = str.split(new RegExp(`(${separator})`, 'g'));
}else if(method == 'infront'){
str = str.split(new RegExp(`(?=${separator})`, 'g'));
}else if(method == 'behind'){
str = str.split(new RegExp(`(.*?${separator})`, 'g'));
str = str.filter(function(el){return el !== "";});
}
return str;
}
if(Array.isArray(separator)){
var parts = splitAndKeep(str, separator[0], method);
for(var i = 1; i < separator.length; i++){
var partsTemp = parts;
parts = [];
for(var p = 0; p < partsTemp.length; p++){
parts = parts.concat(splitAndKeep(partsTemp[p], separator[i], method));
}
}
return parts;
}else{
return splitAndKeep(str, separator, method);
}
};
usage:
str = "first1-second2-third3-last";
str.splitAndKeep(["1", "2", "3"]) == ["first", "1", "-second", "2", "-third", "3", "-last"];
str.splitAndKeep("-") == ["first1", "-", "second2", "-", "third3", "-", "last"];
An extension function that splits a string by a substring or a RegExp; depending on the second parameter, the delimiter is kept at the start (ahead) or at the end (behind) of each piece.
String.prototype.splitKeep = function (splitter, ahead) {
var self = this;
var result = [];
if (splitter != '') {
var matches = [];
// Getting mached value and its index
var replaceName = splitter instanceof RegExp ? "replace" : "replaceAll";
var r = self[replaceName](splitter, function (m, i, e) {
matches.push({ value: m, index: i });
return getSubst(m);
});
// Finds split substrings
var lastIndex = 0;
for (var i = 0; i < matches.length; i++) {
var m = matches[i];
var nextIndex = ahead == true ? m.index : m.index + m.value.length;
if (nextIndex != lastIndex) {
var part = self.substring(lastIndex, nextIndex);
result.push(part);
lastIndex = nextIndex;
}
};
if (lastIndex < self.length) {
var part = self.substring(lastIndex, self.length);
result.push(part);
};
// Substitution of matched string
function getSubst(value) {
var substChar = value[0] == '0' ? '1' : '0';
var subst = '';
for (var i = 0; i < value.length; i++) {
subst += substChar;
}
return subst;
};
}
else {
result.add(self);
};
return result;
};
The test:
test('splitKeep', function () {
// String
deepEqual("1231451".splitKeep('1'), ["1", "231", "451"]);
deepEqual("123145".splitKeep('1', true), ["123", "145"]);
deepEqual("1231451".splitKeep('1', true), ["123", "145", "1"]);
deepEqual("hello man how are you!".splitKeep(' '), ["hello ", "man ", "how ", "are ", "you!"]);
deepEqual("hello man how are you!".splitKeep(' ', true), ["hello", " man", " how", " are", " you!"]);
// Regex
deepEqual("mhellommhellommmhello".splitKeep(/m+/g), ["m", "hellomm", "hellommm", "hello"]);
deepEqual("mhellommhellommmhello".splitKeep(/m+/g, true), ["mhello", "mmhello", "mmmhello"]);
});
I've been using this:
String.prototype.splitBy = function (delimiter) {
var
delimiterPATTERN = '(' + delimiter + ')',
delimiterRE = new RegExp(delimiterPATTERN, 'g');
return this.split(delimiterRE).reduce((chunks, item) => {
if (item.match(delimiterRE)){
chunks.push(item)
} else {
chunks[chunks.length - 1] += item
};
return chunks
}, [])
}
Except that you shouldn't mess with String.prototype, so here's a function version:
var splitBy = function (text, delimiter) {
var
delimiterPATTERN = '(' + delimiter + ')',
delimiterRE = new RegExp(delimiterPATTERN, 'g');
return text.split(delimiterRE).reduce(function(chunks, item){
if (item.match(delimiterRE)){
chunks.push(item)
} else {
chunks[chunks.length - 1] += item
};
return chunks
}, [])
}
So you could do:
var haystack = "aaaaaa<br />† bbbb<br />‡ cccc"
var needle = '<br \/>&#?[a-zA-Z0-9]+;';
var result = splitBy(haystack , needle)
console.log( JSON.stringify( result, null, 2) )
And you'll end up with:
[
"<br />&dagger; bbbb",
"<br />&Dagger; cccc"
]
Most of the existing answers predate the introduction of lookbehind assertions in JavaScript in 2018. You didn't specify how you wanted the delimiters to be included in the result. One typical use case would be sentences delimited by punctuation ([.?!]), where one would want the delimiters to be included at the ends of the resulting strings. This corresponds to the fourth case in the accepted answer, but as noted there, that solution only works for single characters. Arbitrary strings with the delimiters appended at the end can be formed with a lookbehind assertion:
'It is. Is it? It is!'.split(/(?<=[.?!])/)
/* [ 'It is.', ' Is it?', ' It is!' ] */
I know that this is a bit late but you could also use lookarounds
var string = "aaaaaa<br />† bbbb<br />‡ cccc";
var array = string.split(/(?<=<br \/>)/);
console.log(array);
I also came up with this solution. No regex needed, very readable.
const str = "hello world what a great day today balbla"
const separatorIndex = str.indexOf("great")
const parsedString = str.slice(separatorIndex)
console.log(parsedString)

Need to escape non-ASCII characters in JavaScript

Is there any function to do the following?
var specialStr = 'ipsum áá éé lore';
var encodedStr = someFunction(specialStr);
// then encodedStr should be like 'ipsum \u00E1\u00E1 \u00E9\u00E9 lore'
I need to encode the characters that are outside the ASCII range, and I need to do it with that encoding. I don't know its name. Is it Unicode, maybe?
This should do the trick:
function padWithLeadingZeros(string) {
return new Array(5 - string.length).join("0") + string;
}
function unicodeCharEscape(charCode) {
return "\\u" + padWithLeadingZeros(charCode.toString(16));
}
function unicodeEscape(string) {
return string.split("")
.map(function (char) {
var charCode = char.charCodeAt(0);
return charCode > 127 ? unicodeCharEscape(charCode) : char;
})
.join("");
}
For example:
var specialStr = 'ipsum áá éé lore';
var encodedStr = unicodeEscape(specialStr);
assert.equal("ipsum \\u00e1\\u00e1 \\u00e9\\u00e9 lore", encodedStr);
If you need hex encoding rather than Unicode escapes, then you can simplify @Domenic's answer to:
"aäßåfu".replace(/./g, function(c){return c.charCodeAt(0)<128?c:"\\x"+c.charCodeAt(0).toString(16)})
returns: "a\xe4\xdf\xe5fu"
Just for information: you can do as Domenic said, or use the escape function, but that will generate the Unicode in a different (more browser-friendly) format:
>>> escape("áéíóú");
"%E1%E9%ED%F3%FA"
This works for me. Specifically when using the Dropbox REST API:
encodeNonAsciiCharacters(value: string) {
let out = ""
for (let i = 0; i < value.length; i++) {
const ch = value.charAt(i);
let chn = ch.charCodeAt(0);
if (chn <= 127) out += ch;
else {
let hex = chn.toString(16);
if (hex.length < 4)
hex = "000".substring(hex.length - 1) + hex;
out += "\\u" + hex;
}
}
return out;
}

Javascript regex help

I have the following string in JavaScript
var mystring = " abcdef(p,q); check(x,y); cef(m,n);"
I want to do a string replacement such that my final string is:
mystring = " abcdef(p,q); someothercheck\(x,y\); cef(m,n);"
x and y should remain the same after the substitution, and the backslashes are necessary since I need to pass them to some other command.
There can be other parentheses in the string too.
If you don't have other parentheses, it should be easy.
mystring = mystring.replace("check(", "someothercheck\\(");
mystring = mystring.replace(")", "\\)");
EDIT This works also in the case of multiple parenthesis (It does not affect the empty ones).
var str=" abcdef; check(x,y); cef();"
patt = /((\w)/g;
// transform (x in \(x
str = str.replace(patt, '\\($1');
patt = /(\w)\)/g
// transform y) in y\);
str = str.replace(patt, '$1\\)');
// transform check in someothercheck
str = str.replace('check', 'someothercheck');
EDIT Now it converts only the check strings.
function convertCheck(str, check, someOtherCheck) {
// console.log(str + " contains " + check + "? ");
// console.log(str.indexOf(check));
if (str.indexOf(check) === -1) return str;
var patt1 = /\((\w)/g,
patt2 = /(\w)\)/g;
str = str.replace(patt1, '\\($1');
str = str.replace(patt2, '$1\\)');
str = str.replace(check, someOtherCheck);
return str;
}
var str = "abcdef(); check(x,y); cef();",
tokens = str.split(';');
for (var i = 0; i < tokens.length; i++) {
tokens[i] = convertCheck(tokens[i], "check", "someothercheck");
}
str = tokens.join(";");
alert(str); // "abcdef(); someothercheck/(x,y/); cef();"
var myString = "abcdef; check(x,y); cef;";
myString.replace(/(\w+)\(/, 'someother$1(')
.replace(/\(/g, '\\(')
.replace(/\)/g, '\\)')

Categories

Resources