Join string based on startsWith() and endsWith() - javascript

I have string var str1 = 'foobarbaz' and var str2 = 'bazfoo'
I want to join them based on overlapping starting and ending characters. The result I am looking for is 'foobarbazfoo'.
I am currently doing it in a following way:
/**
 * Joins two strings on their longest overlap, i.e. the longest suffix of
 * `str1` that is also a prefix of `str2`.
 *
 * @param {string} str1 - Left-hand string.
 * @param {string} str2 - Right-hand string.
 * @returns {string} `str1` followed by the non-overlapping rest of `str2`.
 * @throws {string} 'Strings do not overlap' when no non-empty overlap exists.
 */
function merge(str1, str2) {
  const maxOverlap = Math.min(str1.length, str2.length);
  // Longest candidate first, so the first hit is the answer. The bound is
  // inclusive: one string may be entirely contained in the overlap.
  for (let size = maxOverlap; size > 0; size--) {
    if (str1.slice(str1.length - size) === str2.slice(0, size)) {
      return str1 + str2.slice(size);
    }
  }
  // Thrown value kept as a plain string so existing callers still match it.
  throw 'Strings do not overlap';
}
I wonder, if there is more elegant and efficient way of doing it ?

I think it would be a good idea to add the function to String's prototype; using startsWith() and the conditional (ternary) operator, this is what I could come up with:
/**
 * Appends `str` to this string, collapsing the longest suffix of this string
 * that is also a prefix of `str`.
 *
 * @param {string} str - String to append.
 * @returns {string} The merged string; plain concatenation when there is no overlap.
 */
String.prototype.merge = function (str) {
  const self = String(this);
  let overlap = 0;
  // Track the overlap length rather than searching for the matched text:
  // the text may also occur earlier in either string.
  for (let size = Math.min(self.length, str.length); size > 0; size--) {
    if (str.startsWith(self.slice(self.length - size))) {
      overlap = size;
      break;
    }
  }
  return self + str.slice(overlap);
};
let merged = 'foobarbaz'.merge('bazfoo');
console.log(merged);
in terms of speed, both methods are identical ( tested execution time with Performance.now() )
but less lines and a declarative rather than imperative code.
Feel free to choose between slice and substring (see "slice vs substring").

Related

Whats wrong with my palindrome? (javascript)

I have written this js code for palindrome, I know there are better and more efficient palindrome methods online but I want to know why I am unable to get my palindrome function to work properly?
CODE:
// Attempted palindrome check from the question; logs a verdict per character
// instead of one verdict for the whole string.
var pal = function(str) {
var len = str.length;
for (var i = 0; i < len; i++) {
// comp1 is the single character at position i.
var comp1 = str.substring(i, i + 1);
// This loop runs to completion on every outer iteration and overwrites comp2
// each time, so afterwards comp2 is always the FIRST character of str
// (last pass: j = 1 -> substring(0, 1)).
for (var j = len; j > 0; j--) {
var comp2 = str.substring(j - 1, j);
}
// Effectively compares each character with the first character.
if (comp1 != comp2) {
console.log("not palindrome")
break;
} else {
// Logged for every matching character, not once at the end.
console.log('palindrome')
}
}
}
pal('maddog');
OUTPUT :
palindrome
not palindrome
There are lot of better algorithms to check Palindrome. Let use the similar algorithm that you are using.
We basically use two pointers - left and right, and move to middle at the same time. In the original question, left pointer and right pointer doesn't move at the same time.
Pointers should move like this -
a b c b a
^ ^
a b c b a
^ ^
a b c b a
^
/**
 * Two-pointer palindrome check: compares characters from both ends while
 * moving towards the middle.
 *
 * @param {string} str - Text to test.
 * @returns {boolean} true when str reads the same in both directions.
 */
var isPalindrome = function (str) {
  var left = 0;
  var right = str.length - 1;
  while (left < right) {
    if (str[left] != str[right]) {
      return false;
    }
    left += 1;
    right -= 1;
  }
  return true;
};
console.log('maddog : ' + isPalindrome('maddog'));
console.log('abcba : ' + isPalindrome('abcba'));
console.log('deed : ' + isPalindrome('deed'));
console.log('a : ' + isPalindrome('a'));
Try the following code. It works by dividing the string length by 2, and then iterating up, checking mirroring characters against each other:
/**
 * Checks whether str is a palindrome by comparing each character of the
 * first half with its mirror in the second half.
 *
 * @param {string} str - Text to test.
 * @returns {boolean} true for a palindrome, false otherwise.
 */
var pal = function (str) {
  var lastIndex = str.length - 1;
  var middle = Math.floor(str.length / 2);
  for (var offset = 0; offset < middle; offset += 1) {
    var mirrored = str[lastIndex - offset];
    if (str[offset] != mirrored) {
      return false;
    }
  }
  return true;
};
console.log(pal("bunny"));
console.log(pal("amoreroma"));
The inner loop is totally unnecessary. It does the same thing every time -- it loops through the whole string, starting from the end, repeatedly setting comp2 to the character; when it's done, comp2 always contains the first character. So your function just tests whether every character in the string is the same as the first character.
To test if something is a palindrome, you need to compare each character with the corresponding character from the other end of the string. You don't need two loops for this. You also only need to loop through the first half of the string, not the whole string.
Finally, you should only echo Palindrome at the end of the loop. Inside the loop you only know that one character matches, not all of them.
/**
 * Logs 'palindrome' or "not palindrome" for str. Only the first half is
 * walked; each character is compared with its mirror from the other end.
 *
 * @param {string} str - Text to test.
 * @returns {undefined} The verdict is only written to the console.
 */
var pal = function (str) {
  var lastIndex = str.length - 1;
  var middle = Math.floor(str.length / 2);
  for (var k = 0; k < middle; k += 1) {
    if (str[k] != str[lastIndex - k]) {
      // First mismatch settles it; nothing more to check.
      console.log("not palindrome");
      return;
    }
  }
  console.log('palindrome');
};
pal('maddog');
pal('maddam');
You don't really need the nested loops, you can just loop backwards through the string to invert the string and then compare it to the original string. I updated the Snippet to work.
Before, your code was not inverting the string but rather just iterating through the characters and assigning them to the comp1 and comp2 variables. You need to concatenate the characters in order to build the new string backwards: comp = comp + str.substring(j-1, j);
/**
 * Logs whether str equals its own reversal.
 *
 * @param {string} str - Text to test.
 * @returns {undefined} The verdict is only written to the console.
 */
var pal = function (str) {
  // Reverse by UTF-16 code unit, same granularity as a substring walk.
  var reversed = str.split('').reverse().join('');
  if (str === reversed) {
    console.log('palindrome');
  } else {
    console.log("not palindrome");
  }
};
pal('arepera');

UTF-8 support for regular expression in Javascript

I am trying to create a Javascript function that would find all positions for a set of patterns inside a UTF-8 string. For example:
I have a string "detaj" (it's a transcription written with International Phonetic Alphabet symbols, so I need a full UTF-8 support).
And I have an array of patterns: ["(?!dʒ)d", "(?!tʃ)t"] (each string is also UTF-8 encoded).
I need to find the position of each pattern and obtain the following array:
[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]
0 - is the position of the symbol "d", 2 - is the position of the symbol "t".
I started with this function:
https://stackoverflow.com/a/3410557/2006215
// Collect the start index of every case-insensitive "le" in str.
var str = "I learned to play the Ukulele in Lebanon.";
var regex = /le/gi;
var indices = [];
// With the g flag each exec() resumes after the previous match and
// returns null once there are no more matches.
var result = regex.exec(str);
while (result) {
  indices.push(result.index);
  result = regex.exec(str);
}
And I changed it to something like this:
/**
 * For every pattern source in sounds_regex_array, finds all match positions
 * in word_transcription and pairs each position with the pattern source.
 *
 * @param {string[]} sounds_regex_array - Regular-expression sources.
 * @param {string} word_transcription - Text to search.
 * @returns {Array} List of [index, patternSource] pairs, grouped by pattern.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
  var allIndices = [];
  for (var soundRegex of sounds_regex_array) {
    // Earlier attempts that did not work either:
    //   new RegExp(soundRegex, "g")
    //   new RegExp(soundRegex.hexEncode, "g")  (https://stackoverflow.com/a/21648161/2006215)
    // Current attempt: utf8.js (https://github.com/mathiasbynens/utf8.js) - doesn't work:
    var pattern = new RegExp(utf8.encode(soundRegex), "g");
    var positions = getIndicesOfRegex(pattern, word_transcription);
    for (var position of positions) {
      allIndices.push([position, soundRegex]);
    }
  }
  return allIndices;
}
/**
 * Returns the start index of every match of regex in str.
 *
 * @param {RegExp} regex - Pattern to run; pass one with the g flag to get all matches.
 * @param {string} str - Text to search.
 * @returns {number[]} Match positions in ascending order.
 */
function getIndicesOfRegex (regex, str) {
  var indices = [];
  var result;
  while ((result = regex.exec(str)) !== null) {
    indices.push(result.index);
    // Without g (or y) exec() never advances and would return the same
    // match forever, so only the first match can be reported.
    if (!regex.global && !regex.sticky) {
      break;
    }
    // A zero-length match leaves lastIndex in place; step over it manually
    // so the loop always makes progress.
    if (result[0] === '') {
      regex.lastIndex += 1;
    }
  }
  return indices;
}
Anybody has any ideas?
UPDATE: I take both the transcription and the regex patterns from json file that I generate with PHP from UTF-8 strings. I am not sure how to call this, but it's not UTF-8. In any case it doesn't work with my Javascript function.
// Sample entry as exported to JSON; non-ASCII characters are written as
// \uXXXX escapes, which a JavaScript string literal decodes back.
var questions = [
  {
    "word": "sorte",
    "word_transcription": "s\u0254\u0281t",
    "sounds_array": ["d", "t"],
    "sounds_regex_array": ["(?!d\u0292)d", "(?!t\u0283)t"]
  }
];
I found where the problem was.
The error was triggered because I tried to execute lookbehind in Javascript, which is not supported.
The workaround for custom lookbehind functions is proposed here - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
But finally I just did my own modifications of the code. The above functions require XRegExp library, which is pretty heavy.
My solution:
/**
 * Returns the start index of every match of pattern in str, emulating the
 * two negative lookbehinds used by the caller: "ʒ" is skipped when directly
 * preceded by "d", and "ʃ" is skipped when directly preceded by "t".
 *
 * @param {string} currentSoundRegex - Pattern source; selects which exclusion applies.
 * @param {RegExp} pattern - Compiled pattern (g flag needed to get all matches).
 * @param {string} str - Text to search.
 * @returns {number[]} Accepted match positions in ascending order.
 */
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
  var blockedPrevious = null;
  if (currentSoundRegex === "ʒ") {
    blockedPrevious = "d";
  } else if (currentSoundRegex === "ʃ") {
    blockedPrevious = "t";
  }
  var indices = [];
  var result;
  while ((result = pattern.exec(str)) !== null) {
    var isBlocked = blockedPrevious !== null &&
      result.index > 0 &&
      str.charAt(result.index - 1) === blockedPrevious;
    if (!isBlocked) {
      indices.push(result.index);
    }
    // exec() only advances with g (or y); stop instead of looping forever.
    if (!pattern.global && !pattern.sticky) {
      break;
    }
    // Step over zero-length matches so the loop always makes progress.
    if (result[0] === '') {
      pattern.lastIndex += 1;
    }
  }
  return indices;
}
/**
 * Finds every position in word_transcription matched by each pattern of
 * sounds_regex_array and pairs it with the sound at the same array index.
 *
 * @param {string[]} sounds_array - Sound labels, parallel to sounds_regex_array.
 * @param {string[]} sounds_regex_array - Regular-expression sources.
 * @param {string} word_transcription - Text to search.
 * @returns {Array} List of [index, sound] pairs, grouped by pattern.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
  var allIndices = [];
  for (var k = 0; k < sounds_regex_array.length; k += 1) {
    var source = sounds_regex_array[k];
    // The two lookbehind patterns are reduced to their bare symbol here;
    // getIndicesOfRegex applies the "not preceded by" rule by hand.
    // Background: http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
    switch (source) {
      case "(?<!d)ʒ":
        source = "ʒ";
        break;
      case "(?<!t)ʃ":
        source = "ʃ";
        break;
    }
    var positions = getIndicesOfRegex(source, new RegExp(source, "g"), word_transcription);
    var sound = sounds_array[k];
    for (var p = 0; p < positions.length; p += 1) {
      allIndices.push([positions[p], sound]);
    }
  }
  return allIndices;
}

Reducing duplicate characters in a string to a given minimum

I was messing around with the first question here: Reduce duplicate characters to a desired minimum and am looking for more elegant answers than what I came up with. It passes the test but curious to see other solutions. The sample tests are:
reduceString('aaaabbbb', 2) 'aabb'
reduceString('xaaabbbb', 2) 'xaabb'
reduceString('aaaabbbb', 1) 'ab'
reduceString('aaxxxaabbbb', 2) 'aaxxaabb'
and my solution (that passes these tests):
/**
 * Shortens every run of identical consecutive characters in str to at most
 * `amount` characters.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var reduceString = function (str, amount) {
  var result = '';
  // Number of characters of the current run seen so far, not counting the
  // run's final character (which is always kept by the else branch).
  var seenInRun = 0;
  for (var i = 0; i < str.length; i++) {
    if (str[i] === str[i + 1]) {
      seenInRun++;
      if (seenInRun < amount) {
        result += str[i];
      }
    } else {
      seenInRun = 0;
      result += str[i];
    }
  }
  return result;
};
Just use regular expressions.
/**
 * Shortens every run of identical consecutive characters to `amount`
 * characters by deleting each character that is still followed by `amount`
 * copies of itself.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var reduceString = function (str, amount) {
  // [\s\S] instead of "." so that line terminators are reduced as well.
  var re = new RegExp("([\\s\\S])(?=\\1{" + amount + "})", "g");
  return str.replace(re, "");
};
I guess my best solution would be like
var str = "axxxaabbbbcaaxxxaab";
/**
 * Shortens every run of identical consecutive characters in s to at most n
 * characters.
 *
 * Only runs LONGER than n are touched, so short runs are never padded up to
 * n, and any character (not just \w) is handled.
 *
 * @param {string} s - Input text.
 * @param {number} n - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var redStr = (s, n) =>
  s.replace(new RegExp("([\\s\\S])\\1{" + n + ",}", "g"), (run, ch) => ch.repeat(n));
console.log(redStr(str, 2));
I tried to make it as short as possible:
/**
 * Shortens every run of identical consecutive characters in str to at most
 * `amount` characters.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var reduceString = function (str, amount) {
  var finalString = '';
  var previous = null;
  var runLength = 0;
  // for...of walks code points, so characters stored as surrogate pairs
  // (e.g. emoji) are compared as whole characters.
  for (var ch of str) {
    runLength = ch === previous ? runLength + 1 : 1;
    previous = ch;
    if (runLength <= amount) {
      finalString += ch;
    }
  }
  return finalString;
};
You can use reg expression instead. tested in javascript.
how it works:
(.) //match any character
\1 //if it follow by the same character
{n,} //repeated n or more times (written {2,} for n = 2)
/g //global
$1 //is 1 time by $1$1 is 2 times
reduceString('aaaabbbb', 2);
reduceString('xaaabbbb', 2);
reduceString('aaaabbbb', 1);
reduceString('aaxxxaabbbb', 2);
/**
 * Shortens every run of identical consecutive characters in txt to at most
 * num characters, logs the result and returns it.
 *
 * @param {string} txt - Input text.
 * @param {number} num - Maximum run length to keep (values below 1 act as 1).
 * @returns {string} The reduced text.
 */
function reduceString(txt, num)
{
  // At least one character of each run is always kept.
  var keep = Math.max(1, num);
  // The quantifier follows `keep`, so only runs longer than the limit match.
  var longRun = new RegExp('([\\s\\S])\\1{' + keep + ',}', 'g');
  var reduced = txt.replace(longRun, function (run, ch) {
    return ch.repeat(keep);
  });
  console.log(reduced);
  return reduced;
}
With regex:
/**
 * Shortens every run of identical consecutive characters in str to at most
 * `amount` characters, handling one distinct character at a time.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var reduceString = function (str, amount) {
  var distinct = [...new Set(str)];
  for (var c of distinct) {
    // Escape regex metacharacters so ".", "(", "$" etc. match literally.
    var literal = c.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Grouping keeps the quantifier on the whole character even when it is
    // stored as a surrogate pair.
    var rex = new RegExp('(?:' + literal + '){' + amount + ',}', 'g');
    var kept = string(c, amount);
    // Function replacer: "$" in `kept` must not be read as a replacement pattern.
    str = str.replace(rex, function () { return kept; });
  }
  return str;
};
/**
 * Returns c repeated `amount` times (empty string when amount <= 0).
 *
 * @param {string} c - Text to repeat.
 * @param {number} amount - Number of repetitions.
 * @returns {string} The repeated text.
 */
var string = function (c, amount) {
  var s = "";
  for (var i = 0; i < amount; i++) {
    s += c;
  }
  return s;
};
The regex solutions above are much better, but here is my accepted solution with reduce:
make an array from string via spread operator
Check the previous item
find how many times char is repeated in result string
otherwise concat result string with the current char
Don`t forget to use the second argument as the initial value, and return for each cases
/**
 * Shortens every run of identical consecutive characters in str to at most
 * `amount` characters, using a single reduce pass.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced text.
 */
var reduceString = function (str, amount) {
  var initial = { text: '', last: null, run: 0 };
  var finalState = [...str].reduce(function (state, cur) {
    // Length of the run that ends at `cur`. Only the trailing run matters;
    // earlier occurrences of the same character must not be counted.
    var run = cur === state.last ? state.run + 1 : 1;
    return {
      text: run <= amount ? state.text + cur : state.text,
      last: cur,
      run: run
    };
  }, initial);
  return finalState.text;
};

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Length of the string in code points: every well-formed surrogate pair
 * counts once instead of twice.
 *
 * @returns {number} Number of code points.
 */
String.prototype.getCodePointLength = function () {
  var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  var pairCount = pairs === null ? 0 : pairs.length;
  return this.length - pairCount;
};
// Fallback for engines without a native String.fromCodePoint. The guard
// prevents replacing the built-in where one exists.
if (typeof String.fromCodePoint !== 'function') {
  /**
   * Builds a string from code points; values above U+FFFF are emitted as a
   * surrogate pair.
   *
   * @param {...number} codePoints - Code points to convert.
   * @returns {string} The resulting string.
   */
  String.fromCodePoint = function () {
    var units = [];
    for (var i = 0; i < arguments.length; i++) {
      var n = arguments[i] - 0x10000;
      if (n >= 0) {
        // High surrogate carries the top 10 bits, low surrogate the bottom 10.
        units.push(0xD800 + (n >> 10), 0xDC00 + (n & 0x3FF));
      } else {
        units.push(arguments[i]);
      }
    }
    return String.fromCharCode.apply(null, units);
  };
}
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
  /**
   * Length of the string in code points: each well-formed surrogate pair
   * counts as one character.
   * Approach taken from
   * http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
   *
   * @returns {number} Number of code points.
   */
  String.prototype.ucLength = function () {
    var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    var pairCount = pairs === null ? 0 : pairs.length;
    return this.length - pairCount;
  };
}
// Fallback installed only when the engine has no String.prototype.codePointAt.
// NOTE(review): this fallback indexes by CODE POINT and returns null when the
// position is not found, whereas the standard codePointAt indexes by UTF-16
// code unit and returns undefined. Callers therefore see different behavior
// depending on whether the guard below is taken - confirm which is intended.
if (!String.prototype.codePointAt) {
/**
 * Returns the code point at code-point position ucPos.
 *
 * @param {number} ucPos - Zero-based position counted in code points; NaN is treated as 0.
 * @returns {?number} The code point, or null when ucPos is never reached.
 */
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
// ucIndex counts code points; i counts UTF-16 code units.
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
// High surrogate immediately followed by a low surrogate.
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
// Combine the pair into one code point, or use the single unit as is.
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
// Skip both units of a pair so it counts as one position.
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
  /**
   * Fallback for String.fromCodePoint: converts each argument to its string
   * form, emitting a surrogate pair for values above U+FFFF.
   *
   * @returns {string} Concatenation of all converted arguments.
   */
  String.fromCodePoint = function () {
    var pieces = [];
    for (var k = 0; k < arguments.length; k += 1) {
      var cp = arguments[k];
      var piece;
      if (cp > 0xFFFF) {
        // Split the 20-bit offset into high and low surrogate halves.
        var rel = cp - 0x10000;
        piece = String.fromCharCode(0xD800 + (rel >> 10), 0xDC00 + (rel & 0x3FF));
      } else {
        piece = String.fromCharCode(cp);
      }
      pieces.push(piece);
    }
    return pieces.join("");
  };
}
if (!String.prototype.ucCharAt) {
  /**
   * Returns the character at code-point position ucIndex, treating each
   * surrogate pair as a single character.
   *
   * Self-contained on purpose: a native String.prototype.codePointAt takes
   * a UTF-16 index, so it cannot be used for code-point positions.
   *
   * @param {number} ucIndex - Zero-based position in code points; NaN is treated as 0.
   * @returns {string} The character, or '' when ucIndex is out of range.
   */
  String.prototype.ucCharAt = function (ucIndex) {
    var str = String(this);
    if (isNaN(ucIndex)) {
      ucIndex = 0;
    }
    // One array entry per code point; lone surrogates stay single entries.
    var symbols = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    var ucChar = symbols[ucIndex];
    return typeof ucChar === 'string' ? ucChar : '';
  };
}
if (!String.prototype.ucIndexOf) {
  /**
   * Code-point-aware indexOf: scans forward from ucStart and returns the
   * first position (counted in code points) where searchStr occurs.
   *
   * @param {*} searchStr - Value to look for; converted with String().
   * @param {number} [ucStart] - Start position; NaN or negative values mean 0.
   * @returns {number} Position of the first occurrence, or -1.
   */
  String.prototype.ucIndexOf = function (searchStr, ucStart) {
    var from = (isNaN(ucStart) || ucStart < 0) ? 0 : ucStart;
    var haystack = String(this);
    var haystackLength = haystack.ucLength();
    var needle = String(searchStr);
    var needleLength = needle.ucLength();
    for (var pos = from; pos < haystackLength; pos++) {
      if (haystack.ucSlice(pos, pos + needleLength) == needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucLastIndexOf) {
  /**
   * Code-point-aware lastIndexOf: scans backwards from ucStart and returns
   * the first position (counted in code points) where searchStr occurs.
   *
   * @param {*} searchStr - Value to look for; converted with String().
   * @param {number} [ucStart] - Start position; NaN or too-large values mean the last position.
   * @returns {number} Position of the last occurrence at or before ucStart, or -1.
   */
  String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
    var haystack = String(this);
    var haystackLength = haystack.ucLength();
    var from = (isNaN(ucStart) || ucStart >= haystackLength) ? haystackLength - 1 : ucStart;
    var needle = String(searchStr);
    var needleLength = needle.ucLength();
    for (var pos = from; pos >= 0; pos--) {
      if (haystack.ucSlice(pos, pos + needleLength) == needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucSlice) {
  /**
   * Code-point-aware slice: like String.prototype.slice, but positions are
   * counted in code points, so surrogate pairs are never split.
   *
   * @param {number} [ucStart] - Start position; NaN means 0, negative counts from the end.
   * @param {number} [ucStop] - End position (exclusive); omitted means "to the end",
   *   negative counts from the end, values past the end are clamped.
   * @returns {string} The selected characters.
   */
  String.prototype.ucSlice = function (ucStart, ucStop) {
    var str = String(this);
    if (isNaN(ucStart)) {
      ucStart = 0;
    }
    // One array entry per code point; lone surrogates stay single entries.
    var symbols = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    // Array.prototype.slice supplies the negative-index, omitted-stop and
    // clamping rules, and splitting once keeps this a single pass.
    return symbols.slice(ucStart, ucStop).join("");
  };
}
if (!String.prototype.ucSplit) {
  /**
   * Code-point-aware split. With an empty-string delimiter the string is
   * split into characters without breaking surrogate pairs; any other
   * delimiter is passed to the built-in split.
   *
   * @param {string|RegExp} delimeter - Separator; '' splits into characters.
   * @param {number} [limit] - Maximum number of items; omitted means no limit.
   * @returns {string[]} The pieces.
   */
  String.prototype.ucSplit = function (delimeter, limit) {
    var str = String(this);
    if (delimeter !== '') {
      return str.split(delimeter, limit);
    }
    // One array entry per code point; lone surrogates stay single entries.
    var ucChars = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    // Apply the limit only when given: arithmetic on undefined yields NaN,
    // which would truncate the result to an empty array.
    if (typeof limit === 'undefined') {
      return ucChars;
    }
    return ucChars.slice(0, limit >>> 0);
  };
}
More recent JavaScript engines have String.fromCodePoint.
// U+20001 is above U+FFFF, so it lies outside the Basic Multilingual Plane
// and is stored as a surrogate pair.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Counts the code points of str by stepping through its default iterator,
 * which yields one item per code point.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of code points.
 */
function countCodePoints( str )
{
  const iterator = str[Symbol.iterator]();
  let total = 0;
  for ( let step = iterator.next(); !step.done; step = iterator.next() ) {
    total += 1;
  }
  return total;
}
// length counts UTF-16 code units, the iterator counts code points.
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Length of the string in code points, obtained from the string iterator
 * (which yields surrogate pairs as single items).
 *
 * @returns {number} Number of code points.
 */
String.prototype.magicLength = function()
{
  return Array.from(this).length;
}
/**
 * Returns the n-th character of the string, counting code points rather
 * than UTF-16 code units.
 *
 * @param {number} n - Zero-based position in code points.
 * @returns {string} The character, or "" when n is never reached.
 */
String.prototype.magicCharAt = function(n)
{
  var position = 0;
  for (var symbol of this) // one item per code point
  {
    if (position == n)
    {
      return symbol + "";
    }
    position += 1;
  }
  return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To handle composite emojis such as the one in the comment, one can search for the joining character (the zero-width joiner, U+200D, char code 8205) and merge the pieces around it. Here is how:
let myStr = "👩‍👩‍👧‍👧😃𝌆"
let arr = [...myStr]
// Walk backwards so that splicing never shifts entries still to be visited.
// Start at the second-to-last entry and stop at index 1: a joiner needs a
// neighbour on both sides to be merged.
for (let i = arr.length - 2; i >= 1; i--) {
  if (arr[i].charCodeAt(0) === 8205) { // U+200D, zero-width joiner
    arr[i - 1] += arr[i] + arr[i + 1]; // glue left part + joiner + right part
    arr.splice(i, 2); // drop the joiner and the right part
  }
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
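Char code 8205 (U+200D) is the Unicode zero-width joiner: emoji sequences use it to combine several emojis into a single displayed character. This is a Unicode mechanism rather than something specific to JS, which is why the joined parts have to be merged back by hand after splitting.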

What's the best way to count keywords in JavaScript?

What's the best and most efficient way to count keywords in JavaScript? Basically, I'd like to take a string and get the top N words or phrases that occur in the string, mainly for the use of suggesting tags. I'm looking more for conceptual hints or links to real-life examples than actual code, but I certainly wouldn't mind if you'd like to share code as well. If there are particular functions that would help, I'd also appreciate that.
Right now I think I'm at using the split() function to separate the string by spaces and then cleaning punctuation out with a regular expression. I'd also want it to be case-insensitive.
Cut, paste + execute demo:
var text = "Text to be examined to determine which n words are used the most";
// Find 'em!
// A word is a run of word characters, optionally followed by an apostrophe
// suffix of one or two characters (e.g. "don't", "we're").
var wordRegExp = /\w+(?:'\w{1,2})?/g;
// Null-prototype object: words such as "constructor" or "toString" must not
// collide with inherited properties.
var words = Object.create(null);
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
  var word = matches[0].toLowerCase();
  words[word] = (words[word] || 0) + 1;
}
// Sort 'em!
var wordList = [];
for (var word in words)
{
  wordList.push([word, words[word]]);
}
// Most frequent first.
wordList.sort(function(a, b) { return b[1] - a[1]; });
// Come back any time, straaanger!
var n = 10;
var message = ["The top " + n + " words are:"];
// Stop at the end of the list when the text has fewer than n distinct words.
for (var i = 0; i < n && i < wordList.length; i++)
{
  message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
    (wordList[i][1] == 1 ? "" : "s"));
}
alert(message.join("\n"));
Reusable function:
/**
 * Returns the n most frequent words of text, most frequent first.
 * Matching is case-insensitive; a word is a run of word characters with an
 * optional apostrophe suffix of one or two characters.
 *
 * @param {string} text - Text to analyse.
 * @param {number} n - Maximum number of words to return.
 * @returns {string[]} Up to n lower-cased words; fewer when the text has
 *   fewer distinct words.
 */
function getTopNWords(text, n)
{
  var wordRegExp = /\w+(?:'\w{1,2})?/g;
  // Null-prototype object: words such as "constructor" must not collide
  // with inherited properties.
  var counts = Object.create(null);
  var matches;
  while ((matches = wordRegExp.exec(text)) !== null)
  {
    var word = matches[0].toLowerCase();
    counts[word] = (counts[word] || 0) + 1;
  }
  var wordList = [];
  for (var key in counts)
  {
    wordList.push([key, counts[key]]);
  }
  wordList.sort(function(a, b) { return b[1] - a[1]; });
  var topWords = [];
  // Bounded by the list length so a large n cannot read past the end.
  for (var i = 0; i < n && i < wordList.length; i++)
  {
    topWords.push(wordList[i][0]);
  }
  return topWords;
}
Once you have that array of words cleaned up, and let's say you call it wordArray:
// Word -> count dictionary. Created without a prototype so that words such
// as "hasOwnProperty" or "constructor" are ordinary keys and cannot shadow
// or collide with inherited members.
var keywordRegistry = Object.create(null);
for(var i = 0; i < wordArray.length; i++) {
  var currentWord = wordArray[i];
  keywordRegistry[currentWord] = (keywordRegistry[currentWord] || 0) + 1;
}
// now keywordRegistry will have, as properties, all of the
// words in your word array with their respective counts
// this will alert (choose something better than alert) all words and their counts
for(var keyword in keywordRegistry) {
  alert("The keyword '" + keyword + "' occurred " + keywordRegistry[keyword] + " times");
}
That should give you the basics of doing this part of the work.
Try to split your string into words and count the resulting words, then sort on the counts.
This builds upon a previous answer by insin by only having one loop:
/**
 * Returns the n most frequent words of text together with their counts.
 *
 * @param {string} text - Text to analyse; compared case-insensitively.
 * @param {number} n - Maximum number of entries to return.
 * @returns {Array} Up to n [word, count] pairs, most frequent first.
 */
function top_words(text, n) {
  // Split text on non word characters
  var words = text.toLowerCase().split(/\W+/);
  // word -> index into word_counts. Null-prototype object, because an Array
  // or plain object would already "contain" keys such as "length" or
  // "constructor".
  var positions = Object.create(null);
  var word_counts = [];
  for (var i = 0; i < words.length; i++) {
    var word = words[i];
    // Leading or trailing separators produce empty strings; skip them.
    if (!word) {
      continue;
    }
    if (positions[word] === undefined) {
      positions[word] = word_counts.length;
      word_counts.push([word, 1]);
    } else {
      word_counts[positions[word]][1]++;
    }
  }
  // Put most frequent words at the beginning.
  word_counts.sort(function (a, b) { return b[1] - a[1]; });
  // Return the first n items
  return word_counts.slice(0, n);
}
// Let's see if it works: show the three most frequent words of a sample
// sentence (words differing only in case are counted together).
var text = "Words in here are repeated. Are repeated, repeated!"
alert(top_words(text, 3))
The result of the example is: [['repeated',3], ['are',2], ['words', 1]]
I would do exactly what you have mentioned above to isolate each word. I would then probably add each word as the index of an array with the number of occurrences as the value.
For example:
// Word -> count dictionary. A prototype-less object is used instead of an
// Array: string keys are not array elements, and inherited names such as
// "constructor" or "length" would otherwise corrupt the counts.
var a = Object.create(null);
a[word] = a[word] ? a[word] + 1 : 1;
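Now you know how many occurrences of each word existed (a[word]). Note that a.length does not give the number of unique words, because string keys do not affect an array's length; use Object.keys(a).length for that.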

Categories

Resources