How to know which group has been found in a RegEx [duplicate]

How to know which group has been found in a RegEx [duplicate] - javascript

This question already has answers here:
Get index of each capture in a JavaScript regex
(7 answers)
Closed 1 year ago.
When I write a regular expression like:
var m = /(s+).*?(l)[^l]*?(o+)/.exec("this is hello to you");
console.log(m);
I get a match object containing the following:
{
0: "s is hello",
1: "s",
2: "l",
3: "o",
index: 3,
input: "this is hello to you"
}
I know the index of the entire match from the index property, but I also need to know the start and end of the groups matched. Using a simple search won't work. In this example it will find the first 'l' instead of the one found in the group.
Is there any way to get the offset of a matched group?

You can't directly get the index of a match group. What you have to do is first put every character in a match group, even the ones you don't care about:
var m= /(s+)(.*?)(l)([^l]*?)(o+)/.exec('this is hello to you');
Now you've got the whole match in parts:
['s is hello', 's', ' is hel', 'l', '', 'o']
So you can add up the lengths of the strings before your group to get the offset from the match index to the group index:
function indexOfGroup(match, n) {
var ix= match.index;
for (var i= 1; i<n; i++)
ix+= match[i].length;
return ix;
}
console.log(indexOfGroup(m, 3)); // 11

I wrote a simple (well the initialization got a bit bloated) javascript object to solve this problem on a project I've been working on recently. It works the same way as the accepted answer but generates the new regexp and pulls out the data you requested automatically.
var exp = new MultiRegExp(/(firstBit\w+)this text is ignored(optionalBit)?/i);
var value = exp.exec("firstbitWithMorethis text is ignored");
value = {0: {index: 0, text: 'firstbitWithMore'},
1: null};
Git Repo: My MultiRegExp. Hope this helps someone out there.
edit Aug, 2015:
Try me: MultiRegExp Live.

Another javascript class which is also able to parse nested groups is available under: https://github.com/valorize/MultiRegExp2
Usage:
let regex = /a(?: )bc(def(ghi)xyz)/g;
let regex2 = new MultiRegExp2(regex);
let matches = regex2.execForAllGroups('ababa bcdefghixyzXXXX'));
Will output:
[ { match: 'defghixyz', start: 8, end: 17 },
{ match: 'ghi', start: 11, end: 14 } ]

I have played around with adding nested capture groups and named groups with position information.
You can play with some regex on jsfiddle...
https://jsfiddle.net/smuchow1962/z5dj9gL0/
/*
Copyright (c) 2019 Steven A Muchow
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Enhanced RegEx JS processing
Adds position information for capture groups (nested ones too) AND named group items.
*/
class RegexContainer {
static _findCaptureGroupsInRegexTemplate(re, input) {
let refCount = 0; let matches = []; let res; let data;
re.lastIndex = 0;
while ((res = re.exec(input)) !== null) {
if (isCapturingStartItem(res[0])) {
refCount++;
data = {parent: 0, refCount: refCount, start: res.index};
if (res.groups.name) { data.name = res.groups.name; }
matches.push(data);
} else if (input.charAt(res.index) === ')') {
let idx = matches.length;
while (idx--) {
if (matches[idx].end === undefined) {
matches[idx].end = re.lastIndex;
matches[idx].source = input.substring(matches[idx].start, matches[idx].end);
break;
}
}
refCount--;
let writeIdx = idx;
while (idx--) {
if (matches[idx].refCount === refCount) {
matches[writeIdx].parent = idx + 1;
break;
}
}
}
}
matches.unshift({start: 0, end: input.length, source: input});
return matches;
function isCapturingStartItem(str) {
if (str !== '(') { return (str.search(/\(\?<\w/)!==-1); }
return true;
}
}
static execFull(re, input, foundCaptureItems) {
let result; let foundIdx; let groupName; const matches = [];
while ((result = re.exec(input)) !== null) {
let array = createCustomResultArray(result);
array.forEach((match, idx) => {
if (!idx) {
match.startPos = match.endPos = result.index;
match.endPos += result[0].length;
delete match.parent;
return;
}
let parentStr = array[match.parent].data;
foundIdx = (match.parent < idx - 1) ? parentStr.lastIndexOf(match.data) : parentStr.indexOf(match.data);
match.startPos = match.endPos = foundIdx + array[match.parent].startPos;
match.endPos += match.data.length;
if ((groupName = foundCaptureItems[idx].name)) { match.groupName = groupName; }
});
matches.push(array);
if (re.lastIndex === 0) { break; }
}
return matches;
function createCustomResultArray(result) {
let captureVar = 0;
return Array.from(result, (data) => {
return {data: data || '', parent: foundCaptureItems[captureVar++].parent,};
});
}
}
static mapCaptureAndNameGroups(inputRegexSourceString) {
let REGEX_CAPTURE_GROUPS_ANALYZER = /((((?<!\\)|^)\((\?((<(?<name>\w+)))|(\?<=.*?\))|(\?<!.*?\))|(\?!.*?\))|(\?=.*?\)))?)|((?<!\\)\)(([*+?](\?)?))?|({\d+(,)?(\d+)?})))/gm;
return RegexContainer._findCaptureGroupsInRegexTemplate(REGEX_CAPTURE_GROUPS_ANALYZER, inputRegexSourceString);
}
static exec(re, input) {
let foundCaptureItems = RegexContainer.mapCaptureAndNameGroups(re.source);
let res = RegexContainer.execFull(re, input, foundCaptureItems);
return {captureItems: foundCaptureItems, results: res};
}
}
let answers = [];
let regex = [
{ re: "[ \\t]*?\\[\\[(?<inner>\\s*(?<core>\\w(.|\\s)*?)\\s*?)]]", label: "NESTED Regex"},
{ re: "(?<context>((\\w)(\\w|-)*))((?<separator>( - ))?(?<type>(-|\\w)+)?\\s*(?<opt>(\\{.*}))?)?[\\t ]*", label: "simpler regex" },
]
let input = "[[ context1 ]] [[ context2 - with-style { andOpts : {data: 'some info'} } ]]";
regex.forEach( (item) => {
let re = new RegExp(item.re, 'gm');
let result = RegexContainer.exec(re,input);
result.label = item.label;
answers.push(result);
});
answers.forEach((answer,index) => {
console.log('==========================================================');
console.log('==== Item ' + index + ' label: ' + answer.label + ' regex: ' + answer.captureItems[0].source );
console.log('==========================================================\n\n');
let scannedItems = answer.results;
scannedItems.forEach( (match) => {
let full = match[0];
let mstr = full.data;
let substr = input.substring(full.startPos, full.endPos);
if (mstr !== substr) {
console.log('error in the parsing if you get here');
return;
}
console.log('==== Checking ' + mstr);
for (let i=1; i<match.length; i++) {
let capture = match[i];
if (capture.groupName) {
console.log(' ' + capture.groupName + ': ' + "```" + input.substring(capture.startPos,capture.endPos) + "```");
}
}
console.log('');
});
});
Architecture
Take Regex Template and identify the capture groups it will generate. Save it off as an array of group items and nesting info to feed into the expanded exec() call.
use regex to find capturing starts, non-capturing elements, capture names and capture endings. Trap properly for the dreaded \( and \) items.
non-recursive inspection of capture items and their parents (using reference counting).
run the exec() with the capture group information pulled above.
use substring functions to extract data for each capture group
put everything into an array for each result found and send the array back.

Based on the ecma regular expression syntax I've written a parser respective an extension of the RegExp class which solves besides this problem (full indexed exec method) as well other limitations of the JavaScript RegExp implementation for example: Group based search & replace. You can test and download the implementation here (is as well available as NPM module).
The implementation works as follows (small example):
//Retrieve content and position of: opening-, closing tags and body content for: non-nested html-tags.
var pattern = '(<([^ >]+)[^>]*>)([^<]*)(<\\/\\2>)';
var str = '<html><code class="html plain">first</code><div class="content">second</div></html>';
var regex = new Regex(pattern, 'g');
var result = regex.exec(str);
console.log(5 === result.length);
console.log('<code class="html plain">first</code>'=== result[0]);
console.log('<code class="html plain">'=== result[1]);
console.log('first'=== result[3]);
console.log('</code>'=== result[4]);
console.log(5=== result.index.length);
console.log(6=== result.index[0]);
console.log(6=== result.index[1]);
console.log(31=== result.index[3]);
console.log(36=== result.index[4]);
I tried as well the implementation from #velop but the implementation seems buggy for example it does not handle backreferences correctly e.g. "/a(?: )bc(def(\1ghi)xyz)/g" - when adding paranthesis in front then the backreference \1 needs to be incremented accordingly (which is not the case in his implementation).

For global regex you want to match only fragments and iterate so first solution won't work. This is a 30 min solution based on indexOf and sums that work for this case:
https://codepen.io/cancerberoSgx/pen/qYwjjz?editors=0012#code-area
!function () {
const regex = /\/\*\*\*#\s*([^#]+)\s*(#\*\*\*\/)/gim
const exampleThatMatch = `
/***#
debug.print('hello editor, simpleNode kind is ' +
arg.simpleNode.getKindName())
#***/
const a = 1 //user
/***#
debug.print(arg.simpleNode.getParent().getKindName())
#***/
`
const text = exampleThatMatch
function exec(r, s) {
function indexOfGroup(match, n) {
var ix = match.index;
for (var i = 1; i < n; i++)
ix += match[i].length;
return ix;
}
let result
let lastMatchIndex = 0
const matches = []
while ((result = regex.exec(text))) {
const match = []
lastMatchIndex = text.indexOf(result[0], lastMatchIndex)
let relIndex = 0
for (let i = 1; i < result.length; i++) {
relIndex = text.indexOf(result[i], relIndex)
match.push({ value: result[i], start: relIndex, end: relIndex + result[i].length })
}
matches.push(match)
}
return matches
}
const groupsWithIndex = exec(regex, text)
console.log({RESULT: groupsWithIndex })
// now test - let's remove everything else but matched groups
let frag = '' , sep = '\n#######\n'
groupsWithIndex.forEach(match => match.forEach(group => {
frag += text.substring(group.start, group.end) + sep
}))
console.log('The following are only the matched groups usign the result and text.substring just to verify it works OK:', '\n'+sep)
console.log(frag)
}()
And just in case here is the typescript:
https://codepen.io/cancerberoSgx/pen/yjrXxx?editors=0012
|
Enjoy

Related

How to find an unknown pattern, that is present in multiple strings?

I'm trying to find a part in multiple strings, that all strings share in common. For example:
const string1 = '.bold[_ngcontent="_kjhafh-asda-qw"] {background:black;}';
const string2 = '[_ngcontent="_kjhafh-asda-qw"] {background-color:hotpink;}';
const string3 = 'div > p > span[_ngcontent="_kjhafh-asda-qw"] {background:hotpink;}'
I don't know in advance what exactly the string is that I'm looking for, so I have to loop over the strings and find out. In the example above, the pattern would be [_ngcontent="_kjhafh-asda-qw"].
Is this even possible? Also, it would have to understand that maybe no such pattern exists. And are there methods for that or do I need to implement such an algorithm myself?
EDIT (context): We are building a validator, that checks a micro-frontend for global CSS rules (not prefixed and outside a shadow-dom), by loading it in isolation in a headless browser (within a jenkins pipeline) and validate, that it should not break any other stuff by global rules, that might be outside the context of the micro-frontend, on the same page. Using a headless browser, we can make use of the document.styleSheets property and not miss any styles that are being loaded. This will find <style> tags and its contents, aswell as content of external stylesheets.

Leveraging the BLAST algorithm, the following code snippet seeks successively matching substrings.
//
// See https://stackoverflow.com/questions/13006556/check-if-two-strings-share-a-common-substring-in-javascript/13007065#13007065
// for the following function...
//
String.prototype.subCompare = function(needle, haystack, minLength) {
var i,j;
haystack = haystack || this.toLowerCase();
minLength = minLength || 5;
for (i=needle.length; i>=minLength; i--) {
for (j=0; j <= (needle.length - i); j++) {
var substring = needle.substr(j,i);
var k = haystack.indexOf(substring);
if (k !== -1) {
return {
found : 1,
substring : substring,
needleIndex : j,
haystackIndex : k
};
}
}
}
return {
found : 0
}
}
//
// Iterate through the array of strings, seeking successive matching substrings...
//
strings = [
'.bold[_ngcontent="_kjhafh-asda-qw"] {background:black;}',
'[_ngcontent="_kjhafh-asda-qw"] {background-color:hotpink;}',
'div > p > span[_ngcontent="_kjhafh-asda-qw"] {background:hotpink;}'
]
check = { found: 1, substring: strings[ 0 ] }
i = 1;
while ( check.found && i < strings.length ) {
check = check.substring.subCompare( strings[ i++ ] );
}
console.log( check );
Note that without seeing a larger sampling of string data, it's not clear whether this algorithm satisfies the objective...

Thanks to Trentium's answer, I was able to do it. I adapted the code a little bit, as it was doing too much and also, the substr didn't yield a consistent result (it depended on the order of input strings).
The code could obviously be further minified/simplified.
const findCommonPattern = (base, needle, minLength = 5) => {
const haystack = base.toLowerCase();
for (let i = needle.length; i >= minLength; i--) {
for (let j = 0; j <= needle.length - i; j++) {
let prefix = needle.substr(j, i);
let k = haystack.indexOf(prefix);
if (k !== -1) {
return {
found: true,
prefix,
};
}
}
}
return {
found: false,
};
};
const checkIfCssIsPrefixed = (strings) => {
let check = { found: true };
let matchingStrings = [];
for (let i = 1; check.found && i < strings.length; ++i) {
check = findCommonPattern(strings[0], strings[i]);
matchingStrings.push(check.prefix);
}
// Sort by length and take the shortest string, which will be the pattern that all of the strings share in common.
check.prefix = matchingStrings.sort((a, b) => a.length - b.length)[0];
return check;
};
console.log(
checkIfCssIsPrefixed([
".spacer[_ngcontent-wdy-c0]",
"[_nghost-wdy-c0]",
".toolbar[_ngcontent-wdy-c0]",
"p[_ngcontent-wdy-c0]",
".spacer[_ngcontent-wdy-c0]",
".toolbar[_ngcontent-wdy-c0] img[_ngcontent-wdy-c0]",
"h1[_ngcontent-wdy-c0], h2[_ngcontent-wdy-c0], h3[_ngcontent-wdy-c0], h4[_ngcontent-wdy-c0], h5[_ngcontent-wdy-c0], h6[_ngcontent-wdy-c0]",
".toolbar[_ngcontent-wdy-c0] #twitter-logo[_ngcontent-wdy-c0]",
".toolbar[_ngcontent-wdy-c0] #youtube-logo[_ngcontent-wdy-c0]",
".toolbar[_ngcontent-wdy-c0] #twitter-logo[_ngcontent-wdy-c0]:hover, .toolbar[_ngcontent-wdy-c0] #youtube-logo[_ngcontent-wdy-c0]:hover",
])
);

How to get odd and even position characters from a string?

I'm trying to figure out how to remove every second character (starting from the first one) from a string in Javascript.
For example, the string "This is a test!" should become "hsi etTi sats!"
I also want to save every deleted character into another array.
I have tried using replace method and splice method, but wasn't able to get them to work properly. Mostly because replace only replaces the first character.
function encrypt(text, n) {
if (text === "NULL") return n;
if (n <= 0) return text;
var encArr = [];
var newString = text.split("");
var j = 0;
for (var i = 0; i < text.length; i += 2) {
encArr[j++] = text[i];
newString.splice(i, 1); // this line doesn't work properly
}
}

You could reduce the characters of the string and group them to separate arrays using the % operator. Use destructuring to get the 2D array returned to separate variables
let str = "This is a test!";
const [even, odd] = [...str].reduce((r,char,i) => (r[i%2].push(char), r), [[],[]])
console.log(odd.join(''))
console.log(even.join(''))
Using a for loop:
let str = "This is a test!",
odd = [],
even = [];
for (var i = 0; i < str.length; i++) {
i % 2 === 0
? even.push(str[i])
: odd.push(str[i])
}
console.log(odd.join(''))
console.log(even.join(''))

It would probably be easier to use a regular expression and .replace: capture two characters in separate capturing groups, add the first character to a string, and replace with the second character. Then, you'll have first half of the output you need in one string, and the second in another: just concatenate them together and return:
function encrypt(text) {
let removedText = '';
const replacedText1 = text.replace(/(.)(.)?/g, (_, firstChar, secondChar) => {
// in case the match was at the end of the string,
// and the string has an odd number of characters:
if (!secondChar) secondChar = '';
// remove the firstChar from the string, while adding it to removedText:
removedText += firstChar;
return secondChar;
});
return replacedText1 + removedText;
}
console.log(encrypt('This is a test!'));

Pretty simple with .reduce() to create the two arrays you seem to want.
function encrypt(text) {
return text.split("")
.reduce(({odd, even}, c, i) =>
i % 2 ? {odd: [...odd, c], even} : {odd, even: [...even, c]}
, {odd: [], even: []})
}
console.log(encrypt("This is a test!"));
They can be converted to strings by using .join("") if you desire.

I think you were on the right track. What you missed is replace is using either a string or RegExp.
The replace() method returns a new string with some or all matches of a pattern replaced by a replacement. The pattern can be a string or a RegExp, and the replacement can be a string or a function to be called for each match. If pattern is a string, only the first occurrence will be replaced.
Source: String.prototype.replace()
If you are replacing a value (and not a regular expression), only the first instance of the value will be replaced. To replace all occurrences of a specified value, use the global (g) modifier
Source: JavaScript String replace() Method
So my suggestion would be to continue still with replace and pass the right RegExp to the function, I guess you can figure out from this example - this removes every second occurrence for char 't':
let count = 0;
let testString = 'test test test test';
console.log('original', testString);
// global modifier in RegExp
let result = testString.replace(/t/g, function (match) {
count++;
return (count % 2 === 0) ? '' : match;
});
console.log('removed', result);

like this?
var text = "This is a test!"
var result = ""
var rest = ""
for(var i = 0; i < text.length; i++){
if( (i%2) != 0 ){
result += text[i]
} else{
rest += text[i]
}
}
console.log(result+rest)

Maybe with split, filter and join:
const remaining = myString.split('').filter((char, i) => i % 2 !== 0).join('');
const deleted = myString.split('').filter((char, i) => i % 2 === 0).join('');

You could take an array and splice and push each second item to the end of the array.
function encrypt(string) {
var array = [...string],
i = 0,
l = array.length >> 1;
while (i <= l) array.push(array.splice(i++, 1)[0]);
return array.join('');
}
console.log(encrypt("This is a test!"));

function encrypt(text) {
text = text.split("");
var removed = []
var encrypted = text.filter((letter, index) => {
if(index % 2 == 0){
removed.push(letter)
return false;
}
return true
}).join("")
return {
full: encrypted + removed.join(""),
encrypted: encrypted,
removed: removed
}
}
console.log(encrypt("This is a test!"))
Splice does not work, because if you remove an element from an array in for loop indexes most probably will be wrong when removing another element.

I don't know how much you care about performance, but using regex is not very efficient.
Simple test for quite a long string shows that using filter function is on average about 3 times faster, which can make quite a difference when performed on very long strings or on many, many shorts ones.
function test(func, n){
var text = "";
for(var i = 0; i < n; ++i){
text += "a";
}
var start = new Date().getTime();
func(text);
var end = new Date().getTime();
var time = (end-start) / 1000.0;
console.log(func.name, " took ", time, " seconds")
return time;
}
function encryptREGEX(text) {
let removedText = '';
const replacedText1 = text.replace(/(.)(.)?/g, (_, firstChar, secondChar) => {
// in case the match was at the end of the string,
// and the string has an odd number of characters:
if (!secondChar) secondChar = '';
// remove the firstChar from the string, while adding it to removedText:
removedText += firstChar;
return secondChar;
});
return replacedText1 + removedText;
}
function encrypt(text) {
text = text.split("");
var removed = "";
var encrypted = text.filter((letter, index) => {
if(index % 2 == 0){
removed += letter;
return false;
}
return true
}).join("")
return encrypted + removed
}
var timeREGEX = test(encryptREGEX, 10000000);
var timeFilter = test(encrypt, 10000000);
console.log("Using filter is faster ", timeREGEX/timeFilter, " times")
Using actually an array for storing removed letters and then joining them is much more efficient, than using a string and concatenating letters to it.
I changed an array to string in filter solution to make it the same like in regex solution, so they are more comparable.

word frequency in javascript

How can I implement javascript function to calculate frequency of each word in a given sentence.
this is my code:
function search () {
var data = document.getElementById('txt').value;
var temp = data;
var words = new Array();
words = temp.split(" ");
var uniqueWords = new Array();
var count = new Array();
for (var i = 0; i < words.length; i++) {
//var count=0;
var f = 0;
for (j = 0; j < uniqueWords.length; j++) {
if (words[i] == uniqueWords[j]) {
count[j] = count[j] + 1;
//uniqueWords[j]=words[i];
f = 1;
}
}
if (f == 0) {
count[i] = 1;
uniqueWords[i] = words[i];
}
console.log("count of " + uniqueWords[i] + " - " + count[i]);
}
}
am unable to trace out the problem ..any help is greatly appriciated.
output in this format:
count of is - 1
count of the - 2..
input: this is anil is kum the anil

Here is a JavaScript function to get the frequency of each word in a sentence:
function wordFreq(string) {
var words = string.replace(/[.]/g, '').split(/\s/);
var freqMap = {};
words.forEach(function(w) {
if (!freqMap[w]) {
freqMap[w] = 0;
}
freqMap[w] += 1;
});
return freqMap;
}
It will return a hash of word to word count. So for example, if we run it like so:
console.log(wordFreq("I am the big the big bull."));
> Object {I: 1, am: 1, the: 2, big: 2, bull: 1}
You can iterate over the words with Object.keys(result).sort().forEach(result) {...}. So we could hook that up like so:
var freq = wordFreq("I am the big the big bull.");
Object.keys(freq).sort().forEach(function(word) {
console.log("count of " + word + " is " + freq[word]);
});
Which would output:
count of I is 1
count of am is 1
count of big is 2
count of bull is 1
count of the is 2
JSFiddle: http://jsfiddle.net/ah6wsbs6/
And here is wordFreq function in ES6:
function wordFreq(string) {
return string.replace(/[.]/g, '')
.split(/\s/)
.reduce((map, word) =>
Object.assign(map, {
[word]: (map[word])
? map[word] + 1
: 1,
}),
{}
);
}
JSFiddle: http://jsfiddle.net/r1Lo79us/

I feel you have over-complicated things by having multiple arrays, strings, and engaging in frequent (and hard to follow) context-switching between loops, and nested loops.
Below is the approach I would encourage you to consider taking. I've inlined comments to explain each step along the way. If any of this is unclear, please let me know in the comments and I'll revisit to improve clarity.
(function () {
/* Below is a regular expression that finds alphanumeric characters
Next is a string that could easily be replaced with a reference to a form control
Lastly, we have an array that will hold any words matching our pattern */
var pattern = /\w+/g,
string = "I I am am am yes yes.",
matchedWords = string.match( pattern );
/* The Array.prototype.reduce method assists us in producing a single value from an
array. In this case, we're going to use it to output an object with results. */
var counts = matchedWords.reduce(function ( stats, word ) {
/* `stats` is the object that we'll be building up over time.
`word` is each individual entry in the `matchedWords` array */
if ( stats.hasOwnProperty( word ) ) {
/* `stats` already has an entry for the current `word`.
As a result, let's increment the count for that `word`. */
stats[ word ] = stats[ word ] + 1;
} else {
/* `stats` does not yet have an entry for the current `word`.
As a result, let's add a new entry, and set count to 1. */
stats[ word ] = 1;
}
/* Because we are building up `stats` over numerous iterations,
we need to return it for the next pass to modify it. */
return stats;
}, {} );
/* Now that `counts` has our object, we can log it. */
console.log( counts );
}());

const sentence = 'Hi my friend how are you my friend';
const countWords = (sentence) => {
const convertToObject = sentence.split(" ").map( (i, k) => {
return {
element: {
word: i,
nr: sentence.split(" ").filter(j => j === i).length + ' occurrence',
}
}
});
return Array.from(new Set(convertToObject.map(JSON.stringify))).map(JSON.parse)
};
console.log(countWords(sentence));

Here is an updated version of your own code...
<!DOCTYPE html>
<html>
<head>
<title>string frequency</title>
<style type="text/css">
#text{
width:250px;
}
</style>
</head>
<body >
<textarea id="txt" cols="25" rows="3" placeholder="add your text here"> </textarea></br>
<button type="button" onclick="search()">search</button>
<script >
function search()
{
var data=document.getElementById('txt').value;
var temp=data;
var words=new Array();
words=temp.split(" ");
var unique = {};
for (var i = 0; i < words.length; i++) {
var word = words[i];
console.log(word);
if (word in unique)
{
console.log("word found");
var count = unique[word];
count ++;
unique[word]=count;
}
else
{
console.log("word NOT found");
unique[word]=1;
}
}
console.log(unique);
}
</script>
</body>
I think your loop was overly complicated. Also, trying to produce the final count while still doing your first pass over the array of words is bound to fail because you can't test for uniqueness until you have checked each word in the array.
Instead of all your counters, I've used a Javascript object to work as an associative array, so we can store each unique word, and the count of how many times it occurs.
Then, once we exit the loop, we can see the final result.
Also, this solution uses no regex ;)
I'll also add that it's very hard to count words just based on spaces. In this code, "one, two, one" will results in "one," and "one" as being different, unique words.

While both of the answers here are correct maybe are better but none of them address OP's question (what is wrong with the his code).
The problem with OP's code is here:
if(f==0){
count[i]=1;
uniqueWords[i]=words[i];
}
On every new word (unique word) the code adds it to uniqueWords at index at which the word was in words. Hence there are gaps in uniqueWords array. This is the reason for some undefined values.
Try printing uniqueWords. It should give something like:
["this", "is", "anil", 4: "kum", 5: "the"]
Note there no element for index 3.
Also the printing of final count should be after processing all the words in the words array.
Here's corrected version:
function search()
{
var data=document.getElementById('txt').value;
var temp=data;
var words=new Array();
words=temp.split(" ");
var uniqueWords=new Array();
var count=new Array();
for (var i = 0; i < words.length; i++) {
//var count=0;
var f=0;
for(j=0;j<uniqueWords.length;j++){
if(words[i]==uniqueWords[j]){
count[j]=count[j]+1;
//uniqueWords[j]=words[i];
f=1;
}
}
if(f==0){
count[i]=1;
uniqueWords[i]=words[i];
}
}
for ( i = 0; i < uniqueWords.length; i++) {
if (typeof uniqueWords[i] !== 'undefined')
console.log("count of "+uniqueWords[i]+" - "+count[i]);
}
}
I have just moved the printing of count out of the processing loop into a new loop and added a if not undefined check.
Fiddle: https://jsfiddle.net/cdLgaq3a/

I had a similar assignment. This is what I did:
Assignment : Clean the following text and find the most frequent word (hint, use replace and regular expressions).
const sentence = '%I $am#% a %tea#cher%, &and& I lo%#ve %te#a#ching%;. The#re $is no#th#ing; &as& mo#re rewarding as educa#ting &and& #emp%o#weri#ng peo#ple. ;I found tea#ching m%o#re interesting tha#n any ot#her %jo#bs. %Do#es thi%s mo#tiv#ate yo#u to be a tea#cher!? %Th#is 30#Days&OfJavaScript &is al#so $the $resu#lt of &love& of tea&ching'
console.log(`\n\n 03.Clean the following text and find the most frequent word (hint, use replace and regular expressions) \n\n ${sentence} \n\n`)
console.log(`Cleared sentence : ${sentence.replace(/[.,\/#!$%\^&\*;:{}=\-_`~()#]/g, "")}`)
console.log(mostFrequentWord(sentence))
function mostFrequentWord(sentence) {
sentence = sentence.replace(/[.,\/#!$%\^&\*;:{}=\-_`~()#]/g, "").trim().toLowerCase()
let sentenceArray = sentence.split(" ")
let word = null
let count = 0
for (i = 0; i < sentenceArray.length; i++) {
word = sentenceArray[i]
count = sentence.match(RegExp(sentenceArray[i], 'gi')).length
if (count > count) {
count = count
word = word
}
}
return `\n Count of most frequent word "${word}" is ${count}`
}

I'd go with Sampson's match-reduce method for slightly better efficiency. Here's a modified version of it that is more production-ready. It's not perfect, but it should cover the vast majority of scenarios (i.e., "good enough").
function calcWordFreq(s) {
// Normalize
s = s.toLowerCase();
// Strip quotes and brackets
s = s.replace(/["“”(\[{}\])]|\B['‘]([^'’]+)['’]/g, '$1');
// Strip dashes and ellipses
s = s.replace(/[‒–—―…]|--|\.\.\./g, ' ');
// Strip punctuation marks
s = s.replace(/[!?;:.,]\B/g, '');
return s.match(/\S+/g).reduce(function(oFreq, sWord) {
if (oFreq.hasOwnProperty(sWord)) ++oFreq[sWord];
else oFreq[sWord] = 1;
return oFreq;
}, {});
}
calcWordFreq('A ‘bad’, “BAD” wolf-man...a good ol\' spook -- I\'m frightened!') returns
{
"a": 2
"bad": 2
"frightened": 1
"good": 1
"i'm": 1
"ol'": 1
"spook": 1
"wolf-man": 1
}

Return url for the first item for which the content part matches pattern

How can I return the url part of the first page in pages for which the content part matches pattern (in the style of function 1, case insensitive) but return an empty string if no page is found?
e.g.
url1(pages,"GREAT") returns "www.xyz.ac.uk"
url1(pages,"xyz") returns ""
Here is my code so far:
var pg = [ "|www.cam.ac.uk|Cambridge University offers degree programmes and world class research." , "!www.xyz.ac.uk!An great University" , "%www%Yet another University" ];
var pt = "great";
function url1(pages, pattern) {
var result = "";
for (x in pages) {
current = pages[x].split(pattern);
result = current[1];
}
return result;
}
alert(url1(pg, pt));

Try this:
var pg = [ "|www.cam.ac.uk|Cambridge University offers degree programmes and world class research." , "!www.xyz.ac.uk!An great University" , "%www%Yet another University" ];
function find(pages, pattern) {
var i, l, page, arr;
pattern = pattern.toLowerCase();
for(i=0, l=pages.length; i<l; i++) {
page = pages[i];
arr = page.split(page[0]);
if(arr.slice(2).join(page[0]).toLowerCase().indexOf(pattern) >=0) {
return arr[1];
}
}
return '';
}
console.log(find(pg,'great')); // 'www.xyz.ac.uk'
console.log(find(pg,'xyz')); // ''
Fiddle here: http://jsbin.com/uGebeQi/2/edit

Here's another solution using a regular expression. I feel this is slightly better than blindly splitting on the first character since the content could always contain the first character as well.
Note: The accepted answer was corrected.
function url1(pages, pattern) {
var rx = /^(.)(.+?)\1(.+)/,
i = 0,
len = pages.length,
matches;
for (; i < len; i++) {
if ((matches = pages[i].match(rx)) && matches[3].indexOf(pattern) !== -1)
return matches[2];
}
return '';
}
//www.xyz.ac.uk
url1(['!www.xyz.ac.uk!An great University! The best in the world!'], 'best');
You can always lowercase the pattern and the content if you want case-insensitive searches.

How to find indices of all occurrences of one string in another in JavaScript?

I'm trying to find the positions of all occurrences of a string in another string, case-insensitive.
For example, given the string:
I learned to play the Ukulele in Lebanon.
and the search string le, I want to obtain the array:
[2, 25, 27, 33]
Both strings will be variables - i.e., I can't hard-code their values.
I figured that this was an easy task for regular expressions, but after struggling for a while to find one that would work, I've had no luck.
I found this example of how to accomplish this using .indexOf(), but surely there has to be a more concise way to do it?

var str = "I learned to play the Ukulele in Lebanon."
var regex = /le/gi, result, indices = [];
while ( (result = regex.exec(str)) ) {
indices.push(result.index);
}
UPDATE
I failed to spot in the original question that the search string needs to be a variable. I've written another version to deal with this case that uses indexOf, so you're back to where you started. As pointed out by Wrikken in the comments, to do this for the general case with regular expressions you would need to escape special regex characters, at which point I think the regex solution becomes more of a headache than it's worth.
function getIndicesOf(searchStr, str, caseSensitive) {
var searchStrLen = searchStr.length;
if (searchStrLen == 0) {
return [];
}
var startIndex = 0, index, indices = [];
if (!caseSensitive) {
str = str.toLowerCase();
searchStr = searchStr.toLowerCase();
}
while ((index = str.indexOf(searchStr, startIndex)) > -1) {
indices.push(index);
startIndex = index + searchStrLen;
}
return indices;
}
var indices = getIndicesOf("le", "I learned to play the Ukulele in Lebanon.");
document.getElementById("output").innerHTML = indices + "";
<div id="output"></div>

One liner using String.prototype.matchAll (ES2020):
[...sourceStr.matchAll(new RegExp(searchStr, 'gi'))].map(a => a.index)
Using your values:
const sourceStr = 'I learned to play the Ukulele in Lebanon.';
const searchStr = 'le';
const indexes = [...sourceStr.matchAll(new RegExp(searchStr, 'gi'))].map(a => a.index);
console.log(indexes); // [2, 25, 27, 33]
If you're worried about doing a spread and a map() in one line, I ran it with a for...of loop for a million iterations (using your strings). The one liner averages 1420ms while the for...of averages 1150ms on my machine. That's not an insignificant difference, but the one liner will work fine if you're only doing a handful of matches.
See matchAll on caniuse

Here is regex free version:
function indexes(source, find) {
if (!source) {
return [];
}
// if find is empty string return all indexes.
if (!find) {
// or shorter arrow function:
// return source.split('').map((_,i) => i);
return source.split('').map(function(_, i) { return i; });
}
var result = [];
for (i = 0; i < source.length; ++i) {
// If you want to search case insensitive use
// if (source.substring(i, i + find.length).toLowerCase() == find) {
if (source.substring(i, i + find.length) == find) {
result.push(i);
}
}
return result;
}
indexes("I learned to play the Ukulele in Lebanon.", "le")
EDIT: and if you want to match strings like 'aaaa' and 'aa' to find [0, 2] use this version:
function indexes(source, find) {
if (!source) {
return [];
}
if (!find) {
return source.split('').map(function(_, i) { return i; });
}
var result = [];
var i = 0;
while(i < source.length) {
if (source.substring(i, i + find.length) == find) {
result.push(i);
i += find.length;
} else {
i++;
}
}
return result;
}

You sure can do this!
//make a regular expression out of your needle
var needle = 'le'
var re = new RegExp(needle,'gi');
var haystack = 'I learned to play the Ukulele';
var results = new Array();//this is the results you want
while (re.exec(haystack)){
results.push(re.lastIndex);
}
Edit: learn to spell RegExp
Also, I realized this isn't exactly what you want, as lastIndex tells us the end of the needle not the beginning, but it's close - you could push re.lastIndex-needle.length into the results array...
Edit: adding link
#Tim Down's answer uses the results object from RegExp.exec(), and all my Javascript resources gloss over its use (apart from giving you the matched string). So when he uses result.index, that's some sort of unnamed Match Object. In the MDC description of exec, they actually describe this object in decent detail.

I am a bit late to the party (by almost 10 years, 2 months), but one way for future coders is to do it using while loop and indexOf()
let haystack = "I learned to play the Ukulele in Lebanon.";
let needle = "le";
let pos = 0; // Position Ref
let result = []; // Final output of all index's.
let hayStackLower = haystack.toLowerCase();
// Loop to check all occurrences
while (hayStackLower.indexOf(needle, pos) != -1) {
result.push(hayStackLower.indexOf(needle , pos));
pos = hayStackLower.indexOf(needle , pos) + 1;
}
console.log("Final ", result); // Returns all indexes or empty array if not found

If you just want to find the position of all matches I'd like to point you to a little hack:
var haystack = 'I learned to play the Ukulele in Lebanon.',
needle = 'le',
splitOnFound = haystack.split(needle).map(function (culm)
{
return this.pos += culm.length + needle.length
}, {pos: -needle.length}).slice(0, -1); // {pos: ...} – Object wich is used as this
console.log(splitOnFound);
It might not be applikable if you have a RegExp with variable length but for some it might be helpful.
This is case sensitive. For case insensitivity use String.toLowerCase function before.

const findAllOccurrences = (str, substr) => {
str = str.toLowerCase();
let result = [];
let idx = str.indexOf(substr)
while (idx !== -1) {
result.push(idx);
idx = str.indexOf(substr, idx+1);
}
return result;
}
console.log(findAllOccurrences('I learned to play the Ukulele in Lebanon', 'le'));

I would recommend Tim's answer. However, this comment by #blazs states "Suppose searchStr=aaa and that str=aaaaaa. Then instead of finding 4 occurences your code will find only 2 because you're making skips by searchStr.length in the loop.", which is true by looking at Tim's code, specifically this line here: startIndex = index + searchStrLen; Tim's code would not be able to find an instance of the string that's being searched that is within the length of itself. So, I've modified Tim's answer:
function getIndicesOf(searchStr, str, caseSensitive) {
var startIndex = 0, index, indices = [];
if (!caseSensitive) {
str = str.toLowerCase();
searchStr = searchStr.toLowerCase();
}
while ((index = str.indexOf(searchStr, startIndex)) > -1) {
indices.push(index);
startIndex = index + 1;
}
return indices;
}
var searchStr = prompt("Enter a string.");
var str = prompt("What do you want to search for in the string?");
var indices = getIndicesOf(str, searchStr);
document.getElementById("output").innerHTML = indices + "";
<div id="output"></div>
Changing it to + 1 instead of + searchStrLen will allow the index 1 to be in the indices array if I have an str of aaaaaa and a searchStr of aaa.
P.S. If anyone would like comments in the code to explain how the code works, please say so, and I'll be happy to respond to the request.

Here is a simple code snippet:
function getIndexOfSubStr(str, searchToken, preIndex, output) {
var result = str.match(searchToken);
if (result) {
output.push(result.index +preIndex);
str=str.substring(result.index+searchToken.length);
getIndexOfSubStr(str, searchToken, preIndex, output)
}
return output;
}
var str = "my name is 'xyz' and my school name is 'xyz' and my area name is 'xyz' ";
var searchToken ="my";
var preIndex = 0;
console.log(getIndexOfSubStr(str, searchToken, preIndex, []));

Thanks for all the replies. I went through all of them and came up with a function that gives the first an last index of each occurrence of the 'needle' substring . I am posting it here in case it will help someone.
Please note, it is not the same as the original request for only the beginning of each occurrence. It suits my usecase better because you don't need to keep the needle length.
function findRegexIndices(text, needle, caseSensitive){
var needleLen = needle.length,
reg = new RegExp(needle, caseSensitive ? 'gi' : 'g'),
indices = [],
result;
while ( (result = reg.exec(text)) ) {
indices.push([result.index, result.index + needleLen]);
}
return indices
}

Check this solution which will able to find same character string too, let me know if something missing or not right.
function indexes(source, find) {
if (!source) {
return [];
}
if (!find) {
return source.split('').map(function(_, i) { return i; });
}
source = source.toLowerCase();
find = find.toLowerCase();
var result = [];
var i = 0;
while(i < source.length) {
if (source.substring(i, i + find.length) == find)
result.push(i++);
else
i++
}
return result;
}
console.log(indexes('aaaaaaaa', 'aaaaaa'))
console.log(indexes('aeeaaaaadjfhfnaaaaadjddjaa', 'aaaa'))
console.log(indexes('wordgoodwordgoodgoodbestword', 'wordgood'))
console.log(indexes('I learned to play the Ukulele in Lebanon.', 'le'))

Follow the answer of #jcubic, his solution caused a small confusion for my case
For example var result = indexes('aaaa', 'aa') will return [0, 1, 2] instead of [0, 2]
So I updated a bit his solution as below to match my case
function indexes(text, subText, caseSensitive) {
var _source = text;
var _find = subText;
if (caseSensitive != true) {
_source = _source.toLowerCase();
_find = _find.toLowerCase();
}
var result = [];
for (var i = 0; i < _source.length;) {
if (_source.substring(i, i + _find.length) == _find) {
result.push(i);
i += _find.length; // found a subText, skip to next position
} else {
i += 1;
}
}
return result;
}

Here's my code (using search and slice methods)
let s = "I learned to play the Ukulele in Lebanon"
let sub = 0
let matchingIndex = []
let index = s.search(/le/i)
while( index >= 0 ){
matchingIndex.push(index+sub);
sub = sub + ( s.length - s.slice( index+1 ).length )
s = s.slice( index+1 )
index = s.search(/le/i)
}
console.log(matchingIndex)

This is what I usually use to get a string index also according to its position.
I pass following parameters:
search: the string where to search for
find: the string to find
position ('all' by default): the position by which the find string appears in search string
(if 'all' it returns the complete array of indexes)
(if 'last' it returns the last position)
function stringIndex (search, find, position = "all") {
var currIndex = 0, indexes = [], found = true;
while (found) {
var searchIndex = search.indexOf(find);
if (searchIndex > -1) {
currIndex += searchIndex + find.length;
search = search.substr (searchIndex + find.length);
indexes.push (currIndex - find.length);
} else found = false; //no other string to search for - exit from while loop
}
if (position == 'all') return indexes;
if (position > indexes.length -1) return [];
position = (position == "last") ? indexes.length -1 : position;
return indexes[position];
}
//Example:
var myString = "Joe meets Joe and together they go to Joe's house";
console.log ( stringIndex(myString, "Joe") ); //0, 10, 38
console.log ( stringIndex(myString, "Joe", 1) ); //10
console.log ( stringIndex(myString, "Joe", "last") ); //38
console.log ( stringIndex(myString, "Joe", 5) ); //[]

Hi friends this is just another way of finding indexes of matching phrase using reduce and a helper method. Of course RegExp is more convenient and perhaps is internally implemented somehow like this. I hope you find it useful.
function findIndexesOfPhraseWithReduce(text, phrase) {
//convert text to array so that be able to manipulate.
const arrayOfText = [...text];
/* this function takes the array of characters and
the search phrase and start index which comes from reduce method
and calculates the end with length of the given phrase then slices
and joins characters and compare it whith phrase.
and returns True Or False */
function isMatch(array, phrase, start) {
const end = start + phrase.length;
return (array.slice(start, end).join('')).toLowerCase() ===
phrase.toLowerCase();
}
/* here we reduce the array of characters and test each character
with isMach function which takes "current index" and matches the phrase
with the subsequent character which starts from current index and
ends at the last character of phrase(the length of phrase). */
return arrayOfText.reduce((acc, item, index) => isMatch(arrayOfText, phrase,
index) ? [...acc, index] : acc, []);
}
findIndexesOfPhraseWithReduce("I learned to play the Ukulele in Lebanon.", "le");
function findIndexesOfPhraseWithReduce(text, phrase) {
const arrayOfText = [...text];
function isMatch(array, phrase, start) {
const end = start + phrase.length;
return (array.slice(start, end).join('')).toLowerCase() ===
phrase.toLowerCase();
}
return arrayOfText.reduce((acc, item, index) => isMatch(arrayOfText, phrase,
index) ? [...acc, index] : acc, []);
}
console.log(findIndexesOfPhraseWithReduce("I learned to play the Ukulele in Lebanon.", "le"));

function countInString(searchFor,searchIn){
var results=0;
var a=searchIn.indexOf(searchFor)
while(a!=-1){
searchIn=searchIn.slice(a*1+searchFor.length);
results++;
a=searchIn.indexOf(searchFor);
}
return results;
}

the below code will do the job for you :
function indexes(source, find) {
var result = [];
for(i=0;i<str.length; ++i) {
// If you want to search case insensitive use
// if (source.substring(i, i + find.length).toLowerCase() == find) {
if (source.substring(i, i + find.length) == find) {
result.push(i);
}
}
return result;
}
indexes("hello, how are you", "ar")

Use String.prototype.match.
Here is an example from the MDN docs itself:
var str = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
var regexp = /[A-E]/gi;
var matches_array = str.match(regexp);
console.log(matches_array);
// ['A', 'B', 'C', 'D', 'E', 'a', 'b', 'c', 'd', 'e']

Develop Reference

JavaScript is the programming language of the Web.

How to know which group has been found in a RegEx [duplicate] - javascript

Related

How to find an unknown pattern, that is present in multiple strings?

How to get odd and even position characters from a string?

word frequency in javascript

Return url for the first item for which the content part matches pattern

How to find indices of all occurrences of one string in another in JavaScript?

Categories

Resources