Split a string at nth occurrence of a regex in javascript - javascript

I know split can get a second parameter as a limit, but it is not what I'm looking for. And I know it can be done by splitting and joining again with a solid string delimiter.
The problem is the delimiter is a regular expression and I don't know the exact length of the pattern that matches.
Consider this string:
this is a title
--------------------------
rest is body! even if there are some dashes!
--------
---------------------
it should not be counted as a separated part!
By using this:
// Splits at every run of two or more dashes that is followed by a newline.
str.split(/---*\n/);
I will get:
[
'this is a title',
'rest is body! even if there are some dashes!',
'',
'it should not be counted as a separated part!'
]
And this is what I wanted to be: (if I want to split by the first occurrence)
[
'this is a title',
'rest is body! even if there are some dashes!\n--------\n---------------------\nit should not be counted as a separated part!'
]
This solution is what I currently have, but it's just for the first occurrence.
/**
 * Splits `str` in two around the first match of `regex`.
 *
 * @param {string} str - Text to split.
 * @param {RegExp|string} regex - Delimiter pattern.
 * @returns {string[]} `[before, after]`, or `[str]` when the delimiter does
 *   not occur (the same shape String.prototype.split gives for no match).
 */
function split(str, regex) {
  // With the `g` flag String#match returns all matches and no `index`,
  // so always search with a non-global copy of the pattern.
  var pattern = regex instanceof RegExp
    ? new RegExp(regex.source, regex.flags.replace('g', ''))
    : new RegExp(regex);
  var match = pattern.exec(str);
  if (match === null) {
    return [str];
  }
  var end = match.index + match[0].length;
  return [str.slice(0, match.index), str.slice(end)];
}
Any ideas on how to generalize the solution for any number n to split the string on the nth occurrence of regex?

// Demo: split the sample around the N-th run of dashes.
var str = "this-----that---these------those";
var N = 2;
// Group 1 lazily spans the first N-1 delimiters plus the text before the
// N-th one; group 2 is everything after the N-th delimiter.
var regex = new RegExp(
  "^((?:[\\s\\S]*?---*){" + (N - 1) + "}[\\s\\S]*?)---*([\\s\\S]*)$"
);
// Indexes 1 and 2 of the match are the two captured parts.
var result = regex.exec(str).slice(1, 3);
console.log(result);
Output:
["this-----that", "these------those"]
jsFiddle
Option with the function:
/**
 * Builds the regex that captures the text before and after the N-th run of
 * two or more dashes.
 *
 * @param {number} N - 1-based index of the dash run to split at.
 * @returns {RegExp} Pattern whose group 1 is the head and group 2 the tail.
 */
var generateRegExp = function (N) {
  var skipped = N - 1;
  var head = "^((?:[\\s\\S]*?---*){" + skipped + "}[\\s\\S]*?)";
  var tail = "---*([\\s\\S]*)$";
  return new RegExp(head + tail);
};
/**
 * Splits `str` in two around the N-th delimiter, using a caller-supplied
 * regex factory.
 *
 * @param {string} str - Text to split.
 * @param {function(number): RegExp} regexGenerator - Returns a regex whose
 *   capture groups 1 and 2 are the parts before and after the N-th delimiter.
 * @param {number} N - 1-based index of the delimiter to split at.
 * @returns {string[]} `[before, after]`, or `[str]` when the generated regex
 *   does not match (for example, fewer than N delimiters).
 */
var getSlice = function (str, regexGenerator, N) {
  var match = regexGenerator(N).exec(str);
  // exec() returns null on no match; calling slice on it would throw.
  if (match === null) {
    return [str];
  }
  return match.slice(1, 3);
};
// Demo: split the sample around the second run of dashes, passing the
// regex factory (generateRegExp) to getSlice.
var str= "this-----that---these------those";
var N= 2;
var result= getSlice(str, generateRegExp, N);
console.log(result);
jsFiddle
Option with the function 2:
/**
 * Splits `str` in two around the N-th match of a delimiter pattern.
 *
 * @param {string} str - Text to split.
 * @param {string|RegExp} regex - Delimiter pattern source. For a RegExp its
 *   `source` is used and its flags are ignored.
 * @param {number} N - 1-based index of the delimiter occurrence to split at.
 * @returns {string[]} `[before, after]`, or `[str]` when the delimiter occurs
 *   fewer than N times.
 */
var getSlice = function (str, regex, N) {
  var source = regex instanceof RegExp ? regex.source : String(regex);
  // Group the delimiter so an alternation such as ",|;" stays one unit
  // instead of splitting the whole generated pattern in two.
  var delimiter = "(?:" + source + ")";
  var re = new RegExp(
    "^((?:[\\s\\S]*?" + delimiter + "){" + (N - 1) + "}[\\s\\S]*?)" +
      delimiter + "([\\s\\S]*)$"
  );
  var match = re.exec(str);
  if (match === null) {
    return [str];
  }
  // The delimiter may contain capture groups of its own, which shift the
  // numbering: the head is always group 1 and the tail is always the last.
  return [match[1], match[match.length - 1]];
};
// Demo: split the sample around the third run of dashes, passing the
// delimiter pattern to getSlice as a string.
var str= "this-----that---these------those";
var N= 3;
var result= getSlice(str, "---*", N);
console.log(result);
jsFiddle

Related

How to get value in $1 in regex to a variable for further manipulation [duplicate]

You can backreference like this in JavaScript:
var str = "123 $test 123";
// In a replacement string, "$2" stands for the text captured by the second group.
str = str.replace(/(\$)([a-z]+)/gi, "$2");
This would (quite silly) replace "$test" with "test". But imagine I'd like to pass the resulting string of $2 into a function, which returns another value. I tried doing this, but instead of getting the string "test", I get "$2". Is there a way to achieve this?
// Instead of getting "$2" passed into somefunc, I want "test"
// (i.e. the result of the regex)
// Note: somefunc("$2") is evaluated before replace() runs, so it receives
// the literal string "$2" and its return value becomes the replacement.
str = str.replace(/(\$)([a-z]+)/gi, somefunc("$2"));
Like this:
// replace() calls the function for each match; with two capture groups the
// arguments are: full match, group 1, group 2, match offset, original string.
str.replace(regex, function(match, $1, $2, offset, original) { return someFunc($2); })
Pass a function as the second argument to replace:
// myReplace is a function declaration, so it is hoisted and can be
// referenced here even though it is written below this line.
str = str.replace(/(\$)([a-z]+)/gi, myReplace);
/**
 * Replacer callback for String.prototype.replace: wraps the second capture
 * group in plus signs.
 *
 * @param {string} str - The full matched text (unused).
 * @param {string} group1 - First capture group (unused).
 * @param {string} group2 - Second capture group.
 * @returns {string} `group2` surrounded by "+".
 */
function myReplace(str, group1, group2) {
  var opened = "+" + group2;
  return opened + "+";
}
This capability has been around since Javascript 1.3, according to mozilla.org.
Using ESNext, quite a dummy links replacer but just to show-case how it works :
// Demo: replace every URL in the text with its last path segment.
let text = 'Visit http://lovecats.com/new-posts/ and https://lovedogs.com/best-dogs NOW !';
text = text.replace(/(https?:\/\/[^ ]+)/g, (match, link) => {
  // remove ending slash if there is one
  const trimmed = link.replace(/\/?$/, '');
  // keep only what follows the last remaining slash
  return trimmed.slice(trimmed.lastIndexOf('/') + 1);
});
// textContent inserts the result as plain text, so nothing derived from the
// URLs can be parsed as HTML.
document.body.textContent = text;
Note: Previous answer was missing some code. It's now fixed + example.
I needed something a bit more flexible for a regex replace to decode the unicode in my incoming JSON data:
// Demo: decode numeric character references such as "&#115;" into characters.
var text = "some string with an encoded '&#115;' in it";
var decoded = text.replace(/&#(\d+);/g, function (match, code) {
  // `code` is capture group 1 (the decimal digits); convert it explicitly.
  return String.fromCharCode(Number(code));
});
// decoded === "some string with an encoded 's' in it"
If you have a variable number of backreferences, then the argument count (and positions) are also variable. The MDN Web Docs describe the following syntax for specifying a function as the replacement argument:
function replacer(match[, p1[, p2[, p...]]], offset, string)
For instance, take these regular expressions:
// Demo: read the capture groups of patterns that have different numbers of
// groups, using a rest parameter in the replacer function.
var searches = [
  'test([1-3]){1,3}', // 1 backreference
  '([Ss]ome) ([A-z]+) chars', // 2 backreferences
  '([Mm][a#]ny) ([Mm][0o]r[3e]) ([Ww][0o]rd[5s])' // 3 backreferences
];
var sample = "Some string chars and many m0re w0rds in this test123";
var backrefsPerSearch = searches.map(function (source) {
  var captured = [];
  sample.replace(new RegExp(source), function (...args) {
    // Argument layout: match, p1..pN, offset, string. (A pattern with named
    // groups would add a trailing `groups` object; none of these has one.)
    var match = args[0];
    var backrefs = args.slice(1, args.length - 2);
    var offset = args[args.length - 2]; // index of the match in `string`
    var string = args[args.length - 1]; // the whole string being searched
    captured = backrefs;
    // Return the match so the replacement leaves the text unchanged.
    return match;
  });
  return captured;
});
// backrefsPerSearch is: [['3'], ['Some', 'string'], ['many', 'm0re', 'w0rds']]
// (a repeated group such as ([1-3]){1,3} keeps only its last repetition)
You can't use the 'arguments' variable here because it is of type Arguments, not of type Array, so it doesn't have a slice() method.

Using simple angular filter to replace all occurences of certain strings in input string regardless of case and whitespaces

On my website I have a commentary field, where people can write whatever they want. To prevent spam and unserious comments, I'm using an angular filter in this way:
<span>{{comment | mouthWash}}</span>
The angular filter fetches an array containing banned words and scans through the input string and replaces all the occurences of the fetched words. The code for the filter is as below:
// Registers the `mouthWash` filter, which replaces banned words with "mooh".
app.filter('mouthWash', function($http) {
  // Stays undefined until the word list request succeeds.
  var badWords;
  $http.get('js/objects/bad-words.json').success(function (list) {
    badWords = list;
  });
  return function(input) {
    angular.forEach(badWords, function(banned) {
      // No flags: only the first, case-sensitive occurrence is replaced.
      var pattern = new RegExp(banned);
      input = input.replace(pattern, "mooh");
    });
    return input;
  };
});
bad-words.json is something like this:
["fuck", "ass", "shit", etc...]
So as an example <span>{{ "fuck this" | mouthWash}}</span> is outputted as <span>mooh this</span>
This is working perfectly, except that I want it to ignore whitespace, to make it more bulletproof. I do not have much experience with regex, so if anyone has a simple solution to this, I would be really grateful.
just change new RegExp(word, "ig"); to new RegExp("ig");
working example:
var words = ['pig', 'dog', '', ' ', 'cow'];
// NOTE: new RegExp("ig") builds the pattern /ig/ (the letters "ig");
// it does not set the i and g flags.
words.forEach(function (entry) {
  var pattern = new RegExp("ig");
  var replaced = entry.replace(pattern, "mooh");
  console.log(replaced);
});
Output:
"pmooh"
"dog"
""
" "
"cow"
This is the code I ended up with:
// Registers the `mouthWash` filter: every banned word in the input, in any
// letter case and with any whitespace between its characters, becomes "mooh".
app.filter('mouthWash', function($http) {
  // Compiled once when the word list arrives, not on every filter call.
  var patterns = [];

  // Escapes regex metacharacters so a character is always matched literally.
  function escapeForRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  // Builds a global, case-insensitive pattern that allows optional whitespace
  // between the word's characters. Returns null for a word with no
  // non-whitespace characters: its pattern would match the empty string and
  // insert "mooh" at every position of the input.
  function toSpacedPattern(word) {
    var letters = String(word).replace(/\s+/g, '').split('');
    if (letters.length === 0) {
      return null;
    }
    var source = letters.map(function (letter) {
      return escapeForRegExp(letter);
    }).join('\\s*');
    return new RegExp(source, 'gi');
  }

  // then() receives the response object; the word list is in response.data.
  $http.get('js/objects/bad-words.json').then(function (response) {
    var compiled = [];
    angular.forEach(response.data, function (word) {
      var pattern = toSpacedPattern(word);
      if (pattern !== null) {
        compiled.push(pattern);
      }
    });
    patterns = compiled;
  });

  return function(input) {
    // Bindings can be undefined before their data resolves; pass those through.
    if (typeof input !== 'string') {
      return input;
    }
    var cleaned = input;
    patterns.forEach(function (pattern) {
      cleaned = cleaned.replace(pattern, 'mooh');
    });
    return cleaned;
  };
});
I created a for loop that loops through every character of the banned word, appending each character followed by \s* (so that whitespace between characters is ignored) to a string.
for (var i = 1; i < word.length - 1; i++) str = str + word.substring(i,i+1)+"\\s*";
Then created a regExp from the string, by using the regExp constructor with the string as first parameter and "gi" as second, to make the regExp global and case insensitive.
var regEx = new RegExp(str, "gi");
Then that regex was used to search through input string and replace all matches with "mooh".

javascript regex pattern with an array

I want to create a regex pattern which should be able to search through an array.
Let's say :
var arr = [ "first", "second", "third" ];
var match = text.match(/<arr>/);
which should be able to match only
<first> or <second> or <third> ......
but should ignore
<ffirst> or <dummy>
I need an efficient approach please .
Any help would be great .
Thanks
First you can do array.map to quote all special regex characters.
Then you can do array.join to join the array elements using | and create an instance of RegExp.
Code:
// Backslash-escapes regex metacharacters so `str` can be embedded literally
// in a pattern.
function quoteSpecial(str) {
  var specials = /([\[\]^$|()\\+*?{}=!.])/g;
  return str.replace(specials, '\\$1');
}
// Escape each word, join the words into one alternation, and wrap the
// alternation in angle brackets.
var arr = ["first", "second", "third", "fo|ur"];
var re = new RegExp(
  '<(?:' +
    arr.map(function (word) {
      return quoteSpecial(word);
    }).join('|') +
    ')>'
);
//=> /<(?:first|second|third|fo\|ur)>/
then use this RegExp object:
// Result of matching each candidate against `re`: an array holding the
// match for listed words, null for anything else.
'<first>'.match(re); // ["<first>"]
'<ffirst>'.match(re); // null
'<dummy>'.match(re); // null
'<second>'.match(re); // ["<second>"]
'<fo|ur>'.match(re); // ["<fo|ur>"]
You should search for a specific word from a list using (a|b|c).
The list is made from the arr by joining the values with | char as glue
var arr = [ "first", "second", "third" ];
// The words are joined unescaped, giving the pattern <(?:first|second|third)>.
var match = text.match(new RegExp("<(?:"+arr.join("|")+")>")); //matches <first> <second> and <third>
Note that if your "source" words might contain characters that are reserved in regular expressions, you might get into trouble, so you may need to escape those characters before joining the array.
A good function for doing so can be found here:
/**
 * Backslash-escapes regular-expression metacharacters in `str`.
 *
 * @param {*} str - Value to escape; converted with String().
 * @param {string} [delimiter] - Extra character to escape as well.
 * @returns {string} The escaped text.
 */
function regexpQuote(str, delimiter) {
  var extra = delimiter || '';
  // The character class ends with the optional delimiter and a literal "-".
  var specials = new RegExp('[.\\\\+*?\\[\\^\\]$(){}=!<>|:\\' + extra + '-]', 'g');
  return String(str).replace(specials, '\\$&');
}
so in this case you'll have
/**
 * Returns a new array in which every item has been escaped with regexpQuote.
 *
 * @param {Array} arr - Words to escape.
 * @returns {string[]} The escaped words, in the same order.
 */
function escapeArray(arr){
  var escaped = [];
  // forEach visits only the array's own elements; for...in would also visit
  // any enumerable property added to the array or to Array.prototype.
  arr.forEach(function (item) {
    // Call with one argument so the index is not taken as `delimiter`.
    escaped.push(regexpQuote(item));
  });
  return escaped;
}
var arr = [ "first", "second", "third" ];
// Escape every word first, then join the words into one alternation.
var pattern = new RegExp("<(?:"+escapeArray(arr).join("|")+")>");
var match = text.match(pattern); //matches <first> <second> and <third>

Regex remove repeated characters from a string by javascript

I have found a way to remove repeated characters from a string using regular expressions.
// Shows the sample string with adjacent duplicate pairs removed.
function RemoveDuplicates() {
  var source = "aaabbbccc";
  // Matches any character that is neither a word character nor whitespace,
  // or any character immediately followed by itself.
  var symbolOrPair = /[^\w\s]|(.)\1/gi;
  alert(source.replace(symbolOrPair, ""));
}
Output: abc
this is working fine.
But if str = "aaabbbccccabbbbcccccc" then output is abcabc.
Is there any way to get only unique characters or remove all duplicates one?
Please let me know if there is any way.
A lookahead like "this, followed by something and this":
var str = "aaabbbccccabbbbcccccc";
// (.)(?=.*\1) matches a character only when the same character occurs again
// later in the string, so every occurrence except the last is removed.
console.log(str.replace(/(.)(?=.*\1)/g, "")); // "abc"
Note that this preserves the last occurrence of each character:
var str = "aabbccxccbbaa";
// Only the final occurrence of each character survives, so the result
// follows the order of last occurrences.
console.log(str.replace(/(.)(?=.*\1)/g, "")); // "xcba"
Without regexes, preserving order:
var str = "aabbccxccbbaa";
// Keep a character only at the index of its first occurrence.
console.log(
  str
    .split("")
    .filter(function (ch, idx, all) {
      return all.indexOf(ch) === idx;
    })
    .join("")
); // "abcx"
This is an old question, but in ES6 we can use Sets. The code looks like this:
var test = 'aaabbbcccaabbbcccaaaaaaaasa';
// A Set keeps one entry per distinct character, in first-seen order.
var result = [...new Set(test)].join('');
console.log(result);

JavaScript - string regex backreferences

You can backreference like this in JavaScript:
var str = "123 $test 123";
// In a replacement string, "$2" stands for the text captured by the second group.
str = str.replace(/(\$)([a-z]+)/gi, "$2");
This would (quite silly) replace "$test" with "test". But imagine I'd like to pass the resulting string of $2 into a function, which returns another value. I tried doing this, but instead of getting the string "test", I get "$2". Is there a way to achieve this?
// Instead of getting "$2" passed into somefunc, I want "test"
// (i.e. the result of the regex)
// Note: somefunc("$2") is evaluated before replace() runs, so it receives
// the literal string "$2" and its return value becomes the replacement.
str = str.replace(/(\$)([a-z]+)/gi, somefunc("$2"));
Like this:
// replace() calls the function for each match; with two capture groups the
// arguments are: full match, group 1, group 2, match offset, original string.
str.replace(regex, function(match, $1, $2, offset, original) { return someFunc($2); })
Pass a function as the second argument to replace:
// myReplace is a function declaration, so it is hoisted and can be
// referenced here even though it is written below this line.
str = str.replace(/(\$)([a-z]+)/gi, myReplace);
/**
 * Replacer callback for String.prototype.replace: wraps the second capture
 * group in plus signs.
 *
 * @param {string} str - The full matched text (unused).
 * @param {string} group1 - First capture group (unused).
 * @param {string} group2 - Second capture group.
 * @returns {string} `group2` surrounded by "+".
 */
function myReplace(str, group1, group2) {
  var opened = "+" + group2;
  return opened + "+";
}
This capability has been around since Javascript 1.3, according to mozilla.org.
Using ESNext, quite a dummy links replacer but just to show-case how it works :
// Demo: replace every URL in the text with its last path segment.
let text = 'Visit http://lovecats.com/new-posts/ and https://lovedogs.com/best-dogs NOW !';
text = text.replace(/(https?:\/\/[^ ]+)/g, (match, link) => {
  // remove ending slash if there is one
  const trimmed = link.replace(/\/?$/, '');
  // keep only what follows the last remaining slash
  return trimmed.slice(trimmed.lastIndexOf('/') + 1);
});
// textContent inserts the result as plain text, so nothing derived from the
// URLs can be parsed as HTML.
document.body.textContent = text;
Note: Previous answer was missing some code. It's now fixed + example.
I needed something a bit more flexible for a regex replace to decode the unicode in my incoming JSON data:
// Demo: decode numeric character references such as "&#115;" into characters.
var text = "some string with an encoded '&#115;' in it";
var decoded = text.replace(/&#(\d+);/g, function (match, code) {
  // `code` is capture group 1 (the decimal digits); convert it explicitly.
  return String.fromCharCode(Number(code));
});
// decoded === "some string with an encoded 's' in it"
If you have a variable number of backreferences, then the argument count (and positions) are also variable. The MDN Web Docs describe the following syntax for specifying a function as the replacement argument:
function replacer(match[, p1[, p2[, p...]]], offset, string)
For instance, take these regular expressions:
// Demo: read the capture groups of patterns that have different numbers of
// groups, using a rest parameter in the replacer function.
var searches = [
  'test([1-3]){1,3}', // 1 backreference
  '([Ss]ome) ([A-z]+) chars', // 2 backreferences
  '([Mm][a#]ny) ([Mm][0o]r[3e]) ([Ww][0o]rd[5s])' // 3 backreferences
];
var sample = "Some string chars and many m0re w0rds in this test123";
var backrefsPerSearch = searches.map(function (source) {
  var captured = [];
  sample.replace(new RegExp(source), function (...args) {
    // Argument layout: match, p1..pN, offset, string. (A pattern with named
    // groups would add a trailing `groups` object; none of these has one.)
    var match = args[0];
    var backrefs = args.slice(1, args.length - 2);
    var offset = args[args.length - 2]; // index of the match in `string`
    var string = args[args.length - 1]; // the whole string being searched
    captured = backrefs;
    // Return the match so the replacement leaves the text unchanged.
    return match;
  });
  return captured;
});
// backrefsPerSearch is: [['3'], ['Some', 'string'], ['many', 'm0re', 'w0rds']]
// (a repeated group such as ([1-3]){1,3} keeps only its last repetition)
You can't use the 'arguments' variable here because it is of type Arguments, not of type Array, so it doesn't have a slice() method.

Categories

Resources