Shortest regular expression match if already part of another match - javascript

I would like to retrieve the shortest match from a long text where strings are repeated throughout the text. However, matches within text that has already been matched aren't being found.
Here's a simplified version of the issue I'm facing:
Code: "ababc".match(/a.+c/g)
Observed result: ["ababc"]
Expected result: ["ababc", "abc"]
Therefore I'm wondering whether there is an easier way to retrieve the substring "abc" than manually writing recursive code to search within matches.

As mentioned in my comment, you can't do what you want with regex alone.
You gave a simplified example, so I'm not sure how far this will take you, but here is my stab at doing what you are looking for. I have a sneaking suspicion your "a" and "c" characters are not the same, so you will need to modify this accordingly (e.g. pass them as arguments to the function).
function getShortestMatch(str) {
var str = str || '';
var match,
index,
regex,
length,
results = [];
// iterate along the string one character at a time
for (index = 0, length = str.length; index < length; index++) {
// if the current character is 'a' (the beginning part of our substring match)
if (str[index] === 'a') {
// create a new regex that first consumes everything up to
// the starting character. Then matches for everything from there to
// the ending substring char 'c'. It is a lazy match so it will stop
// at the first matched ending char 'c'
regex = new RegExp('^.{' + index + '}(a.+?c)');
match = str.match(regex);
// if there is a match, then push to the results array
if (match && match[1]) {
results.push(match[1]);
}
}
}
// sort the results array ascending (shortest first)
results.sort(function(a,b){
return a.length - b.length;
});
// log all results matched to the console for sake of example
console.log(results);
// return the first (shortest) element
return results[0];
}
Example
getShortestMatch('ababcabbc');
// output showing all results found (from console.log in the function)
["abc", "abbc", "ababc"]
// return value
"abc"
Note: This function does not attempt to find all possible matches to "everything between an 'a' and a 'c'", since your question was about finding the shortest one. If for some reason you want all possible matches to that, then a greedy .+ regex would be thrown into the mix.

Loop through substrings starting from each successive character (using slice), matching against a regexp which is anchored to the start of the string (^), and uses non-greedy matching (?):
const input = "ababc";
const regexp = /^a.+?c/;
const results = [];
for (var i = 0; i < input.length; i++) {
var match = input.slice(i).match(regexp);
if (match) results.push(match[0]);
}
console.log("all results are", results);
var shortest = results.sort((a, b) => a.length - b.length)[0];
console.log("shortest result is", shortest);

This is the answer I went with due to its effectiveness, simplicity and efficiency:
let seq = "us warship";
let source = "The traditional US adversary has also positioned a spy ship off the coast of Delaware and carried out flights near a US Navy warship, concerning American officials.";
let re = new RegExp(`\\b${seq.replace(/\s/g, "\\b.+?\\b")}\\b`, "gi");
let snippet = null;
let matches;
while (matches = re.exec(source)) {
let match = matches[0];
if (!snippet || match.length < snippet.length) {
snippet = match;
}
re.lastIndex -= (match.length - 1);
}
console.log(snippet); // "US Navy warship"
Source: https://stackoverflow.com/a/8236152/1055499

Related

JS Regex returning -1 & 0

I was tasked with the following:
take a string
print each of the vowels on a new line (in order) then...
print each of the consonants on a new line (in order)
The problem I found was with the regex. I originally used...
/[aeiouAEIOU\s]/g
But this would return 0 with a vowel and -1 with a consonant (so everything happened in reverse).
I really struggled to understand why and couldn't for the life of me find the answer. In the end it was simple enough to just invert the string but I want to know why this is happening the way it is. Can anyone help?
let i;
let vowels = /[^aeiouAEIOU\s]/g;
let array = [];
function vowelsAndConsonants(s) {
for(i=0;i<s.length;i++){
//if char is vowel then push to array
if(s[i].search(vowels)){
array.push(s[i]);
}
}
for(i=0;i<s.length;i++){
//if char is cons then push to array
if(!s[i].search(vowels)){
array.push(s[i]);
}
}
for(i=0;i<s.length;i++){
console.log(array[i]);
}
}
vowelsAndConsonants("javascript");
if(vowels.test(s[i])){ which will return true or false if it matches, or
if(s[i].search(vowels) !== -1){ and if(s[i].search(vowels) === -1){
is what you want if you want to fix your code.
-1 is not falsey so your if statement will not function correctly. -1 is what search returns if it doesn't find a match. It has to do this because search() returns the index position of the match, and the index could be anywhere from 0 to Infinity, so only negative numbers are available to indicate non-existent index:
MDN search() reference
Below is a RegEx that matches vowel OR any letter OR other, effectively separating out vowel, consonant, everything else into 3 capture groups. This makes it so you don't need to test character by character and separate them out manually.
Then iterates and pushes them into their respective arrays with a for-of loop.
const consonants = [], vowels = [], other = [];
const str = ";bat cat set rat. let ut cut mut,";
for(const [,cons,vow,etc] of str.matchAll(/([aeiouAEIOU])|([a-zA-Z])|(.)/g))
cons&&consonants.push(cons) || vow&&vowels.push(vow) || typeof etc === 'string'&&other.push(etc)
console.log(
consonants.join('') + '\n' + vowels.join('') + '\n' + other.join('')
)
There are a couple of inbuilt functions available:
let some_string = 'Mary had a little lamb';
let vowels = [...some_string.match(/[aeiouAEIOU\s]/g)];
let consonents = [...some_string.match(/[^aeiouAEIOU\s]/g)];
console.log(vowels);
console.log(consonents);
I think that you don't understand correctly how your regular expression works. In the brackets you have only defined a set of characters you want to match /[^aeiouAEIOU\s]/g and further by using the caret [^]as first in your group, you say that you want it to match everything but the characters in the carets. Sadly you don't provide an example of input and expected output, so I am only guessing, but I thing you could do the following:
let s = "avndexleops";
let keep_vowels = s.replace(/[^aeiouAEIOU\s]/g, '');
console.log(keep_vowels);
let keep_consonants = s.replace(/[aeiouAEIOU\s]/g, '');
console.log(keep_consonants);
Please provide example of expected input and output.
You used:
/[^aeiouAEIOU\s]/g
Instead of:
/[aeiouAEIOU\s]/g
^ means "not", so your REGEX /[^aeiouAEIOU\s]/g counts all the consonants.

Regex extracting multiple matches for string [duplicate]

I'm trying to obtain all possible matches from a string using regex with javascript. It appears that my method of doing this is not matching parts of the string that have already been matched.
Variables:
var string = 'A1B1Y:A1B2Y:A1B3Y:A1B4Z:A1B5Y:A1B6Y:A1B7Y:A1B8Z:A1B9Y:A1B10Y:A1B11Y';
var reg = /A[0-9]+B[0-9]+Y:A[0-9]+B[0-9]+Y/g;
Code:
var match = string.match(reg);
All matched results I get:
A1B1Y:A1B2Y
A1B5Y:A1B6Y
A1B9Y:A1B10Y
Matched results I want:
A1B1Y:A1B2Y
A1B2Y:A1B3Y
A1B5Y:A1B6Y
A1B6Y:A1B7Y
A1B9Y:A1B10Y
A1B10Y:A1B11Y
In my head, I want A1B1Y:A1B2Y to be a match along with A1B2Y:A1B3Y, even though A1B2Y in the string will need to be part of two matches.
Without modifying your regex, you can set it to start matching at the beginning of the second half of the match after each match using .exec and manipulating the regex object's lastIndex property.
var string = 'A1B1Y:A1B2Y:A1B3Y:A1B4Z:A1B5Y:A1B6Y:A1B7Y:A1B8Z:A1B9Y:A1B10Y:A1B11Y';
var reg = /A[0-9]+B[0-9]+Y:A[0-9]+B[0-9]+Y/g;
var matches = [], found;
while (found = reg.exec(string)) {
matches.push(found[0]);
reg.lastIndex -= found[0].split(':')[1].length;
}
console.log(matches);
//["A1B1Y:A1B2Y", "A1B2Y:A1B3Y", "A1B5Y:A1B6Y", "A1B6Y:A1B7Y", "A1B9Y:A1B10Y", "A1B10Y:A1B11Y"]
Demo
As per Bergi's comment, you can also get the index of the last match and increment it by 1 so it instead of starting to match from the second half of the match onwards, it will start attempting to match from the second character of each match onwards:
reg.lastIndex = found.index+1;
Demo
The final outcome is the same. Though, Bergi's update has a little less code and performs slightly faster. =]
You cannot get the direct result from match, but it is possible to produce the result via RegExp.exec and with some modification to the regex:
var regex = /A[0-9]+B[0-9]+Y(?=(:A[0-9]+B[0-9]+Y))/g;
var input = 'A1B1Y:A1B2Y:A1B3Y:A1B4Z:A1B5Y:A1B6Y:A1B7Y:A1B8Z:A1B9Y:A1B10Y:A1B11Y'
var arr;
var results = [];
while ((arr = regex.exec(input)) !== null) {
results.push(arr[0] + arr[1]);
}
I used zero-width positive look-ahead (?=pattern) in order not to consume the text, so that the overlapping portion can be rematched.
Actually, it is possible to abuse replace method to do achieve the same result:
var input = 'A1B1Y:A1B2Y:A1B3Y:A1B4Z:A1B5Y:A1B6Y:A1B7Y:A1B8Z:A1B9Y:A1B10Y:A1B11Y'
var results = [];
input.replace(/A[0-9]+B[0-9]+Y(?=(:A[0-9]+B[0-9]+Y))/g, function ($0, $1) {
results.push($0 + $1);
return '';
});
However, since it is replace, it does extra useless replacement work.
Unfortunately, it's not quite as simple as a single string.match.
The reason is that you want overlapping matches, which the /g flag doesn't give you.
You could use lookahead:
var re = /A\d+B\d+Y(?=:A\d+B\d+Y)/g;
But now you get:
string.match(re); // ["A1B1Y", "A1B2Y", "A1B5Y", "A1B6Y", "A1B9Y", "A1B10Y"]
The reason is that lookahead is zero-width, meaning that it just says whether the pattern comes after what you're trying to match or not; it doesn't include it in the match.
You could use exec to try and grab what you want. If a regex has the /g flag, you can run exec repeatedly to get all the matches:
// using re from above to get the overlapping matches
var m;
var matches = [];
var re2 = /A\d+B\d+Y:A\d+B\d+Y/g; // make another regex to get what we need
while ((m = re.exec(string)) !== null) {
// m is a match object, which has the index of the current match
matches.push(string.substring(m.index).match(re2)[0]);
}
matches == [
"A1B1Y:A1B2Y",
"A1B2Y:A1B3Y",
"A1B5Y:A1B6Y",
"A1B6Y:A1B7Y",
"A1B9Y:A1B10Y",
"A1B10Y:A1B11Y"
];
Here's a fiddle of this in action. Open up the console to see the results
Alternatively, you could split the original string on :, then loop through the resulting array, pulling out the the ones that match when array[i] and array[i+1] both match like you want.

Tokenize a JavaScript String depending on the characters

In JavaScript, let's say I have a String like "23+var-5/422*b".
I want to split this String so that I get [23,+,var,-,5,/,422,*,b].
I want to tokenize it so that I split the string into 3 types of tokens:
Numerical literals, [0-9].
String literals, [A-z].
Operator characters, [-+*/].
So basically, go through the string, and for each "cluster of characters" that share the same class (each with 1 or more characters), convert that into a token.
I could probably use a for loop, comparing each character with each class, and manually create a token every time the current "character class" changes... it would be very tedious and use many variables and loops.
Does anyone know a more elegant (less verbose) way to get there?
A global regexp match will do this for you:
var str = "23+var-5/422*b";
var arr = str.match(/[0-9]+|[a-zA-Z]+|[-+*/]/g); // notice the creation of one token
// per operator (even if consecutive)
However, it simply ignores invalid characters instead of erroring out.
Here's a way to do it using Regex. Obviously the code can be simplified more if you use Underscore.js or CoffeeScript. So here's a longer version using vanilla JS:
var s = "23+var-5/422*b"; // your string
var re1 = /[0-9]/; // Regex for numerals
var re2 = /[a-zA-Z]/; // Regex for roman chars
var re3 = /[-+*\/]/; // Regex you wanted for operators
// Helper function, return true if n none-negative
function nonNegative(n) {
return n >= 0;
}
// helper function: add any none-negative n to array arr
function addNonNegative(n, arr) {
if (nonNegative(n)) {arr.push(n)};
}
// The main function to split string s
function split(s) {
var result = []; // The result array, initialized
// Do while string s is none empty.
while(s.length > 0) {
// The order of indices of regex found
var order = [];
// search for index or which the regex occurs, then if that index is none-negative, add it to the 'order' array
addNonNegative(s.search(re1), order);
addNonNegative(s.search(re2), order);
addNonNegative(s.search(re3), order);
// sort the order array
order = order.sort();
// variables to slice the string s.
// start is always 0. Marks the starting index of the first matched regex
var start = order.shift();
// Marks the starting index of the second matched regex
var end = order.shift(); // end is the second result in order
result.push(s.slice(start, end)); // slice the string s from start to end
// update s so that exclude what was sliced before
s = s.slice(end);
// boundary condition: finally when end is null once all regex have been pulled, set s = ""
if (end == null) {s = ""};
}
return result;
}

Find all matches in a concatenated string of same-length words?

I have a long Javascript string with letters like :
"aapaalaakaaiartaxealpyaaraa"
This string is actually a chained list of 3-letter-words : "aap","aal","aak","aai", "art", "axe","alp", "yaa" and "raa"
In reality I have many of these strings, with different word lengths, and they can be up to 2000 words long, so I need the fastest way to get all the words that start with a certain string. So when searching for all words that start with "aa" it should return :
"aap","aal","aak" and "aai"
Is there a way to do this with a regex ? It's very important that it only matches on each 3-letter word, so matches in between words should not be counted, so "aar" should not be returned, and also not "yaa" or "raa".
The simple way:
var results = [];
for (var i = 0; i < str.length; i += 3) {
if (str.substring(i, i + 2) === "aa") {
results.push(str.substring(i, i + 3));
}
}
Don’t ask whether it’s the fastest – just check whether it’s fast enough, first. :)
How about:
var str = 'aapaalaakaaiartaxealpyaaraa';
var pattern = /^aa/;
var result = str.match(/.{3}/g).filter(function(word) {
return pattern.test(word);
});
console.log(result); //=> ["aap","aal","aak","aai"]
"aapaalaakaaiartaxealpyaaraa".replace(/\w{3}|\w+/g,function(m){return m.match(/^aa/)?m+',':','}).split(',').filter(Boolean)

split string only on first instance of specified character

In my code I split a string based on _ and grab the second item in the array.
var element = $(this).attr('class');
var field = element.split('_')[1];
Takes good_luck and provides me with luck. Works great!
But, now I have a class that looks like good_luck_buddy. How do I get my javascript to ignore the second _ and give me luck_buddy?
I found this var field = element.split(new char [] {'_'}, 2); in a c# stackoverflow answer but it doesn't work. I tried it over at jsFiddle...
Use capturing parentheses:
'good_luck_buddy'.split(/_(.*)/s)
['good', 'luck_buddy', ''] // ignore the third element
They are defined as
If separator contains capturing parentheses, matched results are returned in the array.
So in this case we want to split at _.* (i.e. split separator being a sub string starting with _) but also let the result contain some part of our separator (i.e. everything after _).
In this example our separator (matching _(.*)) is _luck_buddy and the captured group (within the separator) is lucky_buddy. Without the capturing parenthesis the luck_buddy (matching .*) would've not been included in the result array as it is the case with simple split that separators are not included in the result.
We use the s regex flag to make . match on newline (\n) characters as well, otherwise it would only split to the first newline.
What do you need regular expressions and arrays for?
myString = myString.substring(myString.indexOf('_')+1)
var myString= "hello_there_how_are_you"
myString = myString.substring(myString.indexOf('_')+1)
console.log(myString)
I avoid RegExp at all costs. Here is another thing you can do:
"good_luck_buddy".split('_').slice(1).join('_')
With help of destructuring assignment it can be more readable:
let [first, ...rest] = "good_luck_buddy".split('_')
rest = rest.join('_')
A simple ES6 way to get both the first key and remaining parts in a string would be:
const [key, ...rest] = "good_luck_buddy".split('_')
const value = rest.join('_')
console.log(key, value) // good, luck_buddy
Nowadays String.prototype.split does indeed allow you to limit the number of splits.
str.split([separator[, limit]])
...
limit Optional
A non-negative integer limiting the number of splits. If provided, splits the string at each occurrence of the specified separator, but stops when limit entries have been placed in the array. Any leftover text is not included in the array at all.
The array may contain fewer entries than limit if the end of the string is reached before the limit is reached.
If limit is 0, no splitting is performed.
caveat
It might not work the way you expect. I was hoping it would just ignore the rest of the delimiters, but instead, when it reaches the limit, it splits the remaining string again, omitting the part after the split from the return results.
let str = 'A_B_C_D_E'
const limit_2 = str.split('_', 2)
limit_2
(2) ["A", "B"]
const limit_3 = str.split('_', 3)
limit_3
(3) ["A", "B", "C"]
I was hoping for:
let str = 'A_B_C_D_E'
const limit_2 = str.split('_', 2)
limit_2
(2) ["A", "B_C_D_E"]
const limit_3 = str.split('_', 3)
limit_3
(3) ["A", "B", "C_D_E"]
This solution worked for me
var str = "good_luck_buddy";
var index = str.indexOf('_');
var arr = [str.slice(0, index), str.slice(index + 1)];
//arr[0] = "good"
//arr[1] = "luck_buddy"
OR
var str = "good_luck_buddy";
var index = str.indexOf('_');
var [first, second] = [str.slice(0, index), str.slice(index + 1)];
//first = "good"
//second = "luck_buddy"
You can use the regular expression like:
var arr = element.split(/_(.*)/)
You can use the second parameter which specifies the limit of the split.
i.e:
var field = element.split('_', 1)[1];
Replace the first instance with a unique placeholder then split from there.
"good_luck_buddy".replace(/\_/,'&').split('&')
["good","luck_buddy"]
This is more useful when both sides of the split are needed.
I need the two parts of string, so, regex lookbehind help me with this.
const full_name = 'Maria do Bairro';
const [first_name, last_name] = full_name.split(/(?<=^[^ ]+) /);
console.log(first_name);
console.log(last_name);
Non-regex solution
I ran some benchmarks, and this solution won hugely:1
str.slice(str.indexOf(delim) + delim.length)
// as function
function gobbleStart(str, delim) {
return str.slice(str.indexOf(delim) + delim.length);
}
// as polyfill
String.prototype.gobbleStart = function(delim) {
return this.slice(this.indexOf(delim) + delim.length);
};
Performance comparison with other solutions
The only close contender was the same line of code, except using substr instead of slice.
Other solutions I tried involving split or RegExps took a big performance hit and were about 2 orders of magnitude slower. Using join on the results of split, of course, adds an additional performance penalty.
Why are they slower? Any time a new object or array has to be created, JS has to request a chunk of memory from the OS. This process is very slow.
Here are some general guidelines, in case you are chasing benchmarks:
New dynamic memory allocations for objects {} or arrays [] (like the one that split creates) will cost a lot in performance.
RegExp searches are more complicated and therefore slower than string searches.
If you already have an array, destructuring arrays is about as fast as explicitly indexing them, and looks awesome.
Removing beyond the first instance
Here's a solution that will slice up to and including the nth instance. It's not quite as fast, but on the OP's question, gobble(element, '_', 1) is still >2x faster than a RegExp or split solution and can do more:
/*
`gobble`, given a positive, non-zero `limit`, deletes
characters from the beginning of `haystack` until `needle` has
been encountered and deleted `limit` times or no more instances
of `needle` exist; then it returns what remains. If `limit` is
zero or negative, delete from the beginning only until `-(limit)`
occurrences or less of `needle` remain.
*/
function gobble(haystack, needle, limit = 0) {
let remain = limit;
if (limit <= 0) { // set remain to count of delim - num to leave
let i = 0;
while (i < haystack.length) {
const found = haystack.indexOf(needle, i);
if (found === -1) {
break;
}
remain++;
i = found + needle.length;
}
}
let i = 0;
while (remain > 0) {
const found = haystack.indexOf(needle, i);
if (found === -1) {
break;
}
remain--;
i = found + needle.length;
}
return haystack.slice(i);
}
With the above definition, gobble('path/to/file.txt', '/') would give the name of the file, and gobble('prefix_category_item', '_', 1) would remove the prefix like the first solution in this answer.
Tests were run in Chrome 70.0.3538.110 on macOSX 10.14.
Use the string replace() method with a regex:
var result = "good_luck_buddy".replace(/.*?_/, "");
console.log(result);
This regex matches 0 or more characters before the first _, and the _ itself. The match is then replaced by an empty string.
Javascript's String.split unfortunately has no way of limiting the actual number of splits. It has a second argument that specifies how many of the actual split items are returned, which isn't useful in your case. The solution would be to split the string, shift the first item off, then rejoin the remaining items::
var element = $(this).attr('class');
var parts = element.split('_');
parts.shift(); // removes the first item from the array
var field = parts.join('_');
Here's one RegExp that does the trick.
'good_luck_buddy' . split(/^.*?_/)[1]
First it forces the match to start from the
start with the '^'. Then it matches any number
of characters which are not '_', in other words
all characters before the first '_'.
The '?' means a minimal number of chars
that make the whole pattern match are
matched by the '.*?' because it is followed
by '_', which is then included in the match
as its last character.
Therefore this split() uses such a matching
part as its 'splitter' and removes it from
the results. So it removes everything
up till and including the first '_' and
gives you the rest as the 2nd element of
the result. The first element is "" representing
the part before the matched part. It is
"" because the match starts from the beginning.
There are other RegExps that work as
well like /_(.*)/ given by Chandu
in a previous answer.
The /^.*?_/ has the benefit that you
can understand what it does without
having to know about the special role
capturing groups play with replace().
if you are looking for a more modern way of doing this:
let raw = "good_luck_buddy"
raw.split("_")
.filter((part, index) => index !== 0)
.join("_")
Mark F's solution is awesome but it's not supported by old browsers. Kennebec's solution is awesome and supported by old browsers but doesn't support regex.
So, if you're looking for a solution that splits your string only once, that is supported by old browsers and supports regex, here's my solution:
String.prototype.splitOnce = function(regex)
{
var match = this.match(regex);
if(match)
{
var match_i = this.indexOf(match[0]);
return [this.substring(0, match_i),
this.substring(match_i + match[0].length)];
}
else
{ return [this, ""]; }
}
var str = "something/////another thing///again";
alert(str.splitOnce(/\/+/)[1]);
For beginner like me who are not used to Regular Expression, this workaround solution worked:
var field = "Good_Luck_Buddy";
var newString = field.slice( field.indexOf("_")+1 );
slice() method extracts a part of a string and returns a new string and indexOf() method returns the position of the first found occurrence of a specified value in a string.
This should be quite fast
function splitOnFirst (str, sep) {
const index = str.indexOf(sep);
return index < 0 ? [str] : [str.slice(0, index), str.slice(index + sep.length)];
}
console.log(splitOnFirst('good_luck', '_')[1])
console.log(splitOnFirst('good_luck_buddy', '_')[1])
This worked for me on Chrome + FF:
"foo=bar=beer".split(/^[^=]+=/)[1] // "bar=beer"
"foo==".split(/^[^=]+=/)[1] // "="
"foo=".split(/^[^=]+=/)[1] // ""
"foo".split(/^[^=]+=/)[1] // undefined
If you also need the key try this:
"foo=bar=beer".split(/^([^=]+)=/) // Array [ "", "foo", "bar=beer" ]
"foo==".split(/^([^=]+)=/) // [ "", "foo", "=" ]
"foo=".split(/^([^=]+)=/) // [ "", "foo", "" ]
"foo".split(/^([^=]+)=/) // [ "foo" ]
//[0] = ignored (holds the string when there's no =, empty otherwise)
//[1] = hold the key (if any)
//[2] = hold the value (if any)
a simple es6 one statement solution to get the first key and remaining parts
let raw = 'good_luck_buddy'
raw.split('_')
.reduce((p, c, i) => i === 0 ? [c] : [p[0], [...p.slice(1), c].join('_')], [])
You could also use non-greedy match, it's just a single, simple line:
a = "good_luck_buddy"
const [,g,b] = a.match(/(.*?)_(.*)/)
console.log(g,"and also",b)

Categories

Resources