regex to match dots but not when enclosed by nested square brackets

regex to match dots but not when enclosed by nested square brackets - javascript

input
books.copies.[read_by.[p_id="65784"].page=5468].text.[paragraph="20"].letters
the idea is to split the string by dots but ignore those inside square brackets
so after splitting there should be an array
[
'books',
'copies',
'[read_by.[p_id="65784"].page=5468]',
'text',
'[paragraph="20"]',
'letters'
]
I already looked at this answer but it doesn't work with nested square brackets, which is what i need. Also I'm using javascript, so negative lookbehinds are not supported.
Help is much appreciated.
Edit 1: expand example

This can't be done with a regex in JavaScript, because JavaScript regexes are not able to match nested structures. You need to use the good old method: a stack.
// Split `text` on every dot that sits outside square brackets, tracking the
// current bracket nesting depth with a simple counter.
const text = 'books.copies.[read_by.[p_id="65784"].page=5468].text.[paragraph="20"].letters';
const result = [];
let segment = '';
let depth = 0;
for (const ch of text) {
  // A dot at depth 0 ends the current segment and is not copied into it.
  if (ch === '.' && depth === 0) {
    result.push(segment);
    segment = '';
    continue;
  }
  if (ch === '[') {
    depth += 1;
  } else if (ch === ']') {
    depth -= 1;
  }
  segment += ch;
}
// Whatever follows the last top-level dot is the final segment.
result.push(segment);
console.log(result);

You need to write a parser for this since a JavaScript regex does not support regex recursion, nor balanced constructs.
The point of these functions is that they keep a stack (level, openBrackets) of opening delimiters (in your case, it is [) and then check the stack state: if the stack is not empty, the found . is considered inside the brackets, and is thus just appended to the current match. Else, when the stack is empty, the . found is considered outside of brackets, and is thus used to split on (the current value is appended to the output array (result, ret)).
/**
 * Splits a string on every "." that is not enclosed in (possibly nested)
 * square brackets.
 *
 * Empty segments between two top-level dots are kept; an empty segment after
 * the last top-level dot is dropped.
 *
 * @param {string} string - Text to split.
 * @returns {string[]} The segments found between top-level dots.
 */
function splitByDotsOutsideBrackets(string){
  const ret = [];
  let openBrackets = 0;
  let start = 0; // index where the segment currently being scanned begins
  for (let i = 0; i < string.length; i++) {
    const ch = string.charAt(i);
    if (ch === '[') {
      openBrackets++;
    } else if (ch === ']') {
      // Ignore an unmatched "]": letting the counter go negative would stop
      // every later dot from being recognised as a separator.
      if (openBrackets > 0) {
        openBrackets--;
      }
    } else if (ch === '.' && openBrackets === 0) {
      ret.push(string.slice(start, i));
      start = i + 1;
    }
  }
  // Keep the tail only when it is non-empty.
  if (start < string.length) {
    ret.push(string.slice(start));
  }
  return ret;
}
var res = splitByDotsOutsideBrackets('books.copies.[read_by.[p_id="65784"].page=5468].text.[paragraph="20"].letters');
console.log(res);
Or another variation:
/**
 * Breaks `str` apart at each dot that lies outside square brackets.
 * Bracket nesting is tracked with a depth counter that never drops below
 * zero, and empty pieces are not reported.
 *
 * @param {string} str - Text to split.
 * @returns {string[]} Non-empty pieces between top-level dots.
 */
function splitOnDotsOutsideNestedBrackets(str) {
  const pieces = [];
  let depth = 0;
  let begin = 0;
  // Record the pending piece [begin, end) unless it is empty.
  const flush = (end) => {
    if (begin < end) {
      pieces.push(str.slice(begin, end));
    }
  };
  for (let pos = 0; pos < str.length; pos += 1) {
    const ch = str[pos];
    if (ch === '[') {
      depth += 1;
    } else if (ch === ']') {
      if (depth > 0) {
        depth -= 1;
      }
    } else if (ch === '.' && depth === 0) {
      flush(pos);
      begin = pos + 1;
    }
  }
  flush(str.length);
  return pieces;
}
var s = 'books.copies.[read_by.[p_id="65784"].page=5468].text.[paragraph="20"].letters';
console.log(splitOnDotsOutsideNestedBrackets(s))
Adapted from one of my previous answers.

Related

What is the fastest way to count the number of lines in a string in JavaScript?

I'm looking for the most performant way to count the number of lines in a string in JavaScript for a string of any length. I have tested three approaches, but I feel like there may be a faster way that one of you is aware of.
Method 1:
// Split the string on newlines into an array and measure the array length.
// NOTE(review): the alternation tries `\r` before `\r\n`, so a CRLF pair is
// matched as two separators (`\r`, then `\n`) and yields an extra empty
// element — confirm whether `/\r\n|\r|\n/g` was intended.
return string.split(/\r|\r\n|\n/g).length;
Method 2:
// Use match instead of split: count the separators and add one.
// `|| ''` covers the no-match case, where match() returns null.
// NOTE(review): `\r` is tried before `\r\n`, so each CRLF pair produces two
// matches — confirm whether `/\r\n|\r|\n/g` was intended.
return (string.match(/\r|\r\n|\n/g) || '').length + 1;
Method 3:
// Remove every newline character and compare the string lengths; the
// difference is the number of removed characters.
// NOTE(review): this counts characters, not separators, so a CRLF pair
// contributes two to the result.
return string.length - string.replace(/\r|\r\n|\n/g, '').length + 1;

Find the first occurrence of a possible line break character and count this character:
/**
 * Counts lines by locating the first line-break character ('\n' or '\r')
 * and then counting further occurrences of that same character only.
 *
 * @param {string} string - Text to inspect.
 * @returns {number} 1 when no line break is present, otherwise 2 plus the
 *   number of later occurrences of the first break character found.
 */
function countLines(string) {
  const end = string.length;
  let first = 0;
  // Advance to the first '\n' or '\r', if there is one.
  while (first < end && string[first] !== '\n' && string[first] !== '\r') {
    first += 1;
  }
  if (!(first < end)) {
    return 1;
  }
  const chr = string[first];
  let count = 2;
  for (let pos = first + 1; pos < end; pos += 1) {
    if (string[pos] === chr) {
      count += 1;
    }
  }
  return count;
}
const linuxString = 'Some\ntext\nwith\nfive\nlines';
const windowsString = 'Some\r\ntext\r\nwith\r\nfive\r\nlines';
const classicMacString = 'Some\rtext\rwith\rfive\rlines';
console.log(countLines(linuxString));
console.log(countLines(windowsString));
console.log(countLines(classicMacString));

Remove consecutive characters from string until it doesn't have any consecutive characters

If you see two consecutive characters that are the same, you pop them from left to right, until you cannot pop any more characters. Return the resulting string.
let str = "abba"
"abba" - pop the two b's -> "aa"
"aa" - pop the two a's -> ""
return ""
Here's what i have tried so far:
/**
 * Repeatedly removes pairs of equal adjacent characters until none remain.
 *
 * Implemented as a single left-to-right pass with a stack: a character that
 * equals the one on top of the stack cancels it, otherwise it is pushed.
 * Removing a pair exposes the previous character to the next one, so this
 * gives the same result as rescanning the string after every removal.
 *
 * @param {string} str - Input text.
 * @returns {string} The text with all adjacent pairs popped, e.g. "abba" -> "".
 */
function match(str){
  const kept = [];
  for (const ch of str) {
    if (kept.length > 0 && kept[kept.length - 1] === ch) {
      kept.pop();
    } else {
      kept.push(ch);
    }
  }
  return kept.join('');
}
match('abba');
But it replaces one character only. The problem is that if any two consecutive characters match, it needs to remove both of them and log the result (like 'abba' to 'aa'). Then it needs to go over the updated string and do the same thing again (like 'aa' to ''), logging each step, until the returned string can't be changed anymore.
Here's another solution i found:
/**
 * Performs one pass over `str`, keeping only the characters that differ
 * from both of their immediate neighbours.
 *
 * @param {string} str - Input text.
 * @returns {string} The characters that were not part of a run of equals.
 */
function removeAdjacentDuplicates(str) {
  return str
    .split('')
    .filter((ch, idx, all) => ch !== all[idx + 1] && all[idx - 1] !== ch)
    .join('');
}
removeAdjacentDuplicates('abba');
But this iterates one time only. I need this to go on until there's no more consecutive characters. Also It would be great if good time complexity is maintained.

You can use a while loop to continuously loop until the result is equal to the previous result.
/**
 * Single pass: drops every character that equals its left or right
 * neighbour and returns what is left.
 *
 * @param {string} str - Input text.
 * @returns {string} Characters that had no equal neighbour.
 */
function removeAdjacentDuplicates(str) {
  const survivors = [];
  for (let idx = 0; idx < str.length; idx += 1) {
    const ch = str[idx];
    const sameAsNext = ch === str[idx + 1];
    const sameAsPrev = str[idx - 1] === ch;
    if (sameAsNext || sameAsPrev) {
      continue;
    }
    survivors.push(ch);
  }
  return survivors.join('');
}
// Re-apply the pass until it no longer changes the string.
let before = 'abba';
let result = before;
do {
  before = result;
  result = removeAdjacentDuplicates(before);
} while (result !== before);
console.log(result);
If you want to add a limit to the number of pops, you can store the maximum pops in a variable and the number of pops in another (incremented in the loop), then add an expression to the while loop that instructs it not to execute when the number of pops is no longer smaller than the maximum number of pops permitted.
E.g:
/**
 * One pass that removes every character equal to one of its neighbours.
 *
 * @param {string} str - Input text.
 * @returns {string} The remaining characters, in order.
 */
function removeAdjacentDuplicates(str) {
  return str.split('').reduce((acc, ch, idx, all) => {
    const isolated = ch !== all[idx + 1] && all[idx - 1] !== ch;
    return isolated ? acc + ch : acc;
  }, '');
}
let before = 'cabbac';
let result = removeAdjacentDuplicates(before);
const maxPop = 2;
// Starts at 1 because the first pass has already been applied just above.
let pops = 1;
// Keep going while the string still changes and the pass budget is not used up.
for (; pops < maxPop && result !== before; pops += 1) {
  before = result;
  result = removeAdjacentDuplicates(before);
}
console.log(result);

You can use a regular expression to match consecutive characters and keep replacing until the string is unchanged.
/**
 * Deletes every run of two or more identical characters, repeating the
 * replacement until a pass leaves the string unchanged.
 *
 * @param {string} s - Input text.
 * @returns {string} The string once no further run can be removed.
 */
function f(s) {
  let current = s;
  for (;;) {
    const next = current.replace(/(.)\1+/g, '');
    if (next === current) {
      return next;
    }
    current = next;
  }
}
console.log(f("abba"))

How do I look in a string with text inside quotations, but ignoring anything inside brackets using regex?

For example my string is:
var str = 'Hello "Counts1 [ignore1] Counts2 [ignore2] Counts3 [ignore3] Count these too"';
How would I get everything inside the string that is inside quotations ignoring the characters inside the brackets?
So for example a regex to collect: "Counts1", "Counts2", "Counts3", "Count these too"
So far I only got:
var regex = /".*?"/g;
It outputs:
"Counts1 [ignore1] Counts2 [ignore2] Counts3 [ignore3]"

This should do it: /(?<="|\] )[\w ]+(?="| \[)/g
Positive lookbehind (?<="|\] ) makes sure that [\w ]+ has either " or ] to the left.
Positive lookahead (?="| \[) makes sure that [\w ]+ has either " or [ to the right.
Demo

You can also use the following non-regex approach that also supports nested square brackets:
/**
 * Collects the text found inside double-quoted sections of `str`, skipping
 * anything enclosed in (possibly nested) square brackets.
 *
 * Pieces are returned exactly as they appear, including surrounding spaces.
 * Brackets outside of quotes are ignored. Text in a quoted section that is
 * never closed is only reported up to its last opening bracket.
 *
 * @param {string} str - Text to scan.
 * @returns {string[]} The non-empty pieces of quoted, non-bracketed text.
 */
const extract = (str) => {
  const result = [];
  let start = 0; // index where the piece currently being collected begins
  let level = 0; // bracket nesting depth inside the current quoted section
  let inquote = false;
  for (let i = 0; i < str.length; ++i) {
    switch (str[i]) {
      case '"':
        // A quote seen while quoted and outside brackets ends a piece.
        if (inquote && level === 0 && start < i) {
          result.push(str.slice(start, i));
        }
        // Either way the next piece can only begin after this quote.
        start = i + 1;
        inquote = !inquote;
        break;
      case '[':
        if (inquote) {
          // The outermost bracket ends the piece collected so far.
          if (level === 0 && start < i) {
            result.push(str.slice(start, i));
          }
          ++level;
        }
        break;
      case ']':
        if (inquote) {
          if (level > 0) {
            --level;
          }
          // Back at depth 0: the next piece starts after this bracket.
          if (level === 0) {
            start = i + 1;
          }
        }
        break;
    }
  }
  return result;
};
console.log(extract('Hello "Counts1 [ignore1] Counts2 [ignore2] Counts3 [ignore3] Count these too"'));
console.log(extract('Hello "Counts1 [[ignore1] this [2]] Counts2 [ignore2] Counts3 [ignore3] Count these too"'));
console.log(extract('"Counts1 [[ignore1] [2]] Counts2 [ignore2] Count these too" and [junk], and "maybe this, too [not this]"'));

Function to check mathematical expression not working

The Function to check mathematical expression not working.
I debugged this on chrome, and i saw that when it gets to the first pop (stack.pop()!== chars[i]), it returns false, but it shouldn't.
/**
 * Checks that the brackets (), [], {} and <> in `str` are balanced and
 * properly nested. All other characters are ignored.
 *
 * @param {string} str - Expression to validate.
 * @returns {boolean} true when every closing bracket matches the most
 *   recent unclosed opening bracket and nothing is left open.
 */
const smarter_validate = function(str) {
  const closerFor = new Map([
    ['(', ')'],
    ['[', ']'],
    ['{', '}'],
    ['<', '>'],
  ]);
  const closers = new Set(closerFor.values());
  // Stack of the closing characters still expected, innermost last. Storing
  // the expected closer (not the opener) lets a closing character be
  // compared directly against the popped value.
  const expected = [];
  for (const ch of str) {
    if (closerFor.has(ch)) {
      expected.push(closerFor.get(ch));
    } else if (closers.has(ch) && expected.pop() !== ch) {
      // Wrong closer, or a closer with nothing open (pop() gave undefined).
      return false;
    }
  }
  return expected.length === 0;
};
console.log("SMART VALIDATE" + smarter_validate('(3+4[*2{6+8}])'));

You actually have to compare the popped value's corresponding closing character with chars[i], not the popped value itself.
So you need to do
if (stack.length === 0 || lookup[stack.pop()] !== chars[i]) {
Now, when you pop { from the stack, you will look up the corresponding closing character in lookup and compare it with the current closing character.
Alternatively, you can simply push the expected closing character onto the stack so that you don't have to do the lookup during the comparison, like this
stack.push(lookup[chars[i]]);

Regex to remove redundant multiplication/division involving zero or one?

I have an odd problem where I am receiving computer generated equations (as a string) where multiplications/divisions with zero or one and lone zeros are occasionally present. These equations are to be presented to a user in string form.
I know that I can remove these redundant parts of the equation by implementing a kind of parser, but I was curious as to whether a regular expression could be used to remove them.
I came up with the following before I finally gave up on my (quite limited) regex skills:
/([^\+\-]*(?:0|0\*|\*0){1,}[^\+\-]*)|(?:1\*|\*1)/g
It seems to work only if:
there are no non-zero numbers with a zero in them (ie. no 10's,20's,etc.)
there are no negations.
It also doesn't work well with parentheses. Unfortunately parentheses are quite common.
Note that removing the redundant portions stated above may result in redundant parentheses or "zero" parentheses (ie it could turn out like ()*x, which is equivalent to 0*x). The redundant parentheses are not as much of an issue, but I assume the "zero" parentheses could be removed by a second pass similar to the first looking for (). If either of these could be done in the same regex as the one that solves the problem I would be extremely impressed.
So I turn to you regex gurus of Stack Overflow. Can it be done?
Assumptions
The following can be assumed true about the stringified equations:
There are no divisions by zero: equations will not have any occurrence of [expr]/0 or even expressions that evaluate to [expr]/0, such as [expr]/sin(0).
The only operators within the equations themselves are + - * and /.
Minus operator (-) includes both subtraction and negation, although negation is always surrounded by parentheses.
Any operation other than the above (sin,cos, pow, etc.) will appear as a function call. (no ^ %, etc.)
Sample Equation
"(0+(0/0.5+(0+1*cos(p)+0*0+0*sin(p))*cos(k)+(0+1*0+0*1+0*0)*(-sin(k))+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*x+(0+(0+1*cos(p)+0*0+0*sin(p))*sin(k)+(0+1*0+0*1+0*0)*cos(k)+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*y+(0+(0+1*cos(p)+0*0+0*sin(p))*0+(0+1*0+0*1+0*0)*0+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*1)*z)"
Quite cluttered isn't it?

After leaving the comment I couldn't resist having a crack at it :)
Your biggest problem is nested parentheses. Regexes themselves are really bad at handling nested structures. This is a prime example of my mantra "Regular expressions are a tool, not a solution".
With regexes as your tool, you can apply kind of a "leaf-first" (or bottom-up) approach for this tree-structure, that's what I do in the first part while (sEq.match(...)) {...}. After that I can walk through the created array and do some simple text edits.
I've also added that 1*, *1 and /1 are deleted as they similarly don't affect the equation. You could probably expand this to make it smart enough to replace sin(0)/cos(0) with 0 and 1 respectively, then the solution would be even smaller in some cases.
(As mentioned in the comments of the code, this breaks if the equation contains stuff like 5.0*4, because JavaScript regexes don't have lookbehind, so I'm trusting the \b word boundary to do that work for me. Simply adding logic that deletes unnecessary decimals would solve this though. Something like sEq = sEq.replace(/\.0+\b/g, ''); but I don't know if that's necessary for your use-case.) Edit: now fixed, 5.0*4 should remain intact
This is not thoroughly tested though, feedback welcome.
// Regex-driven simplifier. Innermost parenthesised groups are pulled out of
// the equation one layer at a time and replaced by "[n]" placeholders; each
// extracted group is then cleaned of multiplications/divisions by 0 and 1
// and of additions of 0, and finally the placeholders are expanded again.
// Progress is written to the DOM elements with ids 'parts', 'output' and
// 'solution'.
var sEq = "(0+(0/0.5+(0+1*cos(p)+0*0+0*sin(p))*cos(k)+(0+1*0+0*1+0*0)*(-sin(k))+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*x+(0+(0+1*cos(p)+0*0+0*sin(p))*sin(k)+(0+1*0+0*1+0*0)*cos(k)+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*y+(0+(0+1*cos(p)+0*0+0*sin(p))*0+(0+1*0+0*1+0*0)*0+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*1)*z)";
// One entry per extracted group: 'found' is the matched text, 'funct' the
// letters directly in front of the opening parenthesis (may be empty) and
// 'arith' the text between the parentheses.
var aParts = [];
// Start the output log with the untouched equation.
document.getElementById('output').value = sEq + '\n';
while (sEq.match(/\([^()]*\)/)) {
// while there are still "leafs", save them to aParts and replace with
// a reference to their index in aParts, making their parent a new
// "leaf" because it now doesn't contain the round brackets anymore
sEq = sEq.replace(/([a-z]*)\(([^()]*)\)/gi, function(m, f, a) {
var n = aParts.length;
aParts[n] = {
'found':m,
'funct':f,
'arith':a
};
return '[' + n + ']';
});
}
// Groups were stored innermost-first, so a placeholder "[n]" inside
// aParts[i] always refers to an earlier index n < i.
for (var i = 0; i < aParts.length; i++) {
// isolate divisions/multiplications
// (the lookahead splits in front of every + or -, so each piece keeps its sign)
var dms = aParts[i]['arith'].split(/(?=[+-])/);
for (var j = 0; j < dms.length; j++) {
// if the isolated part is multiplied by or divided into 0, replace with 0
// (the leading sign, if any, is preserved via $1)
if (dms[j].match(/([^.]|^)\b0[*\/]|\*0(?!\.?0*[1-9])/)) {
dms[j] = dms[j].replace(/([+-]?).*/, '$1'+'0');
}
// remove /1, *1 and 1*
dms[j] = dms[j].replace(/[\/*]1\b(?!\.0*[1-9])(?:\.0*)?/g, '').replace(/([^.]|^)\b1\*/g, '$1');
}
// join back together, remove 0+, +0 and -0; 0- results in negation
aParts[i]['arith'] = dms.join('').replace(/[+-]0(?!\.?0*[1-9])(?:\.?0*)?/g, '').replace(/([^.]|^)\b0(?:\+|(-))/g, '$1$2');
// if after this the part contains just 0, perpetuate down to further eliminate
// (only for plain groups without a function name; the first later part that
// references this one gets a literal 0 in place of the placeholder)
if (aParts[i]['funct']=='' && aParts[i]['arith']=='0') {
for (var j = i + 1; j < aParts.length; j++) {
if (aParts[j]['arith'].indexOf('[' + i + ']') != -1) {
aParts[j]['arith'] = aParts[j]['arith'].replace('[' + i + ']', '0');
break;
}
}
}
// add back parts previously simplified by replacing [n] with the content of aParts[n]
aParts[i]['arith'] = aParts[i]['arith'].replace(/\[(\d+)\]/g, function (m, n) {
return aParts[parseInt(n)]['funct'] + '(' + aParts[parseInt(n)]['arith'] + ')';
});
// This is just to show the progress of the algorithm
document.getElementById('parts').value += i + '\t' + aParts[i]['found'] + '\n';
// Build a fully expanded snapshot of every part; tmp is filled in index
// order, so the placeholders it expands refer to entries already present.
var tmp = [];
for (var a = 0; a < aParts.length; a++) {
tmp[a] = {
'funct':aParts[a]['funct'],
'arith':aParts[a]['arith'].replace(/\[(\d+)\]/g, function (m, n) {
return tmp[parseInt(n)]['funct'] + '(' + tmp[parseInt(n)]['arith'] + ')';
})
};
}
// some steps didn't change after analysing, only append when significant
if (document.getElementById('output').value.indexOf('\n' + tmp[tmp.length-1]['arith'] + '\n') ==-1)
document.getElementById('output').value += tmp[tmp.length-1]['arith'] + '\n';
}
// The last extracted part is the outermost group; print it as the solution.
document.getElementById('solution').innerHTML =
aParts[aParts.length-1]['funct'] +
'(' + aParts[aParts.length-1]['arith'] + ')';
<h3>Parts isolated:</h3>
<textarea id="parts" rows="10" style="width:100%" wrap="off"></textarea>
<h3>Steps that simplified the equation:</h3>
<textarea id="output" rows="10" style="width:100%" wrap="off"></textarea>
<h3>Solution:</h3>
<pre id="solution"></pre>

I ended up implementing a completely non-regex, recursive approach to the problem. The cleanupEqn() function essentially splits each string by operators (top level parentheses are grouped as a single operand), recursively operates on each sub part, then does another run through on the way back up the function call chain.
Comparing this with funkwurm's regex solution in jsperf shows it is significantly faster (at least in my personal chrome and firefox browsers).
It hasn't been thoroughly tested yet, and I'm sure there could be improvements made so I welcome any feedback.
Stealing funkwurm's snippet display to show my solution:
// Equation string to simplify.
var sEq = "(0+(0/0.5+(0+1*cos(p)+0*0+0*sin(p))*cos(k)+(0+1*0+0*1+0*0)*(-sin(k))+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*x+(0+(0+1*cos(p)+0*0+0*sin(p))*sin(k)+(0+1*0+0*1+0*0)*cos(k)+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*0)*y+(0+(0+1*cos(p)+0*0+0*sin(p))*0+(0+1*0+0*1+0*0)*0+(0+1*(-sin(p))/0.5+0*0+0*cos(p))*1)*z)";
// Operator tokens; read by splitByParen and removeRedundant.
var operators = ['+','-','*','/'];
// Recursion depth counter, incremented and decremented by cleanupEqn and
// used in its log lines.
var level = 0;
// cleanupEqn is a function declaration and therefore hoisted, so it can be
// called here before its definition appears in the file.
var result = cleanupEqn(sEq);
document.getElementById('solution').innerHTML = result;
/**
 * Recursively simplifies an equation string.
 *
 * The string is tokenised with splitByParen and reduced with
 * removeRedundant; each parenthesised token is then simplified through a
 * recursive call, and the token list is reduced once more before being
 * joined back together. Progress is appended to the 'output' and 'parts'
 * elements, and the file-level `level` counter tracks the recursion depth.
 *
 * @param {string} eqn - Equation (or sub-expression) to simplify.
 * @returns {string} The simplified expression.
 */
function cleanupEqn(eqn){
  const parts = removeRedundant(splitByParen(eqn));
  level += 1;
  document.getElementById('output').value += `Level ${level}: Processing ${eqn}\n`;
  parts.forEach(function (part, index) {
    document.getElementById('parts').value += part + '\n';
    if (part.charAt(0) !== '(') {
      return;
    }
    // Simplify the expression between the outer parentheses.
    const inner = cleanupEqn(part.substring(1, part.length - 1));
    // A group that collapsed to zero loses its parentheses.
    parts[index] = inner === '0' ? '0' : '(' + inner + ')';
  });
  // Second reduction pass: simplified children may expose new redundancies.
  removeRedundant(parts);
  document.getElementById('output').value += `Level ${level}: Completed ${eqn}\n${JSON.stringify(parts, null, '\t')}\n`;
  level -= 1;
  return parts.join('');
}
/**
 * Tokenises an expression at its top nesting level.
 *
 * Every operator listed in the file-level `operators` array that appears
 * outside parentheses becomes its own token, every top-level parenthesised
 * group is kept as one token, and the text in between (numbers, names,
 * function names directly before a group) forms the remaining tokens.
 *
 * @param {string} str - Expression to split.
 * @returns {string[]} The tokens in their original order.
 */
function splitByParen(str){
  const pieces = [];
  let depth = 0;
  let pending = 0; // start index of the text not yet emitted as a token
  // Emit the pending text up to (not including) `end`, if there is any.
  const emit = (end) => {
    if (end > pending) {
      pieces.push(str.substring(pending, end));
    }
  };
  for (let pos = 0; pos < str.length; pos++) {
    const ch = str.charAt(pos);
    if (ch === '(') {
      // Text right before a top-level group is a separate token.
      if (depth === 0 && pos > pending) {
        pieces.push(str.substring(pending, pos));
        pending = pos;
      }
      depth++;
      continue;
    }
    if (ch === ')') {
      depth--;
      // Closing the outermost parenthesis completes a group token.
      if (depth === 0) {
        pieces.push(str.substring(pending, pos + 1));
        pending = pos + 1;
      }
      continue;
    }
    if (depth === 0 && operators.indexOf(ch) > -1) {
      emit(pos);
      pieces.push(ch);
      pending = pos + 1;
    }
  }
  // Add the last part
  emit(str.length);
  return pieces;
}
// Simplifies a token list in place by collapsing operations that involve the
// literal tokens '0' and '1', and returns that same array.
// `parts` is treated as a flat list of operand and operator strings
// (presumably the output of splitByParen — verify against callers).
// Each branch splices tokens out of the array and then moves `i` back so the
// loop revisits the position that was just rewritten.
// Reads the file-level `operators` array to tell operators from operands.
function removeRedundant(parts) {
for(var i=0; i < parts.length; i++){
if(parts[i] === '0'){
// A zero at the very start of the list: only the following token matters.
if(i === 0){
switch(parts[i+1]){
case '*':
case '/':
// NOTE(review): this condition repeats the enclosing case labels, so it is
// always true here and the else branch below cannot run.
if(parts[i+1] === '*' || parts[i+1] === '/'){
// "0 <op> x" collapses to a single '0'.
parts.splice(i, 3, '0');
}
else {
parts.splice(i, 2);
}
i--;
break;
case '+':
// "0 +" is dropped entirely.
parts.splice(i, 2);
i--;
break;
case '-':
// Only the zero is dropped; the '-' stays at the front of the list.
parts.splice(i, 1);
i--;
}
}
else {
// A zero further in: decide based on the operator in front of it.
// There is no case for a preceding '/'.
switch(parts[i-1]){
case '*':
// "x * 0", optionally followed by another '*' or '/' operand.
if(parts[i+1] === '*' || parts[i+1] === '/'){
// Check if the prior portion is part of a function call
if(i > 2 && operators.indexOf(parts[i-3]) < 0){
// Check if the next portion is part of a function call (or undefined)
if(i+3 < parts.length && operators.indexOf(parts[i+3]) < 0){
parts.splice(i-3, 6, '0');
i -= 4;
}
else {
parts.splice(i-3, 5, '0');
i -= 4;
}
}
else {
parts.splice(i-2, 4, '0');
i -= 3;
}
}
else {
// Nothing multiplicative follows: replace "x * 0" with '0'.
parts.splice(i-2, 3, '0');
i -= 3;
}
break;
case '+':
case '-':
// "+ 0 <op> x": collapse the product/quotient to '0' and revisit it.
if(parts[i+1] === '*' || parts[i+1] === '/'){
// Check if the next portion is part of a function call (or undefined)
if(i+3 < parts.length && operators.indexOf(parts[i+3]) < 0){
parts.splice(i, 4, '0');
}
else {
parts.splice(i, 3, '0');
}
i--;
}
// NOTE(review): the two branches below perform the same splice and the same
// index adjustment; both remove the preceding operator together with the zero.
else if(parts[i+1] === '+'){
parts.splice(i-1, 2);
i -= 2;
}
else {
parts.splice(i-1, 2);
i -= 2;
}
}
}
}
else if(parts[i] === '1'){
if(i === 0){
switch(parts[i+1]){
case '*':
// Leading "1 *" is dropped.
parts.splice(i, 2);
i--;
break;
case '+':
case '-':
// NOTE(review): inside the '+'/'-' cases parts[i+1] cannot be '*', so this
// body never runs.
if(parts[i+1] === '*'){
parts.splice(i, 2);
i--;
}
}
}
// This switch is not in an else, so it also runs after the i === 0 handling
// above (where parts[i-1] is undefined and no case matches).
switch(parts[i-1]){
case '*':
case '/':
// "x * 1" or "x / 1": drop the operator and the one, unless a '/' follows.
if(parts[i+1] !== '/'){
parts.splice(i-1, 2);
i -= 2;
}
break;
case '+':
case '-':
// "+ 1 * x": drop the "1 *".
if(parts[i+1] === '*'){
parts.splice(i, 2);
i--;
}
}
}
}
return parts;
}
<h3>Parts isolated:</h3>
<textarea id="parts" rows="10" style="width:100%" wrap="off"></textarea>
<h3>Steps that simplified the equation:</h3>
<textarea id="output" rows="10" style="width:100%" wrap="off"></textarea>
<h3>Solution:</h3>
<pre id="solution"></pre>
<script src="new.js"></script>

Develop Reference

JavaScript is the programming language of the Web.

regex to match dots but not when enclosed by nested square brackets - javascript

Related

What is the fastest way to count the number of lines in a string in JavaScript?

Remove consecutive characters from string until it doesn't have any consecutive characters

How do I look in a string with text inside quotations, but ignoring anything inside brackets using regex?

Function to check mathematical expression not working

Regex to remove redundant multiplication/division involving zero or one?

Categories

Resources