Cancel Regex match if timeout - javascript

Is it possible to cancel a regex.match operation if it takes more than 10 seconds to complete?
I'm using a huge regex to match a specific text, and sometimes it works and sometimes it fails...
regex: MINISTÉRIO(?:[^P]*(?:P(?!ÁG\s:\s\d+\/\d+)[^P]*)(?:[\s\S]*?))PÁG\s:\s+\d+\/(\d+)\b(?:\D*(?:(?!\1\/\1)\d\D*)*)\1\/\1(?:[^Z]*(?:Z(?!6:\s\d+)[^Z]*)(?:[\s\S]*?))Z6:\s+\d+
Working example: https://regex101.com/r/kU6rS5/1
So... I want to cancel the operation if it takes more than 10 seconds. Is it possible? I'm not finding anything related on Stack Overflow.
Thanks.

You could spawn a child process that does the regex matching and kill it off if it hasn't completed in 10 seconds. It might be a bit of overkill, but it should work.
fork is probably what you should use if you go down this road.
If you'll forgive my non-pure functions, this code demonstrates the gist of how you could communicate back and forth between the forked child process and your main process:
index.js
const { fork } = require('child_process');

const processPath = __dirname + '/regex-process.js';
const regexProcess = fork(processPath);
let received = null;

regexProcess.on('message', function (data) {
  console.log('received message from child:', data);
  clearTimeout(timeout);
  received = data;
  regexProcess.kill(); // or however you want to end it. just as an example.
  // you have access to the regex data here.
  // send it to a callback, or resolve a promise with the value,
  // so the original calling code can access it as well.
});

const timeoutInMs = 10000;
let timeout = setTimeout(() => {
  if (!received) {
    console.error('regexProcess is still running!');
    regexProcess.kill(); // or however you want to shut it down.
  }
}, timeoutInMs);

regexProcess.send('message to match against');
regex-process.js
function respond(data) {
  process.send(data);
}

function handleMessage(data) {
  console.log('handling message:', data);
  // run your regex calculations in here,
  // then respond with the data when it's done.
  // the following is just to emulate
  // a synchronous computational delay
  for (let i = 0; i < 500000000; i++) {
    // spin!
  }
  respond('return regex process data in here');
}

process.on('message', handleMessage);
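To make this easier to consume from the calling code, you could wrap the fork, the timeout, and the message handling in a single promise-returning helper. This is only a minimal sketch of that idea; the matchWithTimeout name and its signature are my own, not part of the original answer:
const { fork } = require('child_process');

function matchWithTimeout(text, timeoutInMs = 10000) {
  return new Promise((resolve, reject) => {
    const child = fork(__dirname + '/regex-process.js');

    const timer = setTimeout(() => {
      child.kill();
      reject(new Error('regex match timed out'));
    }, timeoutInMs);

    child.on('message', (result) => {
      clearTimeout(timer);
      child.kill();
      resolve(result);
    });

    child.send(text);
  });
}

// usage
matchWithTimeout('message to match against')
  .then((result) => console.log('match result:', result))
  .catch((err) => console.error(err));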
This might just end up masking the real problem, though. You may want to consider reworking your regex like other posters have suggested.

Another solution I found here:
https://www.josephkirwin.com/2016/03/12/nodejs_redos_mitigation/
It's based on the vm module, with no process fork.
That's pretty neat.
const util = require('util');
const vm = require('vm');

var sandbox = {
  regex: /^(A+)*B/,
  string: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC",
  result: null
};

var context = vm.createContext(sandbox);
console.log('Sandbox initialized: ' + vm.isContext(sandbox));

var script = new vm.Script('result = regex.test(string);');

try {
  // One could argue that if a RegExp hasn't finished in a given time,
  // then it's likely it will take exponential time.
  script.runInContext(context, { timeout: 1000 }); // milliseconds
} catch (e) {
  console.log('ReDoS occurred', e); // Take some remedial action here...
}

console.log(util.inspect(sandbox)); // Check the results
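If you need this in more than one place, the same idea can be wrapped into a small helper. This is just a sketch built on the vm approach above; the safeRegexTest name and its signature are my own invention:
const vm = require('vm');

// Returns true/false, or null if the regex did not finish within timeoutMs.
function safeRegexTest(regex, string, timeoutMs) {
  const sandbox = { regex: regex, string: string, result: null };
  const context = vm.createContext(sandbox);
  const script = new vm.Script('result = regex.test(string);');
  try {
    script.runInContext(context, { timeout: timeoutMs });
  } catch (e) {
    return null; // timed out, likely catastrophic backtracking
  }
  return sandbox.result;
}

// usage
console.log(safeRegexTest(/^(A+)*B/, 'AAAAAAAAAAAAAAAAAAAAAAAAAC', 1000));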

Related

Is there a way to limit the execution time of regex evaluation in javascript? [duplicate]


Call hierarchy of async functions inside a loop?

There's an async call I'm making that queries a database on a service, but this service has a limit on how many records it can output at once, so I need to check whether it hit its limit through the result it sends and repeat the query until it doesn't.
Synchronous mockup:
var query_results = [];
var limit_hit = true; // While this is true, the query hit the record limit
var start_from = 0;   // Pagination parameter

while (limit_hit) {
  Server.Query(params = { start_from: start_from }, callback = function (result) {
    limit_hit = result.limit_hit;
    start_from = result.results.length;
    query_results.push(result.results);
  });
}
Obviously the above does not work. I've seen some other questions here about the issue, but they don't mention what to do when you need each iteration to wait for the last one to finish and you don't know beforehand the number of iterations.
How can I turn the above asynchronous? I'm open to answers using promise/deferred-like logic, but preferably something clean.
I can probably think of a monstrous and horrible way of doing this using waits/timeouts, but there has to be a clean, clever and modern way to solve it.
Another way is to make a "pre-query" to know the number of features beforehand so you know the number of loops; I'm not sure if this is the correct way.
Here we use Dojo sometimes, but the examples I found do not explain what to do when you have an unknown number of loops: https://www.sitepen.com/blog/2015/06/10/dojo-faq-how-can-i-sequence-asynchronous-operations/
Although there are many answers already, I still believe async/await is the cleanest way.
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function
And you might need Babel:
https://babeljs.io/
JS async syntax evolved from callbacks to promises and then to async/await, and they all do the same thing. When callbacks nest a lot we need something like a chain, so promises came along; when promises go in a loop, we need something to make the chain flatter and simpler, so async/await came along. But not all browsers support the new syntax, so Babel compiles the new syntax to the old one, and you can always code in the new syntax.
getData().then((data) => {
  // do something with the final data
});

async function getData() {
  var query_results = [];
  var limit_hit = true;
  var start_from = 0;

  // when you use await, handle errors with try/catch
  try {
    while (limit_hit) {
      const result = await loadPage(start_from);
      limit_hit = result.limit_hit;
      start_from = result.results.length;
      query_results.push(result.results);
    }
  } catch (e) {
    // when loadPage rejects
    console.log(e);
    return null;
  }
  return query_results;
}

async function loadPage(start_from) {
  // when you use a promise, handle errors with reject
  return new Promise((resolve, reject) => Server.Query({
    start_from
  }, (result, err) => {
    // error: reject
    if (err) {
      reject(err);
      return;
    }
    resolve(result);
  }));
}
If you want to use a loop then I think there is no (clean) way to do it without Promises.
A different approach would be the following:
var query_results = [];
var start_from = 0;

function myCallback(result) {
  if (!result) {
    // first call
    Server.Query({ start_from: start_from }, myCallback);
  } else {
    // repeated call
    start_from = result.results.length;
    query_results.push(result.results);
    if (!result.limit_hit) {
      // limit has not been hit yet
      // repeat the query with the new start value
      Server.Query({ start_from: start_from }, myCallback);
    } else {
      // call some callback function here
    }
  }
}

myCallback(null);
You could call this recursive, but since the query is asynchronous you shouldn't have problems with call stack limits, etc.
Using promises in an ES6 environment you could make use of async/await. I'm not sure whether this is possible with Dojo.
You don't understand callbacks until you have written a rate limiter or queue ;) The trick is to use a counter: increment the counter before the async request, and decrement it when you get the response; then you will know how many requests are "in flight".
If the server is choked you want to put the item back in the queue.
There are many things you need to take into account:
What will happen to the queue if the process is killed?
How long to wait before sending another request?
Make sure the callback is not called many times!
How many times should you retry?
How long to wait before giving up?
Make sure there are no loose ends! (callback is never called)
When all edge cases are taken into account you will have a rather long and not so elegant solution. But you can abstract it into one function! (that returns a Promise or whatever you fancy).
If you have a user interface you also want to show a loading bar and some statistics!
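As a rough illustration of the counter idea above, here is a minimal sketch of a queue that tracks how many requests are in flight. All names (createQueue, maxInFlight) and the fetch-based usage are assumptions for illustration, and it leaves out the retry and give-up handling listed above:
function createQueue(maxInFlight) {
  const pending = [];   // tasks waiting to run
  let inFlight = 0;     // how many requests are currently running

  function next() {
    if (inFlight >= maxInFlight || pending.length === 0) return;
    const { task, resolve, reject } = pending.shift();
    inFlight++;                 // increment before the async request
    task()
      .then(resolve, reject)
      .finally(() => {
        inFlight--;             // decrement when the response arrives
        next();                 // start the next queued task, if any
      });
  }

  return function enqueue(task) {
    return new Promise((resolve, reject) => {
      pending.push({ task: task, resolve: resolve, reject: reject });
      next();
    });
  };
}

// usage: at most 3 concurrent requests (urls and handleResponse are hypothetical)
const enqueue = createQueue(3);
urls.forEach((url) => enqueue(() => fetch(url)).then(handleResponse));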
You must wait for the server response every time. Here's an encapsulated method:
var query = (function () {
  var results = [];
  var count = 0;
  return function check(fun) {
    Server.Query({ start_from: count }, function (d) {
      count = d.results.length;
      results.push(d.results);
      if (d.limit_hit && fun) fun(results);
      else check(fun);
    });
  };
})();

// Call here
var my_query = query(function (d) {
  // --> retrieve all the data here (when limit_hit is true)
});
You can use a generator function (Generators) to achieve this.
For a POC, some basics:
- You define a generator with an asterisk *
- It exposes a next function which returns the next value
- Generators can pause with the yield statement internally and can be resumed externally by calling next()
- while (true) will ensure that the generator is not done until the limit has been reached
function* limitQueries() {
  let limit_hit = false;
  let start_from = 0;
  const query_result = [];
  while (true) {
    if (limit_hit) { break; }
    yield Server.Query(params = { start_from: start_from },
      callback = function* (result) {
        limit_hit = result.limit_hit;
        start_from = result.results.length;
        yield query_result.push(result.results);
      });
  }
}
The generator function maintains its own state. Each call to next() returns an object with two properties, { value, done }, and you can drive it like this:
const gen = limitQueries();
let results = [];
let next = gen.next();
while (!next.done) {
  next = gen.next();
}
results = next.value;
You might have to touch your Server.Query method to handle a generator callback. Hope this helps! Cheers!
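As written above, the generator never actually receives the async results, because Server.Query's callback cannot resume the generator. A more workable pattern is to yield promises and drive the generator with a small runner. This is only a sketch of that alternative; it assumes Server.Query has been wrapped to return a promise (for example, the loadPage helper from the async/await answer above):
// Drives a generator that yields promises (a tiny co-style runner).
function run(genFn) {
  const gen = genFn();
  return new Promise((resolve, reject) => {
    function step(prev) {
      const { value, done } = gen.next(prev);
      if (done) return resolve(value);
      Promise.resolve(value).then(step, reject);
    }
    step();
  });
}

run(function* () {
  const query_results = [];
  let limit_hit = true;
  let start_from = 0;
  while (limit_hit) {
    // loadPage wraps Server.Query in a promise (see the async/await answer)
    const result = yield loadPage(start_from);
    limit_hit = result.limit_hit;
    start_from = result.results.length;
    query_results.push(result.results);
  }
  return query_results;
}).then((all) => console.log('done', all));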

inserting max num of records in mongodb using nodejs in less time

I'm working on a small task: I need to read a large file (about 1.3 GB) in Node, split it so that every line becomes one record, and insert every record into a MongoDB collection in as little time as possible. Please suggest some code; thanks in advance.
You probably want to read such an amount of data without buffering it all into memory.
Assuming you are dealing with JSON data, I think this might be a feasible approach:
var LineByLineReader = require('line-by-line');

var fileHandler = new LineByLineReader('path/to/file', { encoding: 'utf8', skipEmptyLines: true });
var entries = [];
var bulkSize = 100000; // tweak as needed

fileHandler.on('error', function (err) {
  // process errors here
});

fileHandler.on('line', function (line) {
  entries.push(JSON.parse(line));
  if (entries.length === bulkSize) {
    // pause the handler and write the data
    fileHandler.pause();
    YourCollection.insertMany(entries)
      .then(() => {
        entries = [];
        fileHandler.resume();
      });
  }
});

fileHandler.on('end', function () {
  YourCollection.insertMany(entries)
    .then(() => {
      // everything's done, do your stuff here
    });
});
The line-by-line module seems to be a bit buggy and could be deprecated in the future, so you may want to use linebyline instead.
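If you'd rather avoid a third-party module, the same batching idea can be sketched with Node's built-in readline module and the MongoDB driver's insertMany. Treat this as a rough sketch; collection, the file path, and the batch size are placeholders:
const fs = require('fs');
const readline = require('readline');

async function importFile(collection, path, bulkSize = 100000) {
  const rl = readline.createInterface({
    input: fs.createReadStream(path, { encoding: 'utf8' }),
    crlfDelay: Infinity
  });

  let batch = [];
  for await (const line of rl) {
    if (!line.trim()) continue;          // skip empty lines
    batch.push(JSON.parse(line));        // one record per line
    if (batch.length === bulkSize) {
      await collection.insertMany(batch, { ordered: false });
      batch = [];
    }
  }
  if (batch.length > 0) {
    await collection.insertMany(batch, { ordered: false });
  }
}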

Async request into for loop angular.js

I have an array and I need to send the values of the array to a web service through HTTP POST requests, one by one. In Node.js I'm using the "async" package to do that; for example, async.eachSeries does it well. How can I do the same thing in angular.js? My normal async code:
// this code sends all the queries of the array (maybe 5,000 requests at the same time, which is hard for the web service to process) and waits for all responses.
// it works, but actually, for me, each response should wait for the previous one; the loop should work one by one,
// like the async.eachSeries module!
for (var i = 0; i < myArr.length; i++) {
  (function (i) {
    var data = {
      "myQuery": myArr[i].query
    };
    $http.post("/myServiceUrl", data).success(function (result) {
      console.log(result);
    });
  })(i);
}
Both Matt Way's and Chris L's answers are correct; you can look at Chris's answer to understand how to run async functions sequentially in for loops.
You can use $q to achieve this by chaining promises together. For example:
var chain = $q.when();
angular.forEach(myArr, function (item) {
  chain = chain.then(function () {
    var data = {
      myQuery: item.query
    };
    return $http.post('/myServiceUrl', data).success(function (result) {
      console.log(result);
    });
  });
});

// the final chain object will resolve once all the posts have completed.
chain.then(function () {
  console.log('all done!');
});
Essentially you are just running the next promise once the previous one has completed. Emphasis here on the fact that each request will wait until the previous one has completed, as per your question.
function logResultFromWebService(value) {
  $http.post("/myServiceUrl", value).success(console.log);
}

angular.forEach(myArray, logResultFromWebService);
If I understand your question correctly, you want to run a for loop in a synchronized manner such that the next iteration only occurs once the previous iteration has completed. For that, you can use a synchronized loop with callbacks, especially if the order matters.
var syncLoop = function (iterations, process, exit) {
  var index = 0,
    done = false,
    shouldExit = false;
  var loop = {
    next: function () {
      if (done) {
        if (shouldExit && exit) {
          return exit(); // Exit if we're done
        }
      }
      // If we're not finished
      if (index < iterations) {
        index++; // Increment our index
        process(loop); // Run our process, pass in the loop
        // Otherwise we're done
      } else {
        done = true; // Make sure we say we're done
        if (exit) exit(); // Call the callback on exit
      }
    },
    iteration: function () {
      return index - 1; // Return the loop number we're on
    },
    break: function (end) {
      done = true; // End the loop
      shouldExit = end; // Passing end as true means we still call the exit callback
    }
  };
  console.log('running first time');
  loop.next();
  return loop;
};
For your particular implementation:
syncLoop(myArray.length, function (loop) {
  var index = loop.iteration();
  var data = {
    "myQuery": myArray[index].query
  };
  $http.post("/myServiceUrl", data).success(function (result) {
    console.log(result);
    loop.next();
  });
}, function () {
  console.log('done');
});
If you intend to do something with the data once it's returned (such as performing calculations), you can do so with this method because you will receive the data in a specified order.
I implemented something similar in a statistical calculation web app I built.
EDIT:
To illustrate the problem I had when using $q.when I have set up a fiddle. Hopefully this will help illustrate why I did this the way I did.
https://jsfiddle.net/chrislewispac/6atp3w8o/
Using the following code from Matt's answer:
var chain = $q.when(promise.getResult());
angular.forEach(myArr, function (item) {
  chain = chain.then(function () {
    $rootScope.status = item;
    console.log(item);
  });
});

// the final chain object will resolve once all the posts have completed.
chain.then(function () {
  console.log('all done!');
});
And this fiddle is an example of my solution:
https://jsfiddle.net/chrislewispac/Lgwteone/3/
Compare the $q version to my version. View the console and imagine those being delivered to the user interface for user intervention in the process and/or performing statistical operations on the sequential returns.
You will see that it does not sequentially give the numbers 1, 2, 3, 4, etc., either in the console or in the view in Matt's answer. It 'batches' the responses and then returns them. Therefore, if step 3 is not to be run depending on the response in step 2, there is not, at least in the answer provided, a way to break out or explicitly control the synchronous operation. This presents a significant problem when attempting to perform sequential calculations and/or allow the user to control break points, etc.
Now, I am digging through both the $q libraries and the Q library to see if there is a more elegant solution for this problem. However, my solution does work as requested and is very explicit which allows me to place the function in a service and manipulate for certain use cases at my will because I completely understand what it is doing. For me, that is more important than using a library (at least at this stage in my development as a programmer and I am sure there are lots of other people at the same stage on StackOverflow as well).
If the order in which they are sent doesn't matter:
var items = [/* your array */];
var promises = [];

angular.forEach(items, function (value, key) {
  var promise = $http.post("/myServiceUrl", { "myQuery": value.query });
  promises.push(promise);
});

return $q.all(promises);
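In a modern environment (or via Babel), the same sequential behaviour can be sketched with async/await and a plain for...of loop; $http returns a promise, so it can be awaited directly. This is only a sketch:
async function postSequentially(myArr) {
  const results = [];
  for (const item of myArr) {
    // each POST waits for the previous one to finish
    const response = await $http.post('/myServiceUrl', { myQuery: item.query });
    console.log(response.data);
    results.push(response.data);
  }
  return results;
}

postSequentially(myArr).then(function (all) {
  console.log('all done!', all);
});
Note that if you update scope data after an await, you may still need $scope.$apply (or to stick with $q), since native promise callbacks run outside AngularJS's digest cycle.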

How to use drain event of stream.Writable in Node.js

In Node.js I'm using the fs.createWriteStream method to append data to a local file. In the Node documentation they mention the drain event when using fs.createWriteStream, but I don't understand it.
var stream = fs.createWriteStream('fileName.txt');
var result = stream.write(data);
In the code above, how can I use the drain event? Is the event used properly below?
var data = 'this is my data';
if (!streamExists) {
  var stream = fs.createWriteStream('fileName.txt');
}
var result = stream.write(data);
if (!result) {
  stream.once('drain', function () {
    stream.write(data);
  });
}
The drain event is for when a writable stream's internal buffer has been emptied.
This can only happen when the size of the internal buffer once exceeded its highWaterMark property, which is the maximum bytes of data that can be stored inside a writable stream's internal buffer until it stops reading from the data source.
The cause of something like this can be due to setups that involve reading a data source from one stream faster than it can be written to another resource. For example, take two streams:
var fs = require('fs');
var read = fs.createReadStream('./read');
var write = fs.createWriteStream('./write');
Now imagine that the file read is on a SSD and can read at 500MB/s and write is on a HDD that can only write at 150MB/s. The write stream will not be able to keep up, and will start storing data in the internal buffer. Once the buffer has reached the highWaterMark, which is by default 16KB, the writes will start returning false, and the stream will internally queue a drain. Once the internal buffer's length is 0, then the drain event is fired.
This is how a drain works:
if (state.length === 0 && state.needDrain) {
  state.needDrain = false;
  stream.emit('drain');
}
And these are the prerequisites for a drain which are part of the writeOrBuffer function:
var ret = state.length < state.highWaterMark;
state.needDrain = !ret;
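As a quick experiment, here is a small self-contained sketch (the file name and the deliberately tiny 1 KB highWaterMark are arbitrary choices) in which write() starts returning false once the buffered data passes the threshold, and drain fires once the buffer empties:
const fs = require('fs');

// deliberately small highWaterMark so the buffer fills up quickly
const out = fs.createWriteStream('./drain-demo.txt', { highWaterMark: 1024 });

let writes = 0;
let ok = true;
while (ok) {
  ok = out.write('x'.repeat(256)); // false once buffered bytes exceed the highWaterMark
  writes++;
}
console.log('write() returned false after ' + writes + ' writes');

out.once('drain', function () {
  console.log('internal buffer emptied, safe to write again');
  out.end();
});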
To see how the drain event is used, take the example from the Node.js documentation.
function writeOneMillionTimes(writer, data, encoding, callback) {
  var i = 1000000;
  write();
  function write() {
    var ok = true;
    do {
      i -= 1;
      if (i === 0) {
        // last time!
        writer.write(data, encoding, callback);
      } else {
        // see if we should continue, or wait
        // don't pass the callback, because we're not done yet.
        ok = writer.write(data, encoding);
      }
    } while (i > 0 && ok);
    if (i > 0) {
      // had to stop early!
      // write some more once it drains
      writer.once('drain', write);
    }
  }
}
The function's objective is to write 1,000,000 times to a writable stream. What happens is a variable ok is set to true, and a loop only executes when ok is true. For each loop iteration, the value of ok is set to the value of stream.write(), which will return false if a drain is required. If ok becomes false, then the event handler for drain waits, and on fire, resumes the writing.
Regarding your code specifically, you don't need to use the drain event because you are writing only once right after opening your stream. Since you have not yet written anything to the stream, the internal buffer is empty, and you would have to be writing at least 16KB in chunks in order for the drain event to fire. The drain event is for writing many times with more data than the highWaterMark setting of your writable stream.
Imagine you're connecting 2 streams with very different bandwidths, say, uploading a local file to a slow server. The (fast) file stream will emit data faster than the (slow) socket stream can consume it.
In this situation, node.js will keep data in memory until the slow stream gets a chance to process it. This can get problematic if the file is very large.
To avoid this, Stream.write returns false when the underlying system buffer is full. If you stop writing, the stream will later emit a drain event to indicate that the system buffer has emptied and it is appropriate to write again.
You can use pause/resume on the readable stream to control its bandwidth.
Better: you can use readable.pipe(writable), which will do this for you.
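For completeness, here is a minimal sketch of that piping approach using stream.pipeline, which handles backpressure and cleanup for you (the file paths are placeholders):
const fs = require('fs');
const { pipeline } = require('stream');

// pipeline wires the streams together and handles drain/backpressure internally
pipeline(
  fs.createReadStream('./source-file'),
  fs.createWriteStream('./destination-file'),
  function (err) {
    if (err) {
      console.error('pipeline failed:', err);
    } else {
      console.log('pipeline succeeded');
    }
  }
);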
EDIT: There's a bug in your code: regardless of what write returns, your data has been written. You don't need to retry it. In your case, you're writing data twice.
Something like this would work:
var packets = […],
  current = -1;

function niceWrite() {
  current += 1;
  if (current === packets.length)
    return stream.end();

  var nextPacket = packets[current],
    canContinue = stream.write(nextPacket);

  // wait until the stream drains to continue
  if (!canContinue)
    stream.once('drain', niceWrite);
  else
    niceWrite();
}
Here is a version with async/await
const write = (writer, data) => {
  return new Promise((resolve) => {
    if (!writer.write(data)) {
      writer.once('drain', resolve);
    } else {
      resolve();
    }
  });
};

// usage
const run = async () => {
  const write_stream = fs.createWriteStream('...');
  const max = 1000000;
  let current = 0;
  while (current <= max) {
    // write strings, since write() does not accept plain numbers
    await write(write_stream, String(current++));
  }
};
https://gist.github.com/stevenkaspar/509f792cbf1194f9fb05e7d60a1fbc73
This is a speed-optimized version using promises (async/await). The caller has to check whether it gets a promise back, and only in that case does it have to await it. Awaiting each call can slow down the program by a factor of 3...
const write = (writer, data) => {
  // return a promise only when we get a drain
  if (!writer.write(data)) {
    return new Promise((resolve) => {
      writer.once('drain', resolve);
    });
  }
};

// usage
const run = async () => {
  const write_stream = fs.createWriteStream('...');
  const max = 1000000;
  let current = 0;
  while (current <= max) {
    // write strings, since write() does not accept plain numbers
    const promise = write(write_stream, String(current++));
    // since drain happens rarely, awaiting each write call is really slow.
    if (promise) {
      // a drain is pending, therefore we wait
      await promise;
    }
  }
};
