Main purpose: I'm trying to scrape data off of around 10,000 different pages using Node.js.
Problem: It scrapes through the first 500~1000 pages very fast and then slows to a crawl (the point at which it slows down varies), and eventually it just seems to be stuck forever.
I'm using the request module in Node.js to make the requests, and I then use cheerio to do the scraping.
This code replicates my problem:
var request = require('request');

// Tallies of requests issued and of completion callbacks received.
var requestsCalledCounter = 0;
var requestsCompletedCounter = 0;
var MAX_REQUESTS = 500;

/**
 * Issues MAX_REQUESTS GET requests back to back, without waiting for any of
 * them to complete. Each completion callback only bumps the completed tally.
 */
var start = function () {
  for (; requestsCalledCounter < MAX_REQUESTS; requestsCalledCounter++) {
    request("http://www.google.com", function () {
      requestsCompletedCounter += 1;
    });
  }
};

start();
Output:
Test 1:
447/500
89.4%
Timed out: No requests completed after 5 seconds
447 Completed
Test 2:
427/500
85.39999999999999%
Timed out: No requests completed after 5 seconds
427
Extra details that might help:
I have an array of URL's that I am going to scrape, so I am looping through them making a request to every URL in the array. It has about 10,000 URL's.
I agree with @cviejo in the comments: you should use an existing project. However, to aid understanding, here is an implementation that will only have 10 requests outstanding at a time.
var request = require('request');

// Tallies of requests issued / completed, and of requests currently in flight.
var requestsCalledCounter = 0;
var requestsCompletedCounter = 0;
var pending = 0;

// At most MAX_PENDING requests are outstanding at once, MAX_REQUESTS in total.
var MAX_PENDING = 10;
var MAX_REQUESTS = 500;

/**
 * Issues a single request and counts it as pending until its callback runs.
 * When the callback runs, the slot is released and `start` is called again so
 * the freed slot is refilled immediately.
 */
var doreq = function () {
  pending++;
  requestsCalledCounter++;
  request("http://www.google.com", function (error, response, html) {
    requestsCompletedCounter++;
    pending--;
    // Refill from the completion callback instead of polling with a 1 ms
    // timer: nothing needs to wake up while all slots are busy.
    start();
  });
};

/**
 * Issues requests until either MAX_PENDING are in flight or MAX_REQUESTS have
 * been issued in total. Safe to call repeatedly; it does nothing when there is
 * no free slot or no work left.
 */
var start = function () {
  while (pending < MAX_PENDING && requestsCalledCounter < MAX_REQUESTS) {
    doreq();
  }
};

start();
Related
I have a blob to construct: I receive almost 100 parts (500k each) that I have to decrypt and assemble into a blob file.
It actually works fine, but the decryption is processor-intensive and freezes my page.
I have tried different approaches (jQuery deferreds, timeouts), but I always get the same problem.
Is there a way to avoid freezing the UI thread?
// Order the parts by their `part` number. Array.prototype.sort reorders
// `blobs` in place and returns that same array.
var parts = blobs.sort(function (left, right) {
  return left.part - right.part;
});

// Final list of decrypted byte arrays, one entry per part, in part order.
// Each decryptBlob call is intensive and runs synchronously.
var byteArrays = parts.map(function (part) {
  return that.decryptBlob(part.blob.b64, fileType);
});

// Create the new blob from all of the decrypted data.
var blob = new Blob(byteArrays, { type: fileType });
The body of the for(...) loop is synchronous, so the entire decryption process is synchronous; in simple words, decryption happens chunk after chunk on the same thread. How about making it asynchronous, i.e. decrypting multiple chunks in parallel? In JavaScript we can use workers for this. Workers run in parallel, so if you spawn 5 workers, for example, the total time is reduced to roughly T / 5 (T = total time in synchronous mode), provided enough CPU cores are available.
Read more about worker threads here :
https://blog.logrocket.com/node-js-multithreading-what-are-worker-threads-and-why-do-they-matter-48ab102f8b10/
Thanks to Sebastian Simon,
I went down the worker route, and it's working fine.
var chunks = [];

// Handler invoked with the message event a worker posts back.
var decryptedChucnkFnc = function (args) {
  // My builder blob job here
};

// Determine the maximum number of workers to use: 5, or fewer when there are
// fewer parts than that.
var maxWorker = 5;
if (totalParts < maxWorker) {
  maxWorker = totalParts;
}

// Keep the workers in an array instead of eval()-generated w0..wN variables:
// same result, no dynamic code evaluation.
var workers = [];
for (var iw = 0; iw < maxWorker; iw++) {
  var wo = new Worker("decryptfile.min.js");
  workers.push(wo);

  // Take the next part off the list for this worker.
  var item = blobs.pop();

  // Attach the reply handler before handing the worker its job.
  wo.onmessage = decryptedChucnkFnc;
  wo.postMessage(MyObjectPassToTheFile);
}
I'm doing some scraping. How can I stay on a page and read its content to search for data every xx seconds without refreshing the page? I do it the way shown below, but the PC crashes after some time. Any ideas on how to make it efficient? I would like to achieve this without using while (true). The readOdds function does not always take the same amount of time.
//...
// Poll forever: snapshot the page markup, hand it to readOdds (result kept in
// `cant`), then wait 5000 ms before taking the next snapshot.
for (;;) {
  cant = await readOdds(await page.content());
  console.info('Waiting 5 seconds to read again...');
  await page.waitFor(5000);
}
this is a section
async function readOdds(htmlPage){
try {
var savedat = functions.mysqlDateTime(new Date());
var pageHtml=htmlPage.replace(/(\r\n|\n|\r)/gm,"");
var exp_text_all = /<coupon-section(.*?)<\/coupon-section>/g;
var leagueLinksMatches = pageHtml.match(exp_text_all);
var cmarkets = 0;
let reset = await mysqlfunctions.promise_updateMarketsCount(cmarkets, table_markets_count, site);
console.log(reset);
if(leagueLinksMatches == null){
return cmarkets;
}
for (let i = 0; i < leagueLinksMatches.length; i++) {
const html = leagueLinksMatches[i];
var expc = /class="title ellipsis-text">(.*?)<\/span/g;
var nameChampionship = functions.getDataInHtmlCode(String(html).match(expc)[0]);
var idChampionship = await mysqlfunctions.promise_db_insert_Championship(nameChampionship, gsport, table_championship);
var exp_text = /<ui-event-line(.*?)<\/ui-event-line>/g;
var text = html.match(exp_text);
// console.info(text.length);
for (let index = 0; index < text.length; index++) {
const element = text[index];
....
Simple Solution with recursive callback
However before we go into that, you can try to run the function itself instead of while which will loop forever without any proper control.
/**
 * One pass of the read loop: fetches the current page markup, passes it to
 * readOdds, stores the result in `cant`, and then starts the next pass by
 * calling itself again.
 */
const readLoop = async () => {
  const markup = await page.content();
  cant = await readOdds(markup);
  // Run the loop again as soon as this pass has finished.
  return readLoop();
};

// Kick off the passes; there is no delay between one pass and the next.
await readLoop();
Which will run the same block function continuously, without any delay, as long as your readOdds function returns. You won't have to use page.waitFor and while.
Memory leak prevention
For advanced cases where you have to respawn over a period of time, a queue like bull and a process manager like PM2 come into play. However, a queue will void the "without refresh the page" part of your question.
You definitely should use pm2 though.
The usage is as follows,
npm i -g pm2
pm2 start index.js --name=myawesomeapp # or your app file
There are few useful arguments,
--max-memory-restart 100M: limits memory usage to 100M; the app is restarted when it goes over that limit.
--max-restarts 50: the app stops being restarted once it has restarted 50 times due to errors (or memory leaks).
You can check the logs using pm2 logs myawesomeapp as you set the name above.
Okay so i've spent the last day trying to figure something out, I'm relatively new to coding so if its a mess I'm sorry. I'm currently working on a bot that requests a JSON, here is the code I have so far
const request = require('request');
const bodyParser = require('body-parser');

global.count = 10;

// Keeps requesting successive pages for as long as global.count is exactly 10.
// global.count is only reassigned inside `callback`, i.e. when a response is
// being handled.
for (var i = 1; global.count === 10; i += 1) {
  var options = {
    // Returns 10 entries per page, so the loop is to navigate pages
    url: 'https://www.the100.io/api/v1/groups/2127/users?page=' + i,
    headers: {
      'Authorization': 'Token token="Hidden for Privacy"'
    }
  };

  // Handles one response: on a successful 200 reply, logs the parsed body and
  // records how many entries the page held.
  function callback(error, response, body) {
    if (error || response.statusCode != 200) {
      return;
    }
    // Also need a way to append to info so it accumulates as the loop progresses
    var info = JSON.parse(body);
    console.log(JSON.stringify(info, null, 1)); // Logs the body
    // Stays at 10 until a page comes back with fewer entries
    global.count = info.length;
  }

  request(options, callback); // Sends request
}
//It just keeps running the loop and doesn't execute the request at the bottom which is what will make the loop terminate, I've tried many things with
//callbacks and nothing has worked so far
I can't seem to make the loop run properly. I didn't want to ask for help, but I'm sad to say I am stuck. Thanks ahead of time.
I think there is some confusion in the question; could you explain it more clearly?
As written, your for loop repeats continuously. Is that what you want, or something else?
As I understand it, the loop should be for navigating the pages (every page contains 10 entries):
global.count = 10;
// Runs the body once for each i from 1 through global.count.
for (var i = 1; i <= global.count; i++) {
  // -- Write your code here --
}
I have written a service to download files from an external partner site. There are around 1000 files of 1 MB each. My process is going out of memory every time I reach around 800 files.
How should I identify the root cause ?
var request = require('sync-request');
var fs = require('graceful-fs')
/**
 * Fetches the feed index and returns the starting URL for one category.
 *
 * The index is requested synchronously, parsed as JSON, and the URL is read
 * from apiGroups.affiliate.apiListings[<category>].availableVariants['v0.1.0'].get.
 *
 * @param {string} xyz_category - Key of the category inside `apiListings`.
 * @returns {*} The value stored under `get` for that category.
 * @throws If the request fails, the body is not valid JSON, or any step of
 *   the lookup path is missing.
 */
function find_starting_url(xyz_category) {
  // Every name below is declared locally; the bare assignments used before
  // created implicit globals (and throw in strict mode).
  var feed_url = "<url>";
  var body = request("GET", feed_url).getBody().toString();
  var listings = JSON.parse(body)['apiGroups']['affiliate']['apiListings'];
  return listings[xyz_category]['availableVariants']['v0.1.0']['get'];
}
/**
 * Downloads one page of the feed, writes the raw response body to
 * $HOME/data/abc/xyz/<feed_category>/<count>.json, and then calls itself for
 * the page named by the response's `nextUrl`, until a page has no `nextUrl`.
 *
 * On any error inside the download/parse/write step the same page is retried,
 * up to 5 retries; after that the error is logged and the function returns.
 *
 * NOTE(review): this is still recursive. Each page is fetched from inside the
 * previous page's call, so no call returns until the whole chain has finished.
 *
 * @param {string} feed_category - Category name; also used in the output path.
 * @param {number} count - Number used for the output file name of this page.
 * @param {string} [next_url] - URL of the page to fetch; when omitted, the
 *   starting URL is looked up with find_starting_url.
 * @param {number} [retry_count] - Retries already spent on this page
 *   (defaults to 0).
 */
function get_all_files(feed_category, count, next_url, retry_count) {
  // Callers normally omit retry_count. Without a numeric default,
  // `retry_count++` produces NaN and `NaN >= 5` is never true, so a failing
  // page would be retried forever.
  var retries = retry_count || 0;
  var headers = {
    'Id': '<my_header>',
    'Token': '<my key>'
  };
  console.log(Date());
  console.log(count);

  // Declared locally (it was an implicit global before).
  var products_url = next_url ? next_url : find_starting_url(feed_category);

  try {
    var products = request("GET", products_url, {"headers": headers}).getBody().toString();
    var parsed = JSON.parse(products);
    var home = process.env.HOME;
    var fd = fs.openSync(home + "/data/abc/xyz/" + feed_category + "/" + count + ".json", 'w');
    try {
      fs.writeSync(fd, products);
    } finally {
      // Release the descriptor even when the write fails.
      fs.closeSync(fd);
    }
    next_url = parsed['nextUrl'];
    count++;
    if (next_url) {
      // A new page starts with a fresh retry budget.
      get_all_files(feed_category, count, next_url);
    }
  } catch (e) {
    if (retries >= 5) {
      console.log("TERRIBLE ENDING!!!", e);
    } else {
      console.log("some error... retrying ..", e);
      get_all_files(feed_category, count, next_url, retries + 1);
    }
  }
}
// The feed category is the first command-line argument after the script path.
var [, , feed_category] = process.argv;
// Start numbering the output files at 1; no URL is passed for the first page.
get_all_files(feed_category, 1);
You're calling a synchronous function recursively so every single request you have and all the data from each request is retained in memory in your local variables until all of the requests are done and all the recursive calls can unwind and then finally free all the sets of local variables. This requires monster amounts of memory (as you have discovered).
It would be best to restructure your code so that the current request is processed, written to disk and then nothing from that request is retained when it goes onto the next request. The simplest way to do that would be to use a while loop instead of a recursive call. In pseudo code:
initialize counter
while (more to do) {
process the next item
increment counter
}
I don't understand the details of what your code is trying to do well enough to propose a rewrite, but hopefully you can see how you can replace the recursion with the type of non-recursive structure above.
It's because you are performing a recursive call to the get_all_files function and it's keeping the body variable in memory for every single execution, since every child execution needs to be completed before the memory is released.
I'm trying to write a performance tool using node.js so I can automate it, and store the results in MySQL. The tool is supposed to gather how long took for the browser to load a particular webpage. I'm using HttpWatch to measure the performance, and the result is displayed in seconds. The browser utilized is Firefox.
Below is a piece of script I'm using to run the performance test:
// Pages to measure.
var MyUrls = [
  "http://google.com",
  "http://yahoo.com"
];

try {
  var win32ole = require('win32ole');
  var control = win32ole.client.Dispatch('HttpWatch.Controller');
  var plugin = control.Firefox.New();

  MyUrls.forEach(function (url) {
    console.log(url);

    // 14 measurement runs per URL.
    for (var run = 0; run < 14; run += 1) {
      // Start recording HTTP traffic
      plugin.Log.EnableFilter(false);

      // Clear cache and cookies before each test
      plugin.ClearCache();
      plugin.ClearAllCookies();
      plugin.ClearSessionCookies();
      plugin.Record();

      // Go to the URL and wait for the page to be loaded
      plugin.GotoURL(url);
      control.Wait(plugin, -1);

      // Stop recording HTTP
      plugin.Stop();

      // Nothing to report when no page was logged.
      if (plugin.Log.Pages.Count == 0) {
        continue;
      }

      // Display summary statistics for the page
      console.log(plugin.Log.Pages(0).Entries.Summary.Time);
    }
  });

  plugin.CloseBrowser();
} catch (e) {
  console.log('*** exception cached ***\n' + e);
}
After the second iteration of the inner loop, I'm getting the following error:
C:\xampp\htdocs\test\browser-perf>node FF-load-navigation.js
http://localhost/NFC-performance/Bing.htm
[Number (VT_R8 or VT_I8 bug?)]
2.718
[Number (VT_R8 or VT_I8 bug?)]
2.718
OLE error: [EnableFilter] -2147352570 [EnableFilter] IDispatch::GetIDsOfNames Au
toWrap() failed
Have someone seen this before? Can you help me?
You have to remember that Node is asynchronous.
So that for loop runs simultaneously with plugin.CloseBrowser();, which is obviously not what you want, because that's causing the browser to close, which will cause problems in the for loop.
Rather, you want that to run after the for loop finishes.
Look at async for a simple way to do this.
// async.each calls the iteratee with (item, callback). The final function runs
// once every item has called back, or as soon as one of them reports an error.
async.each(MyUrls, function (url, callback) {
  // ... run the measurements for `url` here ...
  callback();
}, function (err) {
  plugin.CloseBrowser();
});
The same has to be done for your inner for loop.