I'm writing a downloader with node.js and the request module. Using the stream syntax I'm doing
var r = request(url).pipe(fs.createWriteStream(targetPath));
r.on('error', function(err) { console.log(err); });
r.on('finish', cb);
to download the file, save it and call the callback. However, in almost 50% of the cases the file is either empty or not created at all. No error event is emitted. It seems like the finish event is triggered even though the file wasn't (completely) written yet.
Context: The whole thing is wrapped into async.each calls.
Any clues? Thanks!
You need to close the file before accessing it:
var file = fs.createWriteStream(targetPath);
var r = request(url).pipe(file);
r.on('error', function(err) { console.log(err); });
r.on('finish', function() { file.close(cb) });
Incidentally, if the url replies with any http error (such as a 404 not found), that won't trigger the 'error' event, so you should probably check that separately:
function handleFailure(err) { console.log(err); };
var file = fs.createWriteStream(targetPath);
request(url, function(error, response) {
if (response.statusCode != 200) {
console.log("oops, got a " + response.statusCode);
return
}
// close is async, and you have to wait until close completes otherwise
// you'll (very rarely) not be able to access the file in the callback.
file.on('finish', function() { file.close(cb) });
response.pipe(file).on('error', handleFailure)
file.on('error', handleFailure)
}).on('error', handleFailure);
Related
I am using Node.js to download large files (300 MB) from a server and pipe the response to a file write stream. As far as I understand pipes in Node.js, the data flow is managed by Node and I don't have to consider draining and other events. The issue I face is that the memory usage of the Docker container where my application is running increases by the same amount as the file being downloaded (i.e. it seems the file is being kept in memory). This memory usage persists even when I delete the file in the container. I am attaching the code used for creating the request and piping below for reference. The code runs fine but causes performance issues like huge memory/CPU usage and crashes with an OOM error. I am not able to understand what I am doing wrong.
let req = request({
url: firmwareURL,
maxAttempts: 5,
retryDelay: 5000,
retryStrategy: request.RetryStrategies.HTTPOrNetworkError});
// 1. Perform server request
req.on('response', (res) => {
console.log(methodName, 'Download response statusCode:', res.statusCode);
if (res.statusCode === 200) {
abortOperation = false;
isStarted = "yes";
// 1.1 Create local file stream if the file is found on url and WaterMark paramter, for bigger chunk
// basepath + basefirmware folder + firmware name + file extension
fileStoragePath = `${firmwareDirectory}/${ip}`;
console.log("filestoragepath is",fileStoragePath);
fileName = `${firmwareVersion}.${firmwareURL.split(".").pop()}`;
// temporary store the file
tempFile = `${fileStoragePath}/${fileName}`;
console.log("tempfile is",tempFile);
writestream = fs.createWriteStream(tempFile, {
highWaterMark: Math.pow(2,20 )
}); // for 1mb buffer,can be increased
writestream.on('error', function (err) {
// on error
console.log('Error while creating a file write stream' + err);
abortOperation = true;
isStarted = "no";
_deleteProgressPointer(ip);
});
// 1.2 Get content length of the current response
size = parseInt(res.headers['content-length'], 10);
console.log(methodName, 'File size is:', size);
req.pipe(writestream);
} else {
// 1.3 Ignore next request events on failure
console.log(methodName, 'File not found on server. res.statusCode:', res.statusCode);
abortOperation = true;
isStarted = "no";
_deleteProgressPointer(ip);
}
});
// 3. In case of error ignore next request events
req.on('error', (error) => {
console.log(methodName, 'File not found on server:', error);
abortOperation = true;
isStarted = "no";
_deleteProgressPointer(ip);
});
// 4. After stream is received close the connection
req.on('end', () => {
if (!abortOperation) {
if (null !== writestream) {
writestream.end();
writestream.on('finish', function () {
console.log(methodName, `File successfully downloaded for device ${ip} of firmware version ${firmwareVersion}`);
try {
// file extraction/storage operation
// further check whether the file extension is valid or not
if (ALLOWED_EXTENSION.includes(firmwareURL.split(".").pop())) {
try {
//req.unpipe(writestream);
fileio.removeFile(tempFile); //deleting downloaded file to avoid storage issues
});
console.log("upgrade ended");
return upgradeOp;
} catch (error) {
console.log(`Error while renamining file: ${tempFile}`);
}
} else {
console.log(methodName, ` Not an valid file extension: ${tempFile}`);
fileio.removeFile(tempFile);
console.log(methodName, ` Invalid: ${tempFile} removed`);
}
// delete the progress pointer
_deleteProgressPointer(ip);
} catch (error) {
// delete the progress pointer
_deleteProgressPointer(ip);
console.log(methodName, `Error during read/write operation :${error}`);
}
});
}
The problem is that you are using the requestretry package, which does not really support streaming. It does always call request with a callback and will provide a promise that is resolved with the full response. The request library will read the entire response body when such a callback is provided, which indeed does buffer the complete response in memory. This is not what you want.
I don't see a way to do streaming-only with requestretry, so you should use the request package directly (or, given its deprecation, one of its successor libraries) and handle the retry logic yourself.
I'm trying to scrape images off a page, but the page returns a placeholder src attribute if it isn't fully loaded (it takes about 0.5 seconds to fully load). How would I make request wait?
tried doing
function findCommonMovies(movie, callback){
request('http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all', function (error, response, body) {
if (error){
return
}else{
var $ = cheerio.load(body);
var title = $(".result_text").first().text().split("(")[0].split(" ").join('')
var commonMovies = []
// var endurl = $("a[name=tt] .result_text a").attr("href")
var endurl = $('a[name=tt]').parent().parent().find(".findSection .findList .findResult .result_text a").attr("href");
request('http://www.imdb.com' + endurl, function (err, response, body) {
if (err){
console.log(err)
}else{
setInterval(function(){var $ = cheerio.load(body)}, 2000)
$(".rec_page .rec_item a img").each(function(){
var title = $(this).attr("title")
var image = $(this).attr("src")
commonMovies.push({title: title, image: image})
});
}
callback(commonMovies)
});
}
});
}
findCommonMovies("Gotham", function(common){
console.log(common)
})
Cheerio is not a web browser. It's just a parser of HTML. Which means that the javascript functions which make the async requests are not being executed.
So. You can't do what you want unless you use something that acts as a web browser. Selenium for example adds an API to a lot of web browsers.
You need to download Selenium client and keep running it as long as you want to keep making requests to sites with async content loading.
Also, you are going to need a wrapper based on the language you are using and the webdriver you want. The webdriver is used to add support for different web browsers.
I assume you are using NodeJS or something similar based on js so, here you go.
And be sure to check the API.
Hope to be of some help.
You could also check PhantomJS.
you can set timeout:
var options = {
url : 'http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all',
timeout: 10000 //wait up to 10 seconds for the response.
}
request(options, function(err, response, body){
if (err) {
console.log(err);
}
//do what you want here
}
setTimeout(function, millisecondsToWait) will run the function after waiting the given number of milliseconds.
setTimeout(function(){var $ = cheerio.load(body)}, 2000)
It appears to me like your callback is located in the wrong place and there should be no need for any timer. When request() calls its callback, the whole response is ready so no need for a timer.
Here's the code with the callback in the right place and also changed so that it has an error argument so the caller can propagate and detect errors:
function findCommonMovies(movie, callback){
request('http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all', function (error, response, body) {
if (error) {
callback(error);
return;
} else {
var $ = cheerio.load(body);
var title = $(".result_text").first().text().split("(")[0].split(" ").join('')
var commonMovies = [];
// var endurl = $("a[name=tt] .result_text a").attr("href")
var endurl = $('a[name=tt]').parent().parent().find(".findSection .findList .findResult .result_text a").attr("href");
request('http://www.imdb.com' + endurl, function (err, response, body) {
if (err) {
console.log(err)
callback(err);
} else {
var $ = cheerio.load(body);
$(".rec_page .rec_item a img").each(function(){
var title = $(this).attr("title");
var image = $(this).attr("src");
commonMovies.push({title, image});
});
callback(null, commonMovies);
}
});
}
});
}
findCommonMovies("Gotham", function(err, common) {
if (err) {
console.log(err);
} else {
console.log(common)
}
});
Note: This will access ONLY the HTML markup served by the server for the URLs you request. If those pages have content that is inserted by browser Javascript, that content will not be present in what you get here and no delay will make it appear. That's because cheerio does not run browser Javascript, it JUST parses the HTML that the server originally sends. To run browser Javascript, you need a more complete browser engine than cheerio provides such as PhantomJS that will actually run the page's Javascript.
I am trying to use JQuery in my Node Js Code but it is not working.
Please see my below Node JS code.
var http = require('http'),
fs = require('fs');
fs.readFile('./index.html', function (err, html) {
if (err) {
throw err;
}
http.createServer(function (request, response) {
response.writeHeader(200, {
"Content-Type": "text/html"
});
response.write(html);
response.end();
}).listen(8080);
});
require("jsdom").env("", function (err, window) {
if (err) {
console.error(err);
return;
}
var $ = require("jquery")(window);
$('#div1').append('<div>dd</div>');
});
My index file is served properly. There is a div with the id div1 in my index.html, but I couldn't append a new div into it.
What is the problem? Is there any problem in my code?
In addition, if I define var $ in top of the code and move the $('#div1').append('<div>dd</div>'); code out of the brackets such as
});
$('#div1').append('<div>dd</div>');
var db = require("./db_select");
Node JS throwing an error such as: TypeError: $ is not a function
How can I define a global $ ?
I've changed my code as below because I'm anticipating that it can be a sync problem.
var runHtml = function () {
this.load = function () {
var http = require('http'),
fs = require('fs');
fs.readFile('./index.html', function (err, html) {
if (err) {
throw err;
}
console.log('Node Js is working');
http.createServer(function (request, response) {
response.writeHeader(200, {
"Content-Type": "text/html"
});
response.write(html);
response.end();
}).listen(8080);
});
};
this.createjson = function () {
require("jsdom").env("", function (err, window) {
if (err) {
console.error(err);
return;
}
var $ = require("jquery")(window);
$('#div1').append('<div>dd</div>');
});
};
if (this instanceof runHtml) {
return this.runHtml;
} else {
return new runHtml();
}
};
var runit = new runHtml();
runit.load().createjson();
However, when I run this code, I'm getting an error from the terminal such as: TypeError: Cannot read property 'createjson' of undefined
Why am I getting this error?
As you can see I have a createjson function and I'm trying to call it after the load function using JavaScript chaining!
The terminal actually gave it away. When you get the error TypeError: Cannot read property 'createjson' of undefined, that tells you where to look. In your code, there is a thing which has a property of createjson that, at runtime, is undefined.
In your last code snippet, at the end, you have a line that goes runit.load().createjson();. This is a line that has a thing (the call to the load method) that has a property called createjson. Looking at the rest of that code snippet, we see that load is a method of runit, which is defined by the line above it: var runit = new runHtml();. If we check out the runHtml function, we find that it has a method called load. In this method, there is no return statement, so it returns undefined by default.
Returning to the line runit.load().createjson();, we can now see how it is evaluated: runit is a functional style class and load is a method on runit, which returns undefined. You called the load method on the runit class, which means that the line (aside from what it does in the load function) is the same as undefined.createjson().
Now, in regards to solving the problem, there are a number of different ways to go.
The simplest way may be to simply return the object from the load method. If you go that route, I suggest you beef up your knowledge of the keyword this. You may also want to look into the keywords async and await.
Happy Coding
I'm using Electron's Quick Start project (commit dbef48ee7d072a38724ecfa57601e39d36e9714e) to test exceptions.
In index.html I changed the name of the required module from renderer.js to rendererXXX.js.
require('./renderer.js')
which results in an expected Exception (it is visible in the devtools for that window):
Uncaught Error: Cannot find module './rendererXXX.js'
Now it would be nice if the main process (see main.js) were aware that one renderer process failed. Thus I wrapped the instantiation of the window into a try-catch block
try {
app.on('ready', createWindow)
} catch (e) {
console.log("Exception caught: " + e.message);
} finally {
// nothing yet
}
But I realized, that the Exception is not forwarded to the main-process. So what are typical ways to handle exceptions of renderer processes - is there a way to handle them from the main-process?
EDIT:
I also wrapped the line that loads the index.html into try-catch, but still I can't handle the error:
try {
// and load the index.html of the app.
mainWindow.loadURL(`file://${__dirname}/index.html`)
} catch (e) {
console.log("Exception caught in 'createWindow': " + e.message);
}
Electron windows are rendered in their own process. Because of this there is little if any communication between the main process and render processes. The best you can do is catch errors in the render process and use Electron's IPC module to pass them back to your main process.
In your render process:
var ipc = require('electron').ipcRenderer;
window.onerror = function(error, url, line) {
ipc.send('errorInWindow', error);
};
In your main process:
var ipc = require('electron').ipcMain;
ipc.on('errorInWindow', function(event, data){
console.log(data)
});
Additionally your main process can watch for a limited set of events directly on the window (or on the windows webContents):
window.on('unresponsive', function() {
console.log('window crashed');
});
...
window.webContents.on('did-fail-load', function() {
console.log('window failed load');
});
I had a similar issue where I wanted to log errors to a file from the main process. Here's an addition to the answer already provided by Teak:
var ipc = require('electron').ipcRenderer;
window.onerror = function(error, url, line) {
ipc.send('errorInWindow', error);
};
would work. Just keep in mind that the onerror callback passes 5 arguments, where the last one is the actual Error object.
https://developer.mozilla.org/en-US/docs/Web/API/GlobalEventHandlers/onerror
However, since messages are serialized when sending through IPC, it's not possible to pass the Error object fully since it won't serialize correctly by default. Thus the data needs to be massaged before sending it if you need more error details (such as stack trace etc).
I used the following Is it not possible to stringify an Error using JSON.stringify? for some ideas and the end result was:
var objFromError = function(err, filter, space) {
var plainObject = {};
Object.getOwnPropertyNames(err).forEach(function(key) {
plainObject[key] = err[key];
});
return plainObject;
};
window.onerror = function (msg, url, lineNo, columnNo, error) {
ipcRenderer.send('asynchronous-windowerr', 'main', objFromError(error));
}
Then in main.js:
ipcMain.on('asynchronous-windowerr', function(event, source, err) {
var str = source + ': ';
if(err != null) {
if(err.stack != null) {
str += err.stack;
} else if(err.message != null) {
str += err.message;
}
}
loggerr.appendLogFile(errLogFile, 'err', str);
})
I am using the following node.js code to download documents from some url and save it in the disk.
I want to be informed when the document has been downloaded. I have not seen any callback with pipe. Or, is there any 'end' event that can be captured on completion of the download?
request(some_url_doc).pipe(fs.createWriteStream('xyz.doc'));
Streams are EventEmitters so you can listen to certain events. As you said there is a finish event for request (previously end).
var stream = request(...).pipe(...);
stream.on('finish', function () { ... });
For more information about which events are available you can check the stream documentation page.
Based on the Node.js documentation, http://nodejs.org/api/stream.html#stream_event_finish,
it should handle writableStream's finish event.
var writable = getWriteable();
var readable = getReadable();
readable.pipe(writable);
writable.on('finish', function(){ ... });
Code snippet for piping content from the web via http(s) to the filesystem. As @starbeamrainbowlabs noticed, the 'finish' event does the job.
var tmpFile = "/tmp/somefilename.doc";
var ws = fs.createWriteStream(tmpFile);
ws.on('finish', function() {
// pipe done here, do something with file
});
var client = url.slice(0, 5) === 'https' ? https : http;
client.get(url, function(response) {
return response.pipe(ws);
});
I found a slightly different solution to my problem in this context. Thought it worth sharing.
Most of the examples create read streams from a file. But in my case the read stream has to be created from a JSON string coming from a message pool.
var jsonStream = through2.obj(function(chunk, encoding, callback) {
this.push(JSON.stringify(chunk, null, 4) + '\n');
callback();
});
// message.value --> value/text to write in write.txt
jsonStream.write(JSON.parse(message.value));
var writeStream = sftp.createWriteStream("/path/to/write/write.txt");
//"close" event didn't work for me!
writeStream.on( 'close', function () {
console.log( "- done!" );
sftp.end();
}
);
//"finish" event didn't work for me either!
writeStream.on( 'close', function () {
console.log( "- done!"
sftp.end();
}
);
// finally this worked for me!
jsonStream.on('data', function(data) {
var toString = Object.prototype.toString.call(data);
console.log('type of data:', toString);
console.log( "- file transferred" );
});
jsonStream.pipe( writeStream );
Here's a solution that handles errors in requests and calls a callback after the file is written:
request(opts)
.on('error', function(err){ return callback(err)})
.pipe(fs.createWriteStream(filename))
.on('finish', function (err) {
return callback(err);
});