I have written some javascript that goes to a page and returns the data from the site, however, I would like to get specific elements off this html site and use functions like document.getElementById. How can I use that sort of functionality here? Currently, the console.log(chunk) simply spits out the entire body of html, I want to be able to parse that.
var http = require("http");
var options = {
host: 'www.google.com',
port: 80,
path: '/news'
};
http.get(options, function(res) {
res.setEncoding('utf8');
res.on('data', function(chunk){
console.log(chunk);
});
}).on('error', function(e) {
console.log("Got error: " + e.message);
});
There are many npm modules to perform the same, here are some
1.cheerio
2.jsdom
Related
I try to get text content from the webpage. For example Google.com
I write at console:
$ ('#SIvCob').innerText
and get:
"Google offered in: русский"
This is the text, what I find out. Now I want to save it to file (.txt).
Two moments: there is no only one item, that I search, actually 7-10. And, there is a refresh every second! I go to write a cycle.
I know about copy() function and about right click on the console and "Save As," but I need a CODE, which will do it automatically.
Thanks in advance.
The browser has no API to write to the file system since that would be a security risk. But you can use Nodejs and their File System API to write you text file.
You will also need to use the HTTP API to get the web content. And you will also need to parse your HTML, you can do it with fast-html-parser or any other module of your choice. (high5, htmlparser, htmlparser2, htmlparser2-dom, hubbub, libxmljs, ms/file, parse5, ...)
var http = require('http');
var fs = require('fs');
var parser = require('node-html-parser');
var options = {
host: 'www.google.com',
port: 80,
path: '/index.html'
};
var file = '/path/to/myFile.txt';
http.get(options, function(res) {
res.setEncoding('utf8');
var body = '';
res.on('data', function (chunk) {body += chunk});
res.on('end', function () {
var dom = parser.parse(body);
var text = dom.querySelector('#SIvCob').text;
fs.writeFile(file, text, function (err) {
if (err) throw err;
console.log('The file has been saved!');
});
});
});
I have two servers that communicate with each other. Server1 requests for parts of the file from Server2 and store the data received into one file. Server2 is supposed to receive each of these requests and create a stream pipes the data over.
Suppose the files stored(directory) in Server 2 are as following
bigfile.gz
bigfile.gz.part-0
bigfile.gz.part-1
bigfile.gz.part-2
......
So Server1 will send a request for part-0 then part-1 and so on to the Server2. Hence the use of the loop to make requests.
Server 1 (code snippet)
for (var i in requestInfo['blockName']) {
var blockName = i;
var IP = requestInfo['blockName'][i][0];
var fileData = JSON.stringify({
blockName: blockName,
fileName: requestInfo['fileName']
});
makeRequest(fileData, IP);
console.log(counter);
}
function makeRequest(fileData, IP) {
var options = {
host: IP,
port: 5000,
path: '/read',
method: 'POST',
headers: {
'Content-Type': 'application/json'
}
};
var req = http.request(options, function(res) {
var data = '';
res.on('data', function(chunk) {
data += chunk;
});
res.on('end', function() {
console.log(data.length);
//fs.appendFileSync(fileName, data);
var writeStream = fs.createWriteStream(fileName, { "flags": 'a' });
writeStream.write(data);
writeStream.end();
});
});
req.write(fileData);
req.end();
}
Server 2 (code snippet)
app.post('/read', function(req, res) {
var dataBody = req.body;
fs.createReadStream(dataBody.fileName + '/' + dataBody.blockName).pipe(res);
});
The one above works for when I test it with a 100MB txt file. But it fails when i have 1GB .gz file or even when I test it with a .zip file the output the final .zip generated on the Server 1 side is the incorrect size.
I am not sure what I am doing wrong here or is the alternate solution
EDIT:
Also my Server1 crashes when dealing with the big 1GB .gz file
Your main problem here is that you treating your data as string by appending chunks to a string.
By rewriting this should be
var req = http.request(options, function(res) {
var data = [];
res.on('data', function(chunk) {
data.push(chunk);
});
res.on('end', function() {
fs.writeFile(fileName, Buffer.concat(data), function() {
console.log("write end")
});
});
});
That way we are creating a big array of binary chunks, and when the download is complete we write the concatenation of all the chunks to a file.
But notice the word big
If you stick with this implementation you are risking to get out of memory, especially if you are dealing with large (>500mb) files.
Streams to the rescue
var req = https.request(options, function(res) {
res.pipe(fs.createWriteStream(fileName)).on("close", function() {
console.log("write end");
});
});
Using the above implementation memory footprint should stay low. Because the moment you get a specific amount of data from your download, you write them to the file. That way you never keep the whole file into the program's memory.
In my website I am using node js for backend and html for front end. I need to get external website metadata (keywords).
Have any package for get the metadata in node js?
For example i have 100 website url in array following like this.
var arrayName = ["http://www.realsimple.com/food-recipes/9-healthy-predinner-snacks", "http://www.womenshealthmag.com/weight-loss/100-calorie-snacks", "https://www.pinterest.com/explore/healthy-snacks/", "http://www.rd.com/slideshows/healthy-snacks-for-adults/", "http://greatist.com/snacking", "http://www.bodybuilding.com/fun/26-best-healthy-snacks.html"]
I need to get all website metadata particularly in keywords of metadata.
In node js have any package for this ?
I found some code in google.
var options = {
host: 'www.google.com',
port: 80,
path: '/index.html'
};
http.get(options, function(res) {
console.log("Got response: " + res.statusCode);
}).on('error', function(e) {
console.log("Got error: " + e.message);
});
Have any other options?
Expected Outputs:
Array1 = ["keyword1","keyword2","keyword3"];
Array2 = ["keyword1","keyword2","keyword3"];
Array3 = ["keyword1","keyword2","keyword3"];
Array1, Array2, Array3 are Site1,Site2,Site3 like this.
I'll suggest you to use any from following packages:
http://npm.im/cheerio
http://npm.im/request
Note: You need to code it by yourself to grasp keywords from site data.
I'm trying to make a request to here, if you click on the link you should see a JSON response (as expected). I've, tried https and http, it doesnt matter (at least I don't think so).
Anyways the problem when I try to get a response from the command line, I get non UTF-8 characters like �������B��������E��9 as a response, even when I specify utf-8 encoding. Ive tried the npm module request and doing node http/https requests.
All i need is to just return a JSON response.
I've also tried JSON.parse() but to no avail.
Here's the code I've tried
var request = require("request")
var url = endpoint;
request({
url: url,
json: true
}, function (error, response, body) {
if (!error && response.statusCode === 200) {
console.log(body); // Print the json response
}
})
and the basic http request
var endpoint = 'http://api.stackexchange.com/2.1/search/advanced?order=desc&sort=relevance&q=jsonp&site=stackoverflow';
var req = http.request(endpoint, function(res) {
res.setEncoding('utf8');
res.on('data', function (chunk) {
console.log('BODY: ' + chunk);
});
});
req.on('error', function(e) {
console.log('problem with request: ' + e.message);
});
// write data to request body
req.write('data\n');
req.write('data\n');
req.end();
Stackoverflow servers are misconfigured, so they return gzip-encoded body even though you didn't ask for it.
Looks like you have to gunzip your response after receiving it.
Right now I'm using this script in PHP. I pass it the image and size (large/medium/small) and if it's on my server it returns the link, otherwise it copies it from a remote server then returns the local link.
function getImage ($img, $size) {
if (#filesize("./images/".$size."/".$img.".jpg")) {
return './images/'.$size.'/'.$img.'.jpg';
} else {
copy('http://www.othersite.com/images/'.$size.'/'.$img.'.jpg', './images/'.$size.'/'.$img.'.jpg');
return './images/'.$size.'/'.$img.'.jpg';
}
}
It works fine, but I'm trying to do the same thing in Node.js and I can't seem to figure it out. The filesystem seems to be unable to interact with any remote servers so I'm wondering if I'm just messing something up, or if it can't be done natively and a module will be required.
Anyone know of a way in Node.js?
You should check out http.Client and http.ClientResponse. Using those you can make a request to the remote server and write out the response to a local file using fs.WriteStream.
Something like this:
var http = require('http');
var fs = require('fs');
var google = http.createClient(80, 'www.google.com');
var request = google.request('GET', '/',
{'host': 'www.google.com'});
request.end();
out = fs.createWriteStream('out');
request.on('response', function (response) {
response.setEncoding('utf8');
response.on('data', function (chunk) {
out.write(chunk);
});
});
I haven't tested that, and I'm not sure it'll work out of the box. But I hope it'll guide you to what you need.
To give a more updated version (as the most recent answer is 4 years old, and http.createClient is now deprecated), here is a solution using the request method:
var fs = require('fs');
var request = require('request');
function getImage (img, size, filesize) {
var imgPath = size + '/' + img + '.jpg';
if (filesize) {
return './images/' + imgPath;
} else {
request('http://www.othersite.com/images/' + imgPath).pipe(fs.createWriteStream('./images/' + imgPath))
return './images/' + imgPath;
}
}
If you can't use remote user's password for some reasons and need to use the identity key (RSA) for authentication, then programmatically executing the scp with child_process is good to go
const { exec } = require('child_process');
exec(`scp -i /path/to/key username#example.com:/remote/path/to/file /local/path`,
(error, stdout, stderr) => {
if (error) {
console.log(`There was an error ${error}`);
}
console.log(`The stdout is ${stdout}`);
console.log(`The stderr is ${stderr}`);
});