Node.js cheerio weird characters (Cyrillic letters)

I get weird characters when I try to parse a page.
Here is my code:
var https = require('https');
var cheerio = require('cheerio');

var getPageContent = function getPageContent(url, callback) {
    https.get(url, function (res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function () {
            callback(data);
        });
    }).on("error", function () {
        callback(null);
    });
};
getPageContent(url, function (response) {
    var $ = cheerio.load(response, { decodeEntities: false });
    $("div.details-info").each(function () {
        console.log($(this).html());
    });
});
My result is:
<span>Ст��атегии</span>
<span>Стратег��и</span>
<span>Стра��егии</span>
<span>Стратегии</span>
<span>Стратегии</span>
...
The strangest thing is that from the same URL I sometimes get these strange characters and sometimes I don't. Also, when I run this from my computer it works fine; I only get these characters on the server.

You will probably need to manually convert the charset of the response to UTF-8. You can do this using the iconv or iconv-lite modules; cheerio itself does not handle charset conversion.
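For example, here is a minimal sketch using iconv-lite. The charset name below is an assumption; in practice, take it from the response's Content-Type header. Collecting raw Buffers and decoding once at the end also avoids corrupting multi-byte characters that happen to be split across chunk boundaries, which would explain why the corruption is intermittent:
var https = require('https');
var iconv = require('iconv-lite');

var getPageContent = function getPageContent(url, callback) {
    https.get(url, function (res) {
        var chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk); // keep the raw bytes; no per-chunk string conversion
        });
        res.on('end', function () {
            // 'win1251' is a placeholder; use the charset from the Content-Type
            // header or the page's <meta charset> tag
            callback(iconv.decode(Buffer.concat(chunks), 'win1251'));
        });
    }).on('error', function () {
        callback(null);
    });
};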

Node.js https.get returns invalid JSON

Any help is welcome; I have been struggling for many hours now...
I have somewhat straightforward code in which I ping the GitHub API to retrieve a JSON object. If I execute this code synchronously it works very well. However, when I receive multiple calls at the same time (or run it in async.parallel), the result of aa is sometimes invalid JSON (I can see it in my logs) and JSON.parse crashes. It seems like there's a missing chunk in the final version of aa.
app.get('/update-tests', function(req, res) {
    const getOptionsGitHub = {
        ...
    };
    var req = https.get(getOptionsGitHub, function(res) {
        aa = "";
        res.on('data', function(chunk) { aa += chunk; });
        res.on('end', function() {
            try {
                console.dir(aa);
                callback(JSON.parse(aa));
            } catch (e) {
                console.dir(e);
            }
        });
    });
    res.send(200);
});
Any idea why I would have some missing chunk sometimes?
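One likely culprit, offered as a guess: aa is assigned without var, so it becomes an implicit global shared by every concurrent request, and parallel calls interleave their chunks into it. Here is a minimal sketch with aa scoped locally; since callback is not defined in the snippet, the sketch just logs the parsed result, and the inner req is renamed so it no longer shadows Express's req:
app.get('/update-tests', function (req, res) {
    const getOptionsGitHub = {
        // ... same options as in the question
    };
    https.get(getOptionsGitHub, function (ghRes) {
        var aa = ""; // local to this request, not shared between concurrent calls
        ghRes.on('data', function (chunk) { aa += chunk; });
        ghRes.on('end', function () {
            try {
                console.dir(JSON.parse(aa));
            } catch (e) {
                console.dir(e);
            }
        });
    });
    res.send(200);
});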

pdf2json fails on a PDF passed from an HTTP request

I'm trying to extract information from PDF files in a Node.js script.
I get this error when executing the program:
Error: stream must have data
at error (eval at <anonymous> (/Users/.../node_modules/pdf2json/lib/pdf.js:60:6), <anonymous>:193:7)
....
Here is the code:
http.get(url_Of_Pdf_File, function(res) {
    var body = '';
    res.on('data', function (chunk) {
        body += chunk;
    });
    res.on('end', function() {
        // Here body has the pdf content
        pdf2table.parse(body, function (err, rows, rowsdebug) { // <-- Conflict
            // Code fails executing the previous line
            if (err) return console.log(err);
            toMyFormat(rows, function(data) {
                console.log(JSON.stringify(data, null, " "));
            });
        });
    });
});
I am not sure why the code does not work, because if I download the PDF file and read it with the 'fs.readFile' method instead of fetching it with 'http.get', the code works:
fs.readFile(pdf_file_path, function (err, buffer) {
    if (err) return console.log(err);
    pdf2table.parse(buffer, function (err, rows, rowsdebug) {
        if (err) return console.log(err);
        console.timeEnd("Processing time");
        toMyFormat(rows, function(data) {
            output(JSON.stringify(rows, null, " "));
        });
    });
});
My question is:
What is the difference between the content of 'body' and 'buffer' in the two examples?
In the first example, each chunk is a Buffer, and appending it to the empty string '' converts it to a UTF-8 string. A PDF is binary data, not valid UTF-8, so that conversion corrupts it and the original data is lost.
Try this:
var chunks = [];
res.on('data', function (chunk) {
    chunks.push(chunk);
});
res.on('end', function() {
    // Here the chunks hold the raw pdf content
    pdf2table.parse(Buffer.concat(chunks), function (err, rows, rowsdebug) {
        //...
    });
});
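Buffer.concat() joins the raw chunks without any character-set conversion, so pdf2table receives exactly the same bytes that fs.readFile() hands over in the second example.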

Is there a way to check the body of a Node.js https request before I send it?

I'm trying to debug some API calls, and I want to confirm that I'm sending what I think I'm sending.
var req = https.request(request, function(res) {
    var body = "";
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
        return body += chunk;
    });
    res.on('end', function() {
        if (res.statusCode === 200) {
            return settings.success(null, JSON.parse(body), res);
        } else {
            return settings.error(body, null, res);
        }
    });
    return res.on('error', function() {
        return settings.error(null, Array.prototype.slice.apply(arguments), res);
    });
});
if (settings.data != null) {
    req.write(settings.data);
}
// check body of req here
settings.data is a protobuf buffer. I want to check the final written body because there is a bug somewhere in the call and I would like to see exactly what I am sending. Can anyone help?
EDIT: I misunderstood the original question. To check what you're sending, you can use util.inspect(settings.data) to get a string that displays the contents of that variable, or console.dir(settings.data) to print that string to stdout (console.dir() uses util.inspect() behind the scenes).
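For example, a minimal sketch of that check applied to the code above, just before the write (settings comes from the question):
var util = require('util');
// ...
if (settings.data != null) {
    // prints the exact bytes about to be written, e.g. <Buffer 0a 07 ...>
    console.log(util.inspect(settings.data));
    req.write(settings.data);
}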
Original answer:
I believe that protocol buffers are binary; however, you're converting all of the data to a UTF-8 string. You might try keeping it all as binary:
var body = [], nb = 0;
res.on('data', function(chunk) {
    body.push(chunk);
    nb += chunk.length;
});
res.on('end', function() {
    body = Buffer.concat(body, nb);
    // now `body` contains all protocol buffer data in the original binary format
});
If you then want to get a textual representation of what's in the buffer, you could use util.inspect(). For example:
var inspect = require('util').inspect;
// ...
res.on('end', function() {
    body = Buffer.concat(body, nb);
    if (res.statusCode === 200)
        settings.success(null, inspect(body), res);
    else
        settings.error(body, null, res);
});

Unable to get data from REST Calls using Node.js

I am making a REST API call from my PHP and Node.js applications to a URL provided by the client, which returns a JSON object. It works fine from PHP; however, I am unable to receive the data in my Node application. What might be the reason? Can someone help me?
Note: I have pasted a dummy REST URI for security reasons.
It works fine with PHP; in fact, I get the JSON-formatted data within a couple of seconds.
$response = file_get_contents('http://xyz.net/v2_resmgr/providers/pools');
echo $response;
When I try the same URL using Node.js, I get a timeout error. I also tried setting the timeout, but it still would not work.
var job = new CronJob({
    cronTime: '0 */3 * * * *',
    onTick: function () {
        url = "http://xyznet/v2_resmgr/providers/pools";
        var request = http.get(url, function (response) {
            var buffer = "",
                data,
                route;
            response.on("data", function (chunk) {
                buffer += chunk;
            });
            response.on("end", function (err) {
                console.log(buffer);
            });
            request.setTimeout(200000, function () {
                // handle timeout here
                console.log("Time Out call to the Rest API");
            });
        });
    },
    start: true
});
job.start();
I don't know if this is the answer you are looking for, but life gets easier when you use the 'request' package (https://www.npmjs.org/package/request).
Here is what the code would look like using the request module:
var request = require('request');

request('http://xyznet/v2_resmgr/providers/pools', function (error, response, body) {
    if (!error && response.statusCode == 200) {
        console.log(body); // Print the body of the response.
    }
});
Update: I coded something a little closer to your post. The code below does not use the "request" module and it contacts the server every 3 seconds.
var http = require('http');

setInterval(function () {
    http.get('http://echo.jsontest.com/key/value', function (response) {
        var responseBody = '';
        response.on('data', function (chunk) {
            responseBody += chunk;
        });
        response.on('end', function () {
            console.log(responseBody);
            var object = JSON.parse(responseBody);
        });
    });
}, 3000);

How to dump tabular data from a webpage that uses the no-js class?

I am trying to scrape a web page whose HTML uses the no-js class.
I've come up with code to scrape it.
The webpage always has a table, and I want that full table in an Excel file.
That means scraping the webpage and dumping the table into a file.
How do I do that?
Here's the code so far.
var http = require("http");

function download(url, callback) {
    http.get(url, function(res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function() {
            callback(data);
        });
    }).on("error", function() {
        callback(null);
    });
}

var url = "http://kayak.com";
download(url, function(data) {
    if (data) {
        console.log(data);
    } else {
        console.log("error");
    }
});
You could use the request module to get the page markup and then parse it with cheerio.
Cheerio provides a lightweight jQuery implementation that can be used on the server:
https://github.com/MatthewMueller/cheerio
Request provides a simplified HTTP client:
https://github.com/mikeal/request
var request = require('request');
var cheerio = require('cheerio');

var url = 'http://kayak.com';
request(url, function(err, res, body) {
    var $ = cheerio.load(body);
    var $rows = $('table tr').toArray();
    $rows.map(function(row) {
        var cells = $(row).find('td').toArray();
        console.log(cells.map(function(cell) {
            return $(cell).text().trim();
        }).join(', '));
    });
});
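To actually dump the table into a file, as the question asks, here is a minimal sketch that writes the rows out as CSV (which Excel opens directly) using the built-in fs module. It assumes the cell text contains no commas or quotes, and table.csv is a placeholder filename:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var url = 'http://kayak.com';
request(url, function (err, res, body) {
    if (err) return console.log(err);
    var $ = cheerio.load(body);
    // one CSV line per table row
    var lines = $('table tr').toArray().map(function (row) {
        return $(row).find('td').toArray().map(function (cell) {
            return $(cell).text().trim();
        }).join(',');
    });
    fs.writeFile('table.csv', lines.join('\n'), function (err) {
        if (err) return console.log(err);
        console.log('Saved table.csv');
    });
});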
