pdf2json fails on passing a PDF from an HTTP request - javascript

I'm trying to get information from PDF files in a Node.js script.
I get this error when executing the program:
Error: stream must have data
at error (eval at <anonymous> (/Users/.../node_modules/pdf2json/lib/pdf.js:60:6), <anonymous>:193:7)
....
Here is the code:
http.get(url_Of_Pdf_File, function(res) {
    var body = '';
    res.on('data', function (chunk) {
        body += chunk;
    });
    res.on('end', function() {
        // Here body has the PDF content
        pdf2table.parse(body, function (err, rows, rowsdebug) { // <-- Conflict
            // Code fails on the previous line
            if (err) return console.log(err);
            toMyFormat(rows, function(data) {
                console.log(JSON.stringify(data, null, " "));
            });
        });
    });
});
I am not sure why this code does not work, because if I download the PDF file first and read it with 'fs.readFile' instead of fetching it with 'http.get', the same parsing code works:
fs.readFile(pdf_file_path, function (err, buffer) {
    if (err) return console.log(err);
    pdf2table.parse(buffer, function (err, rows, rowsdebug) {
        if (err) return console.log(err);
        console.timeEnd("Processing time");
        toMyFormat(rows, function(data) {
            output(JSON.stringify(rows, null, " "));
        });
    });
});
My question is:
What is the difference between the content of 'body' and 'buffer' in the two examples?

In the first example, each chunk is a Buffer, and appending it to the empty string body converts it to a UTF-8 string. When you concatenate a Buffer with a string, the bytes are decoded as UTF-8, and since a PDF is binary data that is not valid UTF-8, the original data is lost.
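The data loss is easy to demonstrate (a minimal sketch; the bytes are arbitrary ones that happen not to be valid UTF-8):
var buf = Buffer.from([0xff, 0xfe, 0x00, 0x89]); // arbitrary non-UTF-8 bytes
var asString = '' + buf;                         // same implicit UTF-8 decode as body += chunk
console.log(buf.length);                         // 4
console.log(Buffer.byteLength(asString));        // 10: each invalid byte became a U+FFFD replacement character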
Try this:
var chunks = [];
res.on('data', function (chunk) {
    chunks.push(chunk);
});
res.on('end', function() {
    // chunks now holds the raw PDF content
    pdf2table.parse(Buffer.concat(chunks), function (err, rows, rowsdebug) {
        //...
    });
});

Related

Convert raw buffer data to a JPEG image

This question is a follow-up to this answer. I'm asking it as a question because I don't have enough reputation to make comments.
I am trying to write an Android app that takes an image and sends it to a Node.js server.
I found this code in the post linked above, but it leaves me with a raw buffer of data. How can I save the contents of this buffer as a JPEG image?
app.post('/upload-image', rawBody, function(req, res) {
    if (req.rawBody && req.bodyLength > 0) {
        // TODO save image (req.rawBody) somewhere
        // send some content as JSON
        res.send(200, {
            status: 'OK'
        });
    } else {
        res.send(500);
    }
});

function rawBody(req, res, next) {
    var chunks = [];
    req.on('data', function(chunk) {
        chunks.push(chunk);
    });
    req.on('end', function() {
        var buffer = Buffer.concat(chunks);
        req.bodyLength = buffer.length;
        req.rawBody = buffer;
        next();
    });
    req.on('error', function(err) {
        console.log(err);
        res.status(500);
    });
}
You can simply use fs.writeFile.
Pass the file path as the first argument, your buffer as the second argument, and a callback to execute code once the file has been written.
Alternatively, use fs.writeFileSync, a synchronous version of writeFile which doesn't take a callback.
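A minimal sketch of the writeFile approach inside the handler above (the 'uploads/image.jpg' path is just an example destination):
var fs = require('fs');

app.post('/upload-image', rawBody, function(req, res) {
    if (req.rawBody && req.bodyLength > 0) {
        // req.rawBody is already a Buffer holding the raw JPEG bytes
        fs.writeFile('uploads/image.jpg', req.rawBody, function(err) {
            if (err) return res.send(500);
            res.send(200, { status: 'OK' });
        });
    } else {
        res.send(500);
    }
});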

Node.js https.get returns invalid JSON

Any help is welcome, I have been struggling for many hours now...
I have somewhat straightforward code in which I ping the GitHub API to retrieve some JSON. If I execute this code synchronously, it works very well. However, when I receive multiple calls at the same time (or if I run it in async.parallel), the result in aa is sometimes invalid JSON (I can see it in my logs) and JSON.parse crashes. It seems like there is a chunk missing from the final version of aa.
app.get('/update-tests', function(req, res) {
    const getOptionsGitHub = {
        ...
    };
    var req = https.get(getOptionsGitHub, function(res) {
        aa = "";
        res.on('data', function(chunk) { aa += chunk; });
        res.on('end', function() {
            try {
                console.dir(aa);
                callback(JSON.parse(aa));
            } catch (e) {
                console.dir(e);
            }
        });
    });
    res.send(200);
});
Any idea why I would have some missing chunk sometimes?
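For what it's worth, one likely culprit: aa is assigned without var, so it becomes an implicit global shared by every in-flight request, and concurrent responses interleave their chunks into the same string. Declaring it locally keeps each request's buffer separate; a minimal sketch of just that change:
var request = https.get(getOptionsGitHub, function(response) {
    var aa = ""; // local to this response, not a shared global
    response.on('data', function(chunk) { aa += chunk; });
    response.on('end', function() {
        try {
            callback(JSON.parse(aa));
        } catch (e) {
            console.dir(e);
        }
    });
});
Renaming the inner req/res (as above) also avoids shadowing the Express req/res from the route handler.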

Node.js cheerio weird characters (cyrillic letters)

I get weird characters when I try to parse a page.
Here is my code:
var getPageContent = function getPageContent(url, callback) {
    https.get(url, function (res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function () {
            callback(data);
        });
    }).on("error", function () {
        callback(null);
    });
};
getPageContent(url, function (response) {
    var $ = cheerio.load(response, { decodeEntities: false });
    $("div.details-info").each(function() {
        console.log($(this).html());
    });
});
My result is:
<span>Ст��атегии</span>
<span>Стратег��и</span>
<span>Стра��егии</span>
<span>Стратегии</span>
<span>Стратегии</span>
...
The strangest thing is that from the same URL, sometimes I get these strange characters and sometimes I don't. Also, when I run this from my computer it works fine; I only get these characters on the server.
You will probably need to manually convert the charset of the response to UTF-8. You can do this using the iconv or iconv-lite modules. cheerio itself does not automatically handle charset conversions.
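A minimal sketch of that approach with iconv-lite, collecting raw Buffers and decoding once at the end (the 'utf8' charset here is an assumption; use whatever the page's Content-Type header declares). Decoding the whole buffer in one pass also avoids another likely source of sporadic � characters: data += chunk decodes each chunk separately and can split a multi-byte Cyrillic character across chunk boundaries.
var https = require('https');
var iconv = require('iconv-lite');

var getPageContent = function (url, callback) {
    https.get(url, function (res) {
        var chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk); // keep the raw bytes; don't decode per chunk
        });
        res.on('end', function () {
            callback(iconv.decode(Buffer.concat(chunks), 'utf8'));
        });
    }).on('error', function () {
        callback(null);
    });
};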

Is there a way to check the body of a Node.js https request before I send it?

I'm trying to debug some API calls, and I want to confirm that I'm sending what I think I'm sending.
var req = https.request(request, function(res) {
    var body;
    res.setEncoding('utf8');
    body = "";
    res.on('data', function(chunk) {
        return body += chunk;
    });
    res.on('end', function() {
        if (res.statusCode === 200) {
            return settings.success(null, JSON.parse(body), res);
        } else {
            return settings.error(body, null, res);
        }
    });
    return res.on('error', function() {
        return settings.error(null, Array.prototype.slice.apply(arguments), res);
    });
});

if (settings.data != null) {
    req.write(settings.data);
}
// check body of req here
settings.data is a protobuf buffer. I want to check the final written body because there is a bug somewhere in the call and I would like to see exactly what I am sending. Can anyone help?
EDIT: I misunderstood the original question. To check what you're sending, you can use util.inspect(settings.data) to get a string that displays the contents of that variable, or use console.dir(settings.data) to print that string to stdout directly (console.dir() uses util.inspect() behind the scenes).
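For example, a quick dump of the outgoing body right before writing it (a sketch; the Buffer contents shown in the comment are illustrative):
var inspect = require('util').inspect;

if (settings.data != null) {
    console.log(inspect(settings.data)); // prints something like <Buffer 0a 03 ...>
    req.write(settings.data);
}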
Original answer:
I believe that protocol buffers are binary; however, you're converting all of the data to a UTF-8 string. You might try keeping it all binary instead (note this also means dropping the res.setEncoding('utf8') call, so chunks arrive as Buffers):
var body = [], nb = 0;
res.on('data', function(chunk) {
    body.push(chunk);
    nb += chunk.length;
});
res.on('end', function() {
    body = Buffer.concat(body, nb);
    // now `body` contains all protocol buffer data in the original binary format
});
If you then want to get a textual representation of what's in the buffer, you could use util.inspect(). For example:
var inspect = require('util').inspect;
// ...
res.on('end', function() {
    body = Buffer.concat(body, nb);
    if (res.statusCode === 200)
        settings.success(null, inspect(body), res);
    else
        settings.error(body, null, res);
});

Function to check if externally hosted text file contains string acting slow - Node

I have the following code, which retrieves a text file from an external server and searches it for a specific string.
The function:
function checkStringExistsInFile(String, cb) {
    var opt = {
        host: 'site.com',
        port: 80,
        path: '/directory/data.txt',
        method: 'GET',
    };
    var request = http.request(opt, function(response) {
        response
            .on('data', function(data) {
                var string = data + "";
                var result = ((string.indexOf(" " + String + " ") != -1) ? true : false);
                cb(null, result);
            })
            .on('error', function(e) {
                console.log("-ERR: Can't get file. " + JSON.stringify(e));
                if (cb) cb(e);
            })
            .on('end', function() {
                console.log("+INF: End of request");
            });
    });
    request.end();
}
And this is where I call the function, and do something with the results.
checkStringExistsInFile(String, function(err, result) {
    if (!err) {
        if (result) {
            // there was a result
        } else {
            // string not present in file
        }
    } else {
        // error occurred
    }
});
This worked great in the beginning (with a small text file), but my text file is getting larger (4000+ characters) and it no longer works.
What can I do to solve this? Should I temporarily save the file on my server first, or should I open the file as a stream?
It would be appreciated if you could support your answer with a relevant example. Thanks in advance!
Documentation:
If you attach a data event listener, then it will switch the stream into flowing mode, and data will be passed to your handler as soon as it is available.
If you just want to get all the data out of the stream as fast as possible, this is the best way to do so.
http://nodejs.org/api/stream.html#stream_event_data
The 'data' event is emitted as soon as data arrives, even before the stream has finished. So your code searches only the first chunk of the stream for your string, and then calls the callback.
What to do ?
Your function should only call the callback on the 'end' event, or as soon as it finds something:
function checkStringExistsInFile(String, cb) {
    var opt = {
        host: 'site.com',
        port: 80,
        path: '/directory/data.txt',
        method: 'GET',
    };
    var request = http.request(opt, function(response) {
        var result = false;
        response
            .on('data', function(data) {
                if (!result) {
                    var string = data + "";
                    result = (string.indexOf(" " + String + " ") != -1);
                    if (result) cb(null, result);
                }
            })
            .on('error', function(e) {
                console.log("-ERR: Can't get file. " + JSON.stringify(e));
                if (cb) cb(e);
            })
            .on('end', function() {
                console.log("+INF: End of request");
                if (!result) cb(null, result); // don't call back twice when a match was already reported
            });
    });
    request.end();
}
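One caveat with the chunk-by-chunk search above: a match that straddles a chunk boundary will be missed. If that matters, a safer variant (a sketch under the same host/path assumptions, with the String parameter renamed to needle) buffers the whole response and searches once at the end:
function checkStringExistsInFile(needle, cb) {
    var opt = {
        host: 'site.com',
        port: 80,
        path: '/directory/data.txt',
        method: 'GET',
    };
    var request = http.request(opt, function(response) {
        var chunks = [];
        response.on('data', function(chunk) {
            chunks.push(chunk); // accumulate raw Buffers
        });
        response.on('end', function() {
            var text = Buffer.concat(chunks).toString('utf8');
            cb(null, text.indexOf(" " + needle + " ") != -1);
        });
        response.on('error', function(e) {
            cb(e);
        });
    });
    request.end();
}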
