I'm using https://github.com/modesty/pdf2json to parse multiple PDF files. It works with a single file, but when I try to load multiple files, the pdfParser_dataReady event always seems to fire with the same file.
This is what I've tried:
var PDFParser = require('pdf2json');
var pdfParser = new PDFParser();
var fs = require('fs');
var fileNames = [];
var fileCont = 0;

fs.readdir(fileFolder, function(err, files){
    for (var i = files.length - 1; i >= 0; i--) {
        if (files[i].indexOf('.pdf') !== -1){
            fileNames.push(files[i]);
        }
    }
    pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
});

pdfParser.on('pdfParser_dataReady', function(data){
    //Do all my stuff and insert in db...
    fileCont++;
    if (fileCont === fileNames.length){
        for (var i = fileNames.length - 1; i >= 0; i--) {
            fs.unlink(fileFolder + fileNames[i]);
        }
        return res.json({
            data: 'ok'
        });
    }
    pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
});
I managed to make pdf2json work with multiple files by creating a new PDFParser in each iteration. This is not a very 'pretty' way to manage multiple PDF files, and the library should have an easier way of doing it, but it works!
var PDFParser = require('pdf2json');
var fs = require('fs');
var fileNames = [];
var fileFolder = 'myFolder/';
var fileCont = 0;
var loadPDF = function(filePath){
if(fileNames.length === fileCont){
//Insert in db and add any FINAL code, then return;
}
else{
//Call for another file to process
var pdfParser = null;
pdfParser = new PDFParser();
pdfParser.loadPDF(filePath);
pdfParser.on('pdfParser_dataError', function(err){
//Handle pdfParser error
});
pdfParser.on('pdfParser_dataReady', function(data){
//Get the pdf data and process it
fileCont++; //increase the file counter
loadPDF(fileFolder + fileNames[fileCont]); //parse the next file
});
}
};
fs.readdir(fileFolder, function(err, files){
for (var i = files.length - 1; i >= 0; i--) {
if (files[i].indexOf('.pdf') !== -1){
fileNames.push(files[i]);
}
}
loadPDF(fileFolder + fileNames[fileCont]);
});
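If you prefer, the same new-parser-per-file idea can be wrapped in a promise so the loop reads more naturally. A minimal sketch, assuming the same pdf2json events used above (the folder name is just a placeholder):
var PDFParser = require('pdf2json');
var fs = require('fs');

var fileFolder = 'myFolder/'; // placeholder

// Wrap a single parse in a promise; a fresh PDFParser is still created per file.
function parsePdf(filePath) {
    return new Promise(function(resolve, reject) {
        var pdfParser = new PDFParser();
        pdfParser.on('pdfParser_dataError', reject);
        pdfParser.on('pdfParser_dataReady', resolve);
        pdfParser.loadPDF(filePath);
    });
}

fs.readdir(fileFolder, async function(err, files) {
    var pdfFiles = files.filter(function(f) { return f.indexOf('.pdf') !== -1; });
    for (var i = 0; i < pdfFiles.length; i++) {
        var data = await parsePdf(fileFolder + pdfFiles[i]);
        // Do all your stuff with `data` and insert in db...
    }
    // All files processed; delete them / send the response here.
});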
In an array I have filenames; I want to first read one file, perform some operation, and store the result in a separate file. Then read the 2nd file, perform the operation again, and save the result in a new 2nd file. Do the same procedure for all files. Below is the code I have written to read and write the files.
TextReader.js
var fs = require('fs');
const readline= require('readline');
var headerIndex = [];
var isFirstLine = true;
var finalList = [];
module.exports={
readTextFile: (filename)=>{
console.log('inside textreader')
readline.createInterface({
input: fs.createReadStream(`./s3/${filename}`)
}).on('line', function(line) {
console.log(line);
console.log("-----------------------------");
if (isFirstLine) {
headerIndex = line.split('|');
}
else if (!isFirstLine){
let rowValues = line.split('|');
let valueIndex = 0;
var singlePerson = {};
headerIndex.forEach(currentval => {
singlePerson[currentval] = rowValues[valueIndex];
valueIndex++;
});
finalList.push(singlePerson);
}
isFirstLine = false;
}).on('close',function(){
//console.log(finalList);
var data='';
var header= "Employee ID"+'\t'+headerIndex[0]+'\t'+headerIndex[2]+'\t'+headerIndex[1]+'\t'+headerIndex[4]
+'\t'+headerIndex[3]+'\t'+headerIndex[5]+'\n';
for (var i = 0; i < finalList.length; i++) {
function split(name){
var conv=name.split(' ');
var result=[conv.slice(0, -1).join(' '),conv.slice(-1)[0]].join(conv.length < 2 ? '' : ',');
return result;
}
split(finalList[i].UserName);
data=data+finalList[i].LoginID+'\t'+split(finalList[i].UserName)+'\t'+finalList[i].Email+'\t'
+finalList[i].LoginID+'\t'+'A&G Professional'+'\t'+finalList[i].Title+'\t'+finalList[i].State+'\n';
}
var newFilename= filename.substr(0, filename.lastIndexOf("."))
var alldata= header + data;
//console.log(alldata)
fs.appendFile(`./s3/${filename}.xlsx`,alldata, (err) => {
if (err) throw err;
console.log('File created');
});
});
}
}
I am calling readTextFile() from another file.
demo.js
const { readTextFile } = require("./textReader");
var array=['UserRoleDetails_12102021063206.txt',
'UserRoleDetails_12102021064706 (1).txt',
'UserRoleDetails_12102021064706.txt',
'UserRoleDetails_12102021070206.txt']
array.forEach(function(currentItem){
readTextFile(currentItem);
})
The problem I am facing is that all the files are processed at the same time and the data from all the files is stored together.
First, Node.js does not run this sequentially the way you expect here, and second, array.forEach is not useful here for doing the operations sequentially. You need to use:
const { readTextFile } = require("./textReader");
var array=['UserRoleDetails_12102021063206.txt',
'UserRoleDetails_12102021064706 (1).txt',
'UserRoleDetails_12102021064706.txt',
'UserRoleDetails_12102021070206.txt']
for (const element of array) {
readTextFile(element);
}
NOTE: your readTextFile(currentItem) function is not async, so you may need to make it async (have it return a promise) and await it inside the loop.
If anything is unclear, just ask.
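For example, here is a minimal sketch of that idea, assuming readTextFile is changed to return a Promise that resolves when readline emits 'close' (the file names come from your question; the promise wrapper itself is illustrative):
// textReader.js (sketch)
const fs = require('fs');
const readline = require('readline');

module.exports = {
    readTextFile: (filename) => {
        return new Promise((resolve, reject) => {
            const input = fs.createReadStream(`./s3/${filename}`);
            input.on('error', reject);
            const lines = [];
            readline.createInterface({ input })
                .on('line', (line) => {
                    lines.push(line); // parse each line here as you already do
                })
                .on('close', () => {
                    // write the result file here, then resolve so the next file can start
                    resolve(lines);
                });
        });
    }
};

// demo.js (sketch)
const { readTextFile } = require('./textReader');

const array = [
    'UserRoleDetails_12102021063206.txt',
    'UserRoleDetails_12102021064706 (1).txt',
    'UserRoleDetails_12102021064706.txt',
    'UserRoleDetails_12102021070206.txt'
];

(async () => {
    for (const element of array) {
        await readTextFile(element); // waits for one file to finish before starting the next
    }
})();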
I have a tab-separated file with thousands of rows of data. How can I use Node.js to read the file line by line, parse each line, create an object, and insert it into a MongoDB database?
I am just learning Node and Mongo; I come from a different background. So how can this be done?
In the end, the MongoDB database has to be populated with the proper data.
I searched the net but could not find a complete solution.
Thanks.
I had an issue with the answer by Juvenik. My problem was that the database would not be populated by the time readline had completed: the lines were all read straight through, but the DB insertions were asynchronous and had not finished.
Instead, I found a simpler solution with the line-reader package. It reads the lines and waits for a callback before continuing.
var MongoClient = require('mongodb').MongoClient
var dbName = 'yourDbName'
var url = 'mongodb://localhost:27017/' + dbName
var collectionName = 'yourCollectionName'
var filename = 'yourFileName.txt'
var printLine = 1000
MongoClient.connect(url, function(err, db) {
if (err) {
console.error('Problem connecting to database')
} else {
console.log('Connected correctly to server.')
var lineReader = require('line-reader')
var collection = db.collection(collectionName)
var lineNum = -1
var headers = []
lineReader.eachLine(filename, function(line, last, cb) {
lineNum++
try {
var split = line.split('\t')
var object = {}
if (lineNum > 0) {
for (var i = 0; i < split.length; i += 1) {
object[headers[i]] = split[i]
}
collection.insert(object, function (insertErr, insertObj) {
if (insertErr) console.error(insertErr)
if (lineNum % printLine === 0) console.log('Line ' + lineNum)
if (last) {
console.log('Done with ' + filename + ' (' + lineNum + ' records)')
process.exit(0)
} else {
cb()
}
})
} else {
headers = line.split('\t')
cb()
}
} catch (lineError) {
console.error(lineError)
}
})
}
})
I came across a similar problem, and this approach worked for me. Have a look; it might be helpful.
var mongoDb = require('mongodb');
var mongoClient = mongoDb.MongoClient;
var dbname = 'YOUR_DB_NAME';
var collectionName = 'YOUR_COLLECTION_NAME';
var url = 'mongodb://localhost:27017/'+dbname;
var filename = 'FIle_Name.txt';
console.log('***************Process started');
mongoClient.connect(url,function(err,db){
if(err){
console.log('error on connection '+err);
}
else{
console.log('***************Successfully connected to mongodb');
var collection = db.collection(collectionName);
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream(filename);
var outstream = new stream;
var rl = readline.createInterface(instream,outstream);
console.log('***************Parsing, please wait ...');
rl.on('line',function(line){
try{
var arr = line.split('\t');
var object = {};
//Parse them here
//Example
object['name'] = arr[0]; //Just an example
var res = collection.insert(object);
}
catch (err){
console.log(err);
}
});
rl.on('close',function(){
db.close();
console.log('***************completed');
});
}
});
I am a learner too; if someone can make it better, that would be good.
Here is a more performant version of frank-0's answer (it inserts objects in batches), updated to use async and the latest MongoDB driver:
const lineReader = require('line-reader');

// `collection` is assumed to be an already-connected MongoDB collection.
async function readFileAndInsertInMongo(file) {
    let total = 0;
    return new Promise((resolve, reject) => {
        let buffer = [];
        lineReader.eachLine(file, (line, last, cb) => {
            // prepare your object based on the line content
            let insertObject = {'some_content': 'some_value'};
            buffer.push(insertObject);
            total++;
            if (total % 10000 === 0 || last) {
                // flush the current batch in a single insertMany call
                collection.insertMany(buffer, function(err, res) {
                    if (last) {
                        if (err) {
                            reject(err);
                        } else {
                            resolve(res);
                        }
                    } else {
                        buffer = [];
                        return cb();
                    }
                });
            } else {
                return cb();
            }
        });
    });
}
This really is the best solution I have found for parsing huge files and inserting them into the database without blowing up Node's memory. Hope this can help ;)
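A minimal usage sketch, assuming the current driver; the connection URL, database, collection, and file names are placeholders:
const { MongoClient } = require('mongodb');

let collection; // referenced by readFileAndInsertInMongo above

async function run() {
    const client = new MongoClient('mongodb://localhost:27017');
    await client.connect();
    collection = client.db('yourDbName').collection('yourCollectionName');
    await readFileAndInsertInMongo('yourFileName.txt');
    await client.close();
}

run().catch(console.error);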
My web application allows users to create and view files they have created. When a user logs in they can see all the files they created; however, when the user creates a new file and then clicks to view all the files, the newly created file is not there.
Here is my code for saving the file in Parse:
router.post('/newfile', function(req, res) {
var usercode = req.body.code;
console.log(usercode);
newname = req.body.name;
console.log(newname);
var bytes = [];
var doc = {};
for (var i = 0; i < usercode.length; i++) {
bytes.push(usercode.charCodeAt(i));
};
console.log("passed byetes");
var parseFile = new Parse.File(newname, bytes);
parseFile.save();
var FileClass = Parse.Object.extend("File");
var newFile = new FileClass();
newFile.save({
user: Parse.User.current(),
fileName: newname,
javaFile: parseFile
});
var newFileClass = Parse.Object.extend("File");
var query = new Parse.Query(newFileClass);
query.find(function(results) {
for (var i = 0; i < results.length; i++) {
var object = results[i];
var codefile = object.get('javaFile');
temp = codefile.name();
name = temp.split("-").pop();
url = codefile.url();
doc[name] = url;
}
}).then(function() {
console.log("Inside then(function()");
console.log(doc);
res.json({
FIles: JSON.stringify(doc)
});
});
});
So once the new file is saved, I do a Parse query find to get all the files again. When the program reaches the line console.log(doc), it should print all the files including the new one, but it only prints the old files. However, once I log out and log back in, the new file is there. How do I fix this so that after the user saves a new file it appears along with the other files?
save() runs asynchronously, so all of the code after save() runs before the save has finished, i.e. before the query is able to see anything new. Restructure like this:
save().then(function () {
// everything that follows your save() here
var FileClass = Parse.Object.extend("File");
// etc.
});
EDIT: the better code formatting in your edit revealed that several async functions are being attempted; use promises to keep them straight...
router.post('/newfile', function(req, res) {
var usercode = req.body.code;
console.log(usercode);
newname = req.body.name;
console.log(newname);
var bytes = [];
var doc = {};
for (var i = 0; i < usercode.length; i++) {
bytes.push(usercode.charCodeAt(i));
};
console.log("passed byetes");
var parseFile = new Parse.File(newname, bytes);
parseFile.save();
var FileClass = Parse.Object.extend("File");
var newFile = new FileClass();
newFile.save({
    user: Parse.User.current(),
    fileName: newname,
    javaFile: parseFile
}).then(function () {
    var newFileClass = Parse.Object.extend("File");
    var query = new Parse.Query(newFileClass);
    return query.find();
}).then(function(results) {
for (var i = 0; i < results.length; i++) {
var object = results[i];
var codefile = object.get('javaFile');
temp = codefile.name();
name = temp.split("-").pop();
url = codefile.url();
doc[name] = url;
}
console.log("Inside then(function()");
console.log(doc);
res.json({
FIles: JSON.stringify(doc)
});
});
});
I find even this a little tough on the eyes. I can't be sure I didn't just inadvertently code in a syntax error for you. An even better restructure would be to divide this big function into several promise-returning smaller ones, named by what they promise to do.
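For example, a rough sketch of that split; the helper names below are purely illustrative, not part of the Parse API:
// Illustrative helpers; each one returns a promise.
function toBytes(code) {
    var bytes = [];
    for (var i = 0; i < code.length; i++) {
        bytes.push(code.charCodeAt(i));
    }
    return bytes;
}

function saveParseFile(name, bytes) {
    return new Parse.File(name, bytes).save();
}

function saveFileRecord(name, parseFile) {
    var FileClass = Parse.Object.extend("File");
    return new FileClass().save({
        user: Parse.User.current(),
        fileName: name,
        javaFile: parseFile
    });
}

function findAllFiles() {
    var FileClass = Parse.Object.extend("File");
    return new Parse.Query(FileClass).find();
}

router.post('/newfile', function(req, res) {
    saveParseFile(req.body.name, toBytes(req.body.code))
        .then(function(parseFile) {
            return saveFileRecord(req.body.name, parseFile);
        })
        .then(function() {
            return findAllFiles();
        })
        .then(function(results) {
            var doc = {};
            results.forEach(function(object) {
                var codefile = object.get('javaFile');
                doc[codefile.name().split("-").pop()] = codefile.url();
            });
            res.json({ FIles: JSON.stringify(doc) });
        });
});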
I'm trying to display this Node.js file through my Node.js server. However, all it does is display the code, nicely formatted as if it were open in Notepad, rather than showing what the script prints to the console when executed. In other words, I want my Node.js file to execute and have its output displayed in the browser, not just have the code shown on the webpage.
var fs = require('fs');
var colour = require('colour');
var array = fs.readFileSync('songs/sister_golden_hair.txt').toString().split("\n");
var chordsArray =[];
var lyricsArray =[];
var count1 = 0;
for(var line in array) {
count1++;
if (count1 >= 3) {
var count2 = 0;
lyricsArray[count1-3] = [];
chordsArray[count1-3] = [];
while (array[line].indexOf("[") != -1) {
lyricsArray[count1-3][count2] = array[line].substring(0, array[line].indexOf("["));
array[line] = array[line].substring(array[line].indexOf("[")+1, array[line].length);
chordsArray[count1-3][count2] = array[line].substring(0, array[line].indexOf("]"));
array[line] = array[line].substring(array[line].indexOf("]")+1, array[line].length);
count2++;
}
if (array[line].length > 0) {
lyricsArray[count1-3][count2] = array[line];
chordsArray[count1-3][count2] = "";
}
}
else {
console.log(array[line]);
}
}
for (var i = 0; i < chordsArray.length; i++) {
for (var j = 0; j < chordsArray[i].length; j++) {
if (j == 0) {
for (var k = 0; k < lyricsArray[i][j].length; k++) {
process.stdout.write(" ");
}
process.stdout.write(chordsArray[i][j].green);
}
else {
for (var k = 0; k < lyricsArray[i][j].length - chordsArray[i][j-1].length; k++) {
process.stdout.write(" ");
}
process.stdout.write(chordsArray[i][j].green);
}
}
console.log();
for (var j = 0; j < lyricsArray[i].length; j++) {
process.stdout.write(lyricsArray[i][j].yellow);
}
}
And here is the Node.js server I am running, where I try to say that if the client goes to "http://127.0.0.1/sister_golden_hair.html" it should display the output of the above Node.js program:
var http = require('http');
var fs = require('fs');
var counter = 1000;
function serveStaticFile(res, path, contentType, responseCode){
if(!responseCode) responseCode = 200;
fs.readFile(__dirname + path, function(err, data){
if(err){
res.writeHead(500, {'Content-Type': 'text/plain'});
res.end('[' + counter++ + ']: ' + '500 INTERNAL FILE ERROR' + '\n');
}
else {
res.writeHead(responseCode , {'Content-Type': contentType});
res.end(data);
}
});
}
http.createServer(function (request,response){
var path = request.url.replace(/\/?(?:\?.*)$/,'').toLowerCase();
//write HTTP header
var page = '';
switch(path){
case '/sister_golden_hair.html':
serveStaticFile(response,
'/Problem4.js',
'application/javascript');
break;
}
}).listen(3000, "127.0.0.1");
console.log('Server Running at http://127.0.0.1:3000 CNTL-C to quit');
If you want to run Node.js code in the browser, you're going to need to use something like Browserify.
Have you tried using eval() (https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/eval)? Check out the warning, though.
My idea is to download a huge file from the server in chunks:
1. The chunks are stored in IndexedDB.
2. After downloading all the chunks, merge all the records into one (like SQL SELECT * FROM XXX ...).
3. Save it to disk with "saveAs()" or create a URL to the IndexedDB data... (Or any other idea?)
I do not know how to do steps 2 and 3.
(Below is an example of storing 10,000 records in the DB; the expected output after step 2 should be SusanSusanSusanSusanSusan...)
if (transaction) {
transaction.oncomplete = function () {
}
transaction.onabort = function () {
writeToConsoleScreen("transaction aborted.");
localDatabase.db.close();
}
transaction.ontimeout = function () {
writeToConsoleScreen("transaction timeout.");
localDatabase.db.close();
}
var store = transaction.objectStore(osTableName);
if (store) {
var req;
var customer = {};
// create ten thousand records
for (var loop = 0; loop < 10000; loop++) {
customer = {};
customer.fname = 'Susan';
req = store.add(customer);
req.onsuccess = function (ev) {
}
req.onerror = function (ev) {
writeToConsoleScreen("Failed to add record." + " Error: " + ev.message);
}
}
}
}
<!DOCTYPE html>
<script>
var open = indexedDB.open('chunks-example');
open.onupgradeneeded = function() {
// Create schema if necessary
var db = open.result;
db.createObjectStore('chunks');
};
// 1. Chunks stored into an IndexedDB.
open.onsuccess = function() {
var db = open.result;
var tx = db.transaction('chunks', 'readwrite');
var store = tx.objectStore('chunks');
for (var i = 0; i < 10; ++i) {
// For realz, this would be via
// XMLHttpRequest.response and async.
var chunk = new Blob(['chunk ' + i + '\n'],
{type: 'application/octet-stream'});
store.put(chunk, i);
}
tx.oncomplete = function() { merge(db); };
};
// 2. After "download" all chunks, merge all records into one
function merge(db) {
var tx = db.transaction('chunks');
var store = tx.objectStore('chunks');
var chunks = [];
var request = store.openCursor();
request.onsuccess = function() {
var cursor = request.result;
if (cursor) {
chunks.push(cursor.value);
cursor.continue();
} else {
saveAs('myfile', new Blob(chunks,
{type: 'application/octet-stream'}));
}
};
}
// 3. Save into disk "saveAs()"
function saveAs(filename, blob) {
var a = document.documentElement.appendChild(document.createElement('a'));
a.href = URL.createObjectURL(blob);
a.download = filename;
a.click();
a.remove();
}
</script>