Nodejs: Loop Through Files and Parse PDFs using Callback in Sync - javascript

I am new to Node, so I am struggling quite a bit with its async nature.
I am trying to create a script that will parse the PDFs inside a directory and output them in txt format in another directory.
To do this, I am using the fs and pdf2json npm packages. I am passing the parseData function as a callback to the loopingFiles function. The only problem I am having is the async nature of Node.
It loops through all the files at the same time, and the output ends up as a jumbled mess in the last file index.
I would like to process this sequentially, so that it waits until the data is finished parsing and written to the txt file before looping again.
I have tried promises, but to no avail. Any help would be much appreciated!
var fs = require('fs'),
    PDFParser = require("pdf2json");

let pdfParser = new PDFParser(this, 1);

var parseData = function(pdf, index) {
    txtFile = "/Users/janet/node/pdf/Destination/".concat(index.toString().concat(".txt"));
    pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);
    pdfParser.loadPDF(pdfFile);
    // Parsing the pdf file in question
    pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
    pdfParser.on("pdfParser_dataReady", pdfData => {
        fs.writeFile(txtFile, pdfParser.getRawTextContent());
    });
};

var loopingFiles = function(callback) {
    fs.readdir("/Users/janet/node/pdf/Source", function(err, files) {
        if (err) {
            console.log(err);
        } else {
            files.forEach(function(file, index) {
                callback(file, index);
            });
        }
    });
};
loopingFiles(parseData);

Something like this?
var fs = require("fs"),
    PDFParser = require("pdf2json");

var parseData = function(pdfs, index = 0) {
    // finished
    if (index >= pdfs.length) return;

    let pdf = pdfs[index];
    let txtFile = "/Users/janet/node/pdf/Destination/".concat(index.toString().concat(".txt"));
    let pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);

    // Create a fresh parser per file so listeners don't accumulate
    // on a single shared instance (that is what jumbled the output)
    let pdfParser = new PDFParser(this, 1);

    // Parsing the pdf file in question
    pdfParser.on("pdfParser_dataError", errData => {
        console.error(errData.parserError);
        // not sure if you want to call this here to keep going or not
        parseData(pdfs, index + 1);
    });
    pdfParser.on("pdfParser_dataReady", pdfData => {
        fs.writeFile(txtFile, pdfParser.getRawTextContent(), function() {
            // when we're all done, call this function again with the index of the next pdf
            parseData(pdfs, index + 1);
        });
    });
    pdfParser.loadPDF(pdfFile);
};
var loopingFiles = function(callback) {
    fs.readdir("/Users/janet/node/pdf/Source", function(err, files) {
        if (err) {
            console.log(err);
        } else {
            callback(files, 0);
        }
    });
};
loopingFiles(parseData);
The main difference is passing the whole array of PDFs to the function along with an index, and only calling the function again, with an incremented index, once the current file has finished processing.
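For reference, the same sequential flow can also be written with a small promise wrapper and async/await; this is just a sketch, assuming the same pdf2json API used above:
var fs = require("fs").promises,
    PDFParser = require("pdf2json");

// Wrap one parse-and-write cycle in a Promise
function parseOne(pdfFile, txtFile) {
    return new Promise((resolve, reject) => {
        let pdfParser = new PDFParser(this, 1);
        pdfParser.on("pdfParser_dataError", errData => reject(errData.parserError));
        pdfParser.on("pdfParser_dataReady", () => {
            fs.writeFile(txtFile, pdfParser.getRawTextContent()).then(resolve, reject);
        });
        pdfParser.loadPDF(pdfFile);
    });
}

async function loopingFiles() {
    const files = await fs.readdir("/Users/janet/node/pdf/Source");
    for (const [index, file] of files.entries()) {
        // await makes each file finish before the next one starts
        await parseOne(
            "/Users/janet/node/pdf/Source/" + file,
            "/Users/janet/node/pdf/Destination/" + index + ".txt"
        );
    }
}

loopingFiles().catch(console.error);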

Related

Check array after async task finishes

Today I'm having an issue with an async task while using JSZip.
I want to check the contents of an array after the async task executed by JSZip ends.
I have a zip which contains one XML file, which I read to get specific nodes and store them in another list to work with later. My issue is that the array check runs before the XML file has been read, so the array is still empty.
I have tried a few ways to make it work, but without success so far.
fileElement.addEventListener('change', (e) => {
    try {
        var zip = new JSZip();
        zip.loadAsync(fileElement.files[0])
            .then(function(zip) {
                let xmlfiles = []
                const _ziptask = async () => {
                    for (let [filename, file] of Object.entries(zip.files)) {
                        if (filename.includes("file.xml")) {
                            file.async("string").then(function(data) {
                                let xmlDoc = new DOMParser().parseFromString(data, "text/xml");
                                let metaInputs = [...xmlDoc.querySelectorAll("file")];
                                xmlfiles = metaInputs.filter(_node => null != _node.getAttribute('src'));
                                console.log("FILE.XML LOOP ENDED")
                            });
                        }
                    }
                }
                async () => {
                    await _ziptask().then(() => {
                        console.log("CHECKING FILE.XML ARRAY ")
                        console.log(xmlfiles)
                    })
                }
            }, function() { console.error("ERROR: NOT ZIP FILE") });
    } catch (error) {
        restoreFileInput("Something went wrong, try it again later")
    }
});
Well, after testing different things, I reached the goal by using an array of promises and Promise.all, which waits until all the promises in the array have resolved.
It's curious that in the example I read, the promises are stored in a const declaration instead of var or let; const only prevents reassignment of the binding, so you can still push onto the array.
Anyway, if someone wants to see the result:
fileElement.addEventListener('change', (e) => {
    try {
        var zip = new JSZip();
        zip.loadAsync(fileElement.files[0])
            .then(function(zip) {
                let xmlfiles = []
                const promises = [];
                for (let [filename, file] of Object.entries(zip.files)) {
                    if (filename.includes("file.xml")) {
                        promises.push(file.async("string").then(function(data) {
                            let xmlDoc = new DOMParser().parseFromString(data, "text/xml");
                            let metaInputs = [...xmlDoc.querySelectorAll("file")];
                            xmlfiles = metaInputs.filter(_node => null != _node.getAttribute('src'));
                            console.log("FILE.XML LOOP ENDED")
                        }));
                    }
                }
                Promise.all(promises).then(function() {
                    console.log("CHECKING FILE.XML ARRAY ")
                    console.log(xmlfiles)
                });
            }, function() { console.error("ERROR: NOT ZIP FILE") });
    } catch (error) {
        restoreFileInput("Something went wrong, try it again later")
    }
});
Thanks for the help to the guys who commented previously.
Best regards.
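For completeness, the same idea reads a little flatter with async/await; a sketch, assuming the same JSZip and DOMParser APIs as above:
fileElement.addEventListener('change', async (e) => {
    try {
        const zip = await new JSZip().loadAsync(fileElement.files[0]);
        // one read promise per matching entry
        const reads = Object.entries(zip.files)
            .filter(([filename]) => filename.includes("file.xml"))
            .map(([, file]) => file.async("string"));
        const contents = await Promise.all(reads);
        const xmlfiles = contents.flatMap(data => {
            const xmlDoc = new DOMParser().parseFromString(data, "text/xml");
            return [...xmlDoc.querySelectorAll("file")].filter(node => node.getAttribute('src') != null);
        });
        console.log("CHECKING FILE.XML ARRAY ");
        console.log(xmlfiles);
    } catch (error) {
        restoreFileInput("Something went wrong, try it again later");
    }
});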

Node does not wait for loop to complete

I have tried async/await and promises, but I cannot get this code to execute in order.
The code iterates through a folder of documents, parses each one, saves the result to an array, and then saves the array to a .json file.
However, the code continues to run before the loop finishes, which means it writes an empty file because the parsing has not been completed.
Turning it into an async function and awaiting it does not solve the issue. Nor does returning a promise and using .then() to execute the final code. It still runs straight away.
const fs = require('fs');
const cheerio = require('cheerio');
const mammoth = require("mammoth");

const articleFolder = './Articles/';
var allArticles = [];

const extractDocuments = async () => {
    let files = fs.readdirSync(articleFolder);
    for (const file of files) {
        await convertToHTML(file);
    }
    completedExtraction();
}

async function convertToHTML(filename) {
    var filepath = articleFolder + filename;
    mammoth.convertToHtml({path: filepath})
        .then(function(result) {
            let html = result.value; // The generated HTML
            let messages = result.messages; // Any messages, such as warnings during conversion
            updateArticles(filename, html);
        })
        .done();
}

function updateArticles(filename, html) {
    var article = {
        file: filename,
        content: parseHTML(html)
    }
    allArticles.push(article);
}

function parseHTML(html) {
    let $ = cheerio.load(html);
    let title = $('h3').first().text();
    let date = $('h3:eq(1)').text();
    $('h3').slice(0, 2).remove()
    let content = $('body').html();
    let parsedArticle = {
        title: title,
        date: date,
        content: content
    }
    return parsedArticle;
}

function completedExtraction() {
    fs.writeFile('./articles.json', JSON.stringify(allArticles), (err) => {
        if (err) throw err;
        console.log('File Written.');
    });
    console.log('Finished.');
}

extractDocuments();
To solve it with map, I would do something similar to:
const extractDocuments = async () => {
    let files = fs.readdirSync(articleFolder);
    const articlePromises = files.map(async file => {
        const html = await convertToHTML(file)
        return {
            filename: file,
            html: html
        }
    })
    allArticles = await Promise.all(articlePromises)
    completedExtraction();
}

async function convertToHTML(filename) {
    var filepath = articleFolder + filename;
    // Return the promise chain; .done() is dropped here, since it
    // returns undefined and would break the await above.
    return mammoth.convertToHtml({path: filepath})
        .then(function(result) {
            let html = result.value; // The generated HTML
            let messages = result.messages; // Any messages, such as warnings during conversion
            return html
        });
}
So, to wrap up: extractDocuments uses map to iterate over the files and create the articles, and convertToHTML now only returns the generated HTML and nothing more. We no longer use updateArticles, since that work is handled inside extractDocuments.
Hope this helps and points you in the right direction.
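If the documents must be processed strictly one at a time instead of in parallel, a for...of loop with await also works; a sketch, reusing the fixed convertToHTML above:
const extractDocumentsSequentially = async () => {
    const files = fs.readdirSync(articleFolder);
    const articles = [];
    for (const file of files) {
        // each await completes before the next file starts
        const html = await convertToHTML(file);
        articles.push({ filename: file, html: html });
    }
    allArticles = articles;
    completedExtraction();
};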

Writing to file in NodeJS calling from another JS file

I am having a weird issue writing to a file in NodeJS.
I have this code in my FileHandler.js:
module.exports.writeFile = function(arr) {
    var fs = require('fs');
    console.log(arr);
    var file = fs.createWriteStream(__dirname + '\\test.txt', {encoding: 'utf8'});
    file.on('error', function(err) {
        console.log(err);
    });
    file.on("finish", function() {
        console.log("finished");
    });
    arr.forEach(function(item) {
        file.write(item + "\n");
    });
    file.end();
}
If I append
exports.writeFile(["1","2","3"])
to the end of this file and then run node FileHandler.js,
the file is created correctly.
However, if I call the writeFile function from another .js file as:
var R = require("r-script");
const dataHandler = require("./DataHandler");
const fileHandler = require("./FileHandler");

var out = R(__dirname + "\\apriori.R");

exports.getRules = function() {
    dataHandler.getListOfPageVisitsBySession(1000781912582, 1530781912582, function(result) {
        //ignored result variable
        fileHandler.writeFile(["1","2","3"]);
    })
}
and pass the exact same array to the function, it doesn't write anything (though the file is created), and neither the error nor the finish event fires.
If it matters, the DataHandler method uses the request module and makes a GET call to another API.
Any clue what the problem is?
Thanks in advance
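One way to make this kind of write awaitable from the caller (a sketch, not a confirmed fix for the issue above) is to resolve a Promise on the stream's finish event:
var fs = require('fs');
var path = require('path');

module.exports.writeFile = function(arr) {
    return new Promise(function(resolve, reject) {
        var file = fs.createWriteStream(path.join(__dirname, 'test.txt'), {encoding: 'utf8'});
        file.on('error', reject);
        // 'finish' fires once end() has flushed all queued writes
        file.on('finish', resolve);
        arr.forEach(function(item) {
            file.write(item + "\n");
        });
        file.end();
    });
};
The caller can then do fileHandler.writeFile(["1","2","3"]).then(...).catch(...), which should at least surface any error that is currently being swallowed.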

How to make a synchronous module in NodeJS

I already asked this question, but I feel like I didn't ask it properly.
I'm trying to make a little encryption module in NodeJS, but I am having a very hard time with its asynchronous nature.
First off, the result variable in my main file is undefined a millisecond after the script is called; I was expecting that.
Also, the file is processed while the encryption occurs, not before.
Note that I do not wish to encrypt the file itself.
The big question:
How can I make everything run smoothly and in order? :D
www.js
var mod = require('./mymodule.js')
var result = mod.doencrypt();
mymodule.js
module.exports.doencrypt = function() {
    var content = processFile(); //Open a file, increment counter
    var key = generateKey();
    var iv = generateIV();
    var encrypt = doEncryption(content);
    return encrypt;
}

//File manipulation
async function openFile() {
    return new Buffer(await readFile('monolitic.txt', "binary"));
}

async function saveFile(bin) {
    await fs.writeFile("monolitic.txt", bin, "binary", function(err) {
        if (err) {
            console.log(err);
        } else {
            console.log("The monolitic file was saved!");
            return bin;
        }
    });
}

function processFile() {
    console.log("Reading buffer")
    openFile().then(function(bin) {
        monoCounter = bin;
        //Increment
        inc(monoCounter);
        console.log(monoCounter);
        monoCounter = saveMonoCounter(monoCounter);
        return monoCounter;
    }).catch((err) => {
        monoCounter = Buffer.alloc(128);
        saveMonoCounter(Buffer.alloc(128));
    });
}
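The module can't be made truly synchronous, but each step can be awaited in order. A sketch of one way to restructure it with async/await, assuming the helper functions above (generateKey, generateIV, doEncryption, inc, saveMonoCounter) exist:
// mymodule.js
module.exports.doencrypt = async function() {
    var content = await processFile(); // processFile now returns a Promise
    var key = generateKey();
    var iv = generateIV();
    var encrypt = doEncryption(content);
    return encrypt;
};

async function processFile() {
    console.log("Reading buffer");
    var monoCounter = await openFile();
    inc(monoCounter); //Increment
    await saveMonoCounter(monoCounter);
    return monoCounter;
}

// www.js - the caller has to wait for the promise too
var mod = require('./mymodule.js');
mod.doencrypt().then(function(result) {
    console.log(result); // result is only available here
});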

How to upload multiple files to Firebase?

Is there a way to upload multiple files to Firebase Storage? I can upload a single file in a single attempt as follows.
fileButton.addEventListener('change', function(e) {
    //Get file
    var file = e.target.files[0];

    //Create storage reference
    var storageRef = firebase.storage().ref(DirectryPath + "/" + file.name);

    //Upload file
    var task = storageRef.put(file);

    //Update progress bar
    task.on('state_changed',
        function progress(snapshot) {
            var percentage = snapshot.bytesTransferred / snapshot.totalBytes * 100;
            uploader.value = percentage;
        },
        function error(err) {
        },
        function complete() {
            var downloadURL = task.snapshot.downloadURL;
        }
    );
});
How can I upload multiple files to Firebase Storage?
I found the solution to my question above, and I'd like to put it here because it can be useful for others.
//Listen for file selection
fileButton.addEventListener('change', function(e) {
    //Get files
    for (var i = 0; i < e.target.files.length; i++) {
        var imageFile = e.target.files[i];
        uploadImageAsPromise(imageFile);
    }
});

//Handle waiting to upload each file using promise
function uploadImageAsPromise(imageFile) {
    return new Promise(function(resolve, reject) {
        var storageRef = firebase.storage().ref(fullDirectory + "/" + imageFile.name);

        //Upload file
        var task = storageRef.put(imageFile);

        //Update progress bar
        task.on('state_changed',
            function progress(snapshot) {
                var percentage = snapshot.bytesTransferred / snapshot.totalBytes * 100;
                uploader.value = percentage;
            },
            function error(err) {
            },
            function complete() {
                var downloadURL = task.snapshot.downloadURL;
            }
        );
    });
}
Firebase Storage uses Promise, so you can use Promises to achieve it.
Here's the firebase blog article that covers this subject:
Keeping our Promises (and Callbacks)
Give Promise.all() an "Array of Promises"
Promise.all(
    // Array of "Promises"
    myItems.map(item => putStorageItem(item))
)
.then((url) => {
    console.log(`All success`)
})
.catch((error) => {
    console.log(`Some failed: `, error.message)
});
Upload each file and return a Promise
putStorageItem(item) {
    // the return value will be a Promise
    return firebase.storage().ref("YourPath").put("YourFile")
        .then((snapshot) => {
            console.log('One success:', item)
        }).catch((error) => {
            console.log('One failed:', item, error.message)
        });
}
YourPath and YourFile can be carried in the myItems array (hence the item object).
I omitted them here just for readability, but you get the concept.
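For illustration, myItems could carry the path and file for each upload; a hypothetical shape (the path/file property names are my own, not from the original answer):
// Hypothetical items: each carries the storage path and the File object
const myItems = [
    { path: "images/photo-1.jpg", file: file1 },
    { path: "images/photo-2.jpg", file: file2 }
];

function putStorageItem(item) {
    // same pattern as above, with YourPath/YourFile filled in from the item
    return firebase.storage().ref(item.path).put(item.file)
        .then((snapshot) => console.log('One success:', item.path))
        .catch((error) => console.log('One failed:', item.path, error.message));
}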
I believe there's a simpler solution:
// set it up
firebase.storage().ref().constructor.prototype.putFiles = function(files) {
    var ref = this;
    return Promise.all(files.map(function(file) {
        return ref.child(file.name).put(file);
    }));
}

// use it!
firebase.storage().ref().putFiles(files).then(function(metadatas) {
    // Get an array of file metadata
}).catch(function(error) {
    // If any task fails, handle this
});
let ad_images = [
    "file:///data/user/0/..../IMG-20181216-WA00001.jpg",
    "file:///data/user/0/..../IMG-20181216-WA00002.jpg",
    "file:///data/user/0/..../IMG-20181216-WA00003.jpg"
];
let firebase_images = [];
const ref = firebase.firestore().collection('ads').doc(newRecord.id);

putStorageItem = (url, index, ext) => {
    return firebase.storage().ref('YOURFOLDER/' + index + '.' + ext).putFile(url)
        .then((snapshot) => {
            console.log(snapshot)
            firebase_images[index] = snapshot.downloadURL;
            //OR
            //firebase_images.push(snapshot.downloadURL);
        }).catch((error) => {
            console.log('One failed:', error.message)
        });
}

Promise.all(
    ad_images.map(async (item, index) => {
        let ext = item.split('/').pop().split(".").pop();
        console.log(newRecord.id, item, index, ext);
        // pass arguments matching the (url, index, ext) signature
        await putStorageItem(item, index, ext);
    })
)
.then((url) => {
    console.log(`All success`);
    console.log(firebase_images);
})
.catch((error) => {
    console.log(`Some failed: `, error.message)
});
This is a modification of the marked answer, for those looking to wait for each upload to complete before the next one starts.
As the marked answer stands, the promise is never resolved or rejected, so when the upload begins from the loop everything just starts at once: the 1st file, the 2nd, and so on.
Think of 3 uploads of 20 MB each. The loop will call the upload function at almost the same time, making the uploads run almost concurrently.
This answer solves that by using async/await to handle the promises.
fileButton.addEventListener('change', async function(e) {
    //Get files
    for (var i = 0; i < e.target.files.length; i++) {
        var imageFile = e.target.files[i];
        await uploadImageAsPromise(imageFile).then((res) => {
            console.log(res);
        });
    }
});

//Handle waiting to upload each file using promise
async function uploadImageAsPromise(imageFile) {
    return new Promise(function(resolve, reject) {
        var storageRef = firebase.storage().ref(fullDirectory + "/" + imageFile.name);
        var task = storageRef.put(imageFile);

        //Update progress bar
        task.on('state_changed',
            function progress(snapshot) {
                var percentage = snapshot.bytesTransferred / snapshot.totalBytes * 100;
            },
            function error(err) {
                console.log(err);
                reject(err);
            },
            function complete() {
                var downloadURL = task.snapshot.downloadURL;
                resolve(downloadURL);
            }
        );
    });
}
@isuru, who posted the question, provided a great solution above. However, some of the Firebase functions have since been updated, so I have refreshed the solution with the newer Firebase APIs.
//Firebase Storage Reference
const storageRef = firebase.storage().ref();

//Upload Image Function returns a promise
async function uploadImageAsPromise(imageFile) {
    return new Promise(function (resolve, reject) {
        const task = storageRef.child(imageFile.name).put(imageFile);
        task.on(
            "state_changed",
            function progress(snapshot) {
                const percentage = (snapshot.bytesTransferred / snapshot.totalBytes) * 100;
            },
            function error(err) {
                reject(err);
            },
            async function complete() {
                //getDownloadURL returns a promise, which resolves to the image url
                const imageURL = await task.snapshot.ref.getDownloadURL();
                resolve(imageURL);
            }
        );
    });
}

//Handling the files
fileButton.addEventListener('change', function(e) {
    const promises = [];
    for (const file of e.target.files) { //Instead of e.target.files, you could also have your files variable
        promises.push(uploadImageAsPromise(file))
    }

    //Promise.all() waits until all of the promises are resolved
    Promise.all(promises).then((fileURLS) => {
        //Once all the promises are resolved, you get the urls in an array
        console.log(fileURLS)
    })
});
Upload a file & get download URL
export const handleFileUploadOnFirebaseStorage = async (bucketName, file) => {
    // 1. If no file, return
    if (file === "") return "";
    // 2. Put the file into bucketName
    const uploadTask = await storage.ref(`/${bucketName}/${file.name}`).put(file);
    // 3. Get download URL and return it
    return uploadTask.ref.getDownloadURL().then((fileURL) => fileURL);
};
Upload multiple files & get download URL
export const handleFilesUploadOnFirebaseStorage = async (bucketName, files) => {
    // 1. If no files, return
    if (files.length === 0) return [];
    // 2. Create an array to store all download URLs
    let fileUrls = [];
    // 3. Loop over all the files
    for (var i = 0; i < files.length; i++) {
        // 3A. Get a file to upload
        const file = files[i];
        // 3B. handleFileUploadOnFirebaseStorage function is in the section above
        const downloadFileResponse = await handleFileUploadOnFirebaseStorage(bucketName, file);
        // 3C. Push the download url to the URLs array
        fileUrls.push(downloadFileResponse);
    }
    return fileUrls;
};
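Usage might look something like this (a sketch, assuming a configured storage instance and a file input; 'uploads' is a hypothetical bucket folder name):
// Hypothetical file input element
const input = document.querySelector('#fileInput');
input.addEventListener('change', async (e) => {
    const urls = await handleFilesUploadOnFirebaseStorage('uploads', [...e.target.files]);
    console.log('Download URLs:', urls);
});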
All the promises can get messy pretty quickly, so why not use async and await instead?
Here, I have a function that keeps track of all the images selected from the input/file control to be uploaded:
let images = [];
let imagePaths = [];

const trackFiles = (e) => {
    images = [];
    imagePaths = [];
    for (var i = 0; i < e.target.files.length; i++) {
        images.push(e.target.files[i]);
    }
}
And I have another function that will be triggered by a button that the user will click on when ready to do the actual upload:
const uploadFiles = () => {
    const storageRef = storage.ref();
    images.map(async img => {
        let fileRef = storageRef.child(img.name);
        await fileRef.put(img);
        const singleImgPath = await fileRef.getDownloadURL();
        imagePaths.push(singleImgPath);
        if (imagePaths.length == images.length) {
            console.log("got all paths here now: ", imagePaths);
        }
    })
}
We basically loop through each image and perform the upload, pushing each image path into a separate imagePaths array as each upload finishes at its own pace; we then know they are all done by comparing the length of the images array against the length of the paths array.
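Comparing lengths works, but it only reaches the final log if every upload succeeds; an alternative sketch that collects the paths with Promise.all (assuming the same storage reference):
const uploadFiles = async () => {
    const storageRef = storage.ref();
    // map returns one promise per image; Promise.all preserves the order
    const imagePaths = await Promise.all(images.map(async img => {
        const fileRef = storageRef.child(img.name);
        await fileRef.put(img);
        return fileRef.getDownloadURL();
    }));
    console.log("got all paths here now: ", imagePaths);
};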
We can combine multiple promises like this:
Promise.all([promise1, promise2, promise3]).then(function(values) {
    console.log(values);
});
And we can chain promises like this:
return myFirstPromise.then((returnFromFirst) => {
    //Do something
    return secondPromise();
}).then((returnFromSecond) => {
    //Do something
    return thirdPromise();
}).then((returnFromThird) => {
    //All Done
}).catch((e) => {
    console.error("SOMETHING WENT WRONG!!!");
});
The idea is to combine the upload-file promises with Promise.all and chain them together to get the download URLs after each upload:
Promise.all(
    //Array.map creates a new array with the results
    //of calling a function for every array element.
    //In this case, an array of "Promises"
    this.state.filesToUpload.map(item =>
        this.uploadFileAsPromise(item))
)
.then(url => {
    console.log(`All success`);
    //Handle success of all image uploads
})
.catch(error => {
    console.log(`Some failed: `, error.message);
    //Handle failure of some/all image uploads
});

//return a promise which uploads the file & gets the download URL
uploadFileAsPromise(imageFile) {
    // the return value will be a Promise
    return storageRef
        .child("images/users/" + imageFile.name)
        .put(imageFile.file)
        .then(snapshot => {
            console.log("Uploaded File:", imageFile.name);
            return snapshot.ref.getDownloadURL().then(downloadURL => {
                //promise inside promise to get the downloadable URL
                console.log("File available at", downloadURL);
                return downloadURL;
            });
        })
        .catch(error => {
            console.log("Upload failed:", imageFile.name, error.message);
        });
}
This was a breeze to implement with RxJS's switchMap and combineLatest for AngularFire.
