Node does not wait for loop to complete - javascript

I have tried async/await and promises, but I cannot get this code to execute in order.
The code iterates through the documents in a folder, parses each one, pushes the result to an array, and then saves the array to a .json file.
However, the code continues past the loop before the parsing finishes, which means it writes an empty file.
Turning the loop into an async function and awaiting it does not solve the issue, nor does returning a promise and using .then() to run the final code. The final code still runs straight away.
const fs = require('fs');
const cheerio = require('cheerio');
const mammoth = require("mammoth");

const articleFolder = './Articles/';
var allArticles = [];

const extractDocuments = async () => {
  let files = fs.readdirSync(articleFolder);
  for (const file of files) {
    await convertToHTML(file);
  }
  completedExtraction();
}

async function convertToHTML(filename) {
  var filepath = articleFolder + filename;
  mammoth.convertToHtml({path: filepath})
    .then(function(result) {
      let html = result.value;        // The generated HTML
      let messages = result.messages; // Any messages, such as warnings during conversion
      updateArticles(filename, html);
    })
    .done();
}

function updateArticles(filename, html) {
  var article = {
    file: filename,
    content: parseHTML(html)
  }
  allArticles.push(article);
}

function parseHTML(html) {
  let $ = cheerio.load(html);
  let title = $('h3').first().text();
  let date = $('h3:eq(1)').text();
  $('h3').slice(0, 2).remove()
  let content = $('body').html();
  let parsedArticle = {
    title: title,
    date: date,
    content: content
  }
  return parsedArticle;
}

function completedExtraction() {
  fs.writeFile('./articles.json', JSON.stringify(allArticles), (err) => {
    if (err) throw err;
    console.log('File Written.');
  });
  console.log('Finished.');
}

extractDocuments();

To solve this with map, I would do something similar to:
const extractDocuments = async () => {
  let files = fs.readdirSync(articleFolder);
  const articlePromises = files.map(async file => {
    const html = await convertToHTML(file)
    return {
      filename: file,
      html: html
    }
  })
  allArticles = await Promise.all(articlePromises)
  completedExtraction();
}

async function convertToHTML(filename) {
  var filepath = articleFolder + filename;
  // Return the promise chain itself; .done() is dropped here because it does not
  // return the promise, which would break the await in extractDocuments.
  return mammoth.convertToHtml({path: filepath})
    .then(function(result) {
      let html = result.value;        // The generated HTML
      let messages = result.messages; // Any messages, such as warnings during conversion
      return html
    });
}
To wrap up: extractDocuments uses map to iterate over the files and build the article promises, convertToHTML only returns the generated HTML and nothing more, and updateArticles is no longer needed since that work now happens inside extractDocuments.
Hope this helps and points you in the right direction.
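For completeness, the original for...of loop also works once convertToHTML actually returns the mammoth promise; in the question the await resolved immediately because nothing was returned. A minimal sketch of that fix, keeping the question's updateArticles helper:

const extractDocuments = async () => {
  const files = fs.readdirSync(articleFolder);
  for (const file of files) {
    // This await only pauses the loop because convertToHTML now returns a promise.
    await convertToHTML(file);
  }
  completedExtraction();
};

async function convertToHTML(filename) {
  const result = await mammoth.convertToHtml({ path: articleFolder + filename });
  updateArticles(filename, result.value);
}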


Check array after async task finishes

Today I'm having an issue with an async task while using JSZip.
I want to check the contents of an array after the async task executed by JSZip ends.
I have a zip that contains one XML file; I read it, grab specific nodes, and store them in another list to work with later. My issue is that the check on the array runs before the XML file has been read, so the array is still empty at that point.
I tried several ways to make it work, but without success so far.
fileElement.addEventListener('change', (e) => {
  try {
    var zip = new JSZip();
    zip.loadAsync(fileElement.files[0])
      .then(function(zip) {
        let xmlfiles = []
        const _ziptask = async () => {
          for (let [filename, file] of Object.entries(zip.files)) {
            if (filename.includes("file.xml")) {
              file.async("string").then(function (data) {
                let xmlDoc = new DOMParser().parseFromString(data, "text/xml");
                let metaInputs = [...xmlDoc.querySelectorAll("file")];
                xmlfiles = metaInputs.filter(_node => null != _node.getAttribute('src'));
                console.log("FILE.XML LOOP ENDED")
              });
            }
          }
        }
        async () => {
          await _ziptask().then(() => {
            console.log("CHECKING FILE.XML ARRAY ")
            console.log(xmlfiles)
          })
        }
      }, function() { console.error("ERROR: NOT ZIP FILE") });
  } catch (error) {
    restoreFileInput("Something went wrong, try it again later")
  }
});
After testing different things, I reached the goal by collecting the promises in an array and using Promise.all, which resolves once all of the promises have resolved successfully.
It's curious that in the example I read, the promises are stored in a const declaration instead of var or let (const only prevents reassigning the variable; you can still push into the array it holds).
Anyway, if someone wants to see the result:
fileElement.addEventListener('change', (e) => {
  try {
    var zip = new JSZip();
    zip.loadAsync(fileElement.files[0])
      .then(function(zip) {
        let xmlfiles = []
        const promises = [];
        for (let [filename, file] of Object.entries(zip.files)) {
          if (filename.includes("file.xml")) {
            promises.push(file.async("string").then(function (data) {
              let xmlDoc = new DOMParser().parseFromString(data, "text/xml");
              let metaInputs = [...xmlDoc.querySelectorAll("file")];
              xmlfiles = metaInputs.filter(_node => null != _node.getAttribute('src'));
              console.log("FILE.XML LOOP ENDED")
            }));
          }
        }
        Promise.all(promises).then(function () {
          console.log("CHECKING FILE.XML ARRAY ")
          console.log(xmlfiles)
        });
      }, function() { console.error("ERROR: NOT ZIP FILE") });
  } catch (error) {
    restoreFileInput("Something went wrong, try it again later")
  }
});
Thanks to the people who commented previously for the help.
Best regards.
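As a small follow-up, a variant of the same idea is to have each promise resolve to its parsed result and let Promise.all collect them, instead of assigning to xmlfiles from inside each .then(). A sketch, assuming the same JSZip and DOMParser usage as above:

zip.loadAsync(fileElement.files[0]).then(function (zip) {
  const promises = Object.entries(zip.files)
    .filter(([filename]) => filename.includes("file.xml"))
    .map(([, file]) =>
      file.async("string").then(function (data) {
        const xmlDoc = new DOMParser().parseFromString(data, "text/xml");
        // Keep only <file> nodes that have a src attribute, as in the original code.
        return [...xmlDoc.querySelectorAll("file")].filter(node => node.getAttribute('src') != null);
      })
    );

  Promise.all(promises).then(function (results) {
    const xmlfiles = results.flat(); // one array with all matching nodes
    console.log("CHECKING FILE.XML ARRAY");
    console.log(xmlfiles);
  });
});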

Making any reference to Nodejs' process.argv causes errors in unexpected place (reading a file)

I am writing code that generates a very large JSON object, saves it to a file, then loads the file and inserts the data into a Mongo collection. I want to pass a string from the command line when calling the script, and I use it to set both the file name and the collection name. I call it like so: node --max-old-space-size=8192 data_generator.js foo 1000000.
The code fails with the error ENOENT: no such file or directory, open 'foo.json' on the third line of the function gen_collection(), where I set the variable data. This error does not appear when a file foo.json already exists, even if it is empty. Before it fails, the code successfully creates the file foo.json, but it contains only an empty array [].
The code fails with this same error whenever I include any reference to process.argv, including when I set any variable to a value from the process.argv array. The code works when I hard-code the variables as const fname = "foo" and const size = 0. However, even if the only reference to process.argv is in a console.log, i.e. adding console.log(process.argv[2]) to main(), it fails with the exact same error as above.
Here is the code I am trying to run:
const { MongoClient } = require("mongodb");
const fs = require('fs');
const bjson = require('big-json');

async function main() {
  const uri = "my db uri";
  const client = new MongoClient(uri);
  const fname = process.argv[2];
  const size = parseInt(process.argv[3]);
  // const fname = 'small'
  // const size = 1
  try {
    await client.connect({ useUnifiedTopology: true });
    await write_data_to_disk(fname, size);
    await gen_collection(client, fname);
  } catch (e) {
    console.error(e);
  } finally {
    await client.close();
  }
};

// generate data as a json array and write to local file
async function write_data_to_disk(fname, size) {
  let arr = [];
  for (let i = 0; i < size; i++) {
    let doc = gen_document();
    arr.push(doc);
  }
  const strStream = bjson.createStringifyStream({
    body: arr
  })
  let logger = fs.createWriteStream(`${fname}.json`);
  strStream.on('data', (d) => {
    logger.write(d);
  })
};

async function gen_collection(client, fname) {
  let db = client.db('test');
  let collection = db.collection(fname);
  let data = JSON.parse(fs.readFileSync(`${fname}.json`, 'utf8')); // ERROR APPEARS ON THIS LINE
  bulkUpdateOps = [];
  data.forEach((doc) => {
    bulkUpdateOps.push({"insertOne": {"document": doc}});
    if (bulkUpdateOps.length === 1000) {
      collection.bulkWrite(bulkUpdateOps);
      bulkUpdateOps = [];
    }
  })
  if (bulkUpdateOps.length > 0) {
    collection.bulkWrite(bulkUpdateOps);
  }
};

function gen_document() {
  // returns json object
};
You're doing
await write_data_to_disk(...)
but that function doesn't return a promise that is tied to when it's done. So you're trying to read the resulting file BEFORE it has been created, or before it has valid content, hence the ENOENT error: the file doesn't yet exist when the next function tries to read from it.
Write streams do not play nicely with promises unless you wrap them in your own promise that resolves when you are completely done writing to the stream and the file has been closed.
Also, you probably want to just .pipe() strStream to the destination write stream (logger in your code). That is much easier, and you can then monitor when that pipe() operation is done to resolve the promise you wrap around the operation.
You can promisify write_data_to_disk() like this:
// generate data as a json array and write to local file
function write_data_to_disk(fname, size) {
  return new Promise((resolve, reject) => {
    const arr = [];
    for (let i = 0; i < size; i++) {
      let doc = gen_document();
      arr.push(doc);
    }
    const strStream = bjson.createStringifyStream({ body: arr });
    const dest = fs.createWriteStream(`${fname}.json`, { emitClose: true });
    // monitor for completion and errors
    dest.on('error', reject).on('close', resolve);
    strStream.on('error', reject);
    // pipe all the content from strStream to the dest writeStream
    strStream.pipe(dest);
  });
}
Since this returns a promise that is truly tied to when the write operation is done, you can then use await write_data_to_disk(...).
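On Node versions that include the promise-based stream helpers (stream/promises shipped in v15), a sketch of the same function using stream.pipeline could look like this, assuming bjson's stringify stream behaves like a standard readable stream:

const { pipeline } = require('stream/promises');

// generate data as a json array and write to local file
async function write_data_to_disk(fname, size) {
  const arr = [];
  for (let i = 0; i < size; i++) {
    arr.push(gen_document());
  }
  // pipeline resolves once the write stream has finished and rejects on errors in either stream
  await pipeline(
    bjson.createStringifyStream({ body: arr }),
    fs.createWriteStream(`${fname}.json`)
  );
}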

Push into an array from foreach and make it available outside foreach

I'm stuck: I loop through an array whose values come from a promise and push values into a new array, but I need that new array to be available outside the forEach.
What I have:
app.post('/submit', function (req, res) {
  uploadPics(req, res, function (err) {
    if (err instanceof multer.MulterError) {
      res.send(JSON.stringify({UploadResult: err.message}));
      console.log(err.message + ' ' + 'Redirect /home');
    } else if (err) {
      console.log(err);
    } else {
      res.send(JSON.stringify({UploadResult: 'Success'}));
      var filesarray = req.files;
      var picinfos = [];
      filesarray.forEach(function(file) {
        GetFileMetaInfo.filemetainfo(file.path).then(function (metadata) {
          // Stuck here! Can push values into an array (picinfos) but it is only available inside the forEach, not outside.
        })
      })
      // I need the picinfos array here....
    }
  })
})
How I receive my metadata:
var exif = require('exif-parser');
var fs = require('fs');

exports.filemetainfo = function (filepath) {
  return new Promise((resolve) => {
    var file = filepath;
    var buffer = fs.readFileSync(file);
    var parser = exif.create(buffer);
    var result = parser.parse();
    resolve(result);
  }).then(function (metadata) {
    if (metadata.tags.CreateDate !== undefined) {
      date = new Date(metadata.tags.CreateDate * 1000);
      datevalues = [
        date.getFullYear(),
        date.getMonth() + 1,
        date.getDate(),
        date.getHours(),
        date.getMinutes(),
        date.getSeconds(),
      ];
      CreateDate = date.getFullYear() + '-' + (date.getMonth() + 1) + '-' + date.getDate();
      CreateTime = date.getHours() + ':' + date.getMinutes() + ':' + date.getSeconds();
      console.log("CrDate:" + CreateDate, "CrTime:" + CreateTime);
    } else {
      console.log("No Metadata Creation Infos found in " + filepath);
      CreateDate = "";
      CreateTime = "";
    }
    if (metadata.tags.GPSLatitude !== undefined) {
      GPSLat = metadata.tags.GPSLatitude;
      GPSLon = metadata.tags.GPSLongitude;
      console.log("GPSLat:" + GPSLat, "GPSLon:" + GPSLon);
    } else {
      console.log("No Metadata GPS Infos found in " + filepath);
      GPSLat = "";
      GPSLon = "";
    }
    return MetaData = {
      GPSLat: GPSLat,
      GPSLon: GPSLon,
      CreateDate: CreateDate,
      CreateTime: CreateTime,
    }
  })
}
May I ask someone to give me a hand? How can I make my array available outside the forEach? Thank you very much!
The reason you're getting an empty array at the end of the forEach is that GetFileMetaInfo.filemetainfo() returns a promise, and forEach won't wait for async actions.
You could use async/await with a for...of loop to get your desired result.
app.post('/submit', function (req, res) {
  uploadPics(req, res, async function (err) { // note async here
    if (err instanceof multer.MulterError) {
      res.send(JSON.stringify({UploadResult: err.message}));
      console.log(err.message + ' ' + 'Redirect /home');
    } else if (err) {
      console.log(err);
    } else {
      res.send(JSON.stringify({UploadResult: 'Success'}));
      var filesarray = req.files;
      var picinfos = [];
      for (let file of filesarray) {
        const metadata = await GetFileMetaInfo.filemetainfo(file.path);
        // push metadata into your array here
        picinfos.push(metadata);
      }
      // You will have picinfos here
    }
  })
})
Although the question has already been answered by Dinesh Pandiyan, there are still some adjustments that can be made. The following code from his answer runs sequentially, meaning that each async request is only made after the previous result has resolved.
for (let file of filesarray) {
  const metadata = await GetFileMetaInfo.filemetainfo(file.path);
  //               ^- pauses the execution of the current running code
  // push metadata into your array here
  picinfos.push(metadata);
}
async call #1 ╌╌await╌╌> async call #2 ╌╌await╌╌> async call #3 ╌╌await╌╌> result
You could make the code concurrent by first starting all the async calls and then waiting until all the results have resolved. This can be done by changing the following:
// start all the async functions first, reducing the wait time
for (let file of filesarray) {
  const metadata = GetFileMetaInfo.filemetainfo(file.path);
  //               ^- remove the await
  // push the pending promise into your array here
  picinfos.push(metadata);
}
// wait for all results to be resolved
picinfos = await Promise.all(picinfos);
//         ^- instead await here
async call #1 ╌╌┐
async call #2 ╌╌┼╌╌await all╌╌> result
async call #3 ╌╌┘
The above can be simplified further by using Array.map() in combination with the Promise.all() already shown.
var filesarray = req.files;
var picinfos = await Promise.all(filesarray.map(file => {
  return GetFileMetaInfo.filemetainfo(file.path);
}));
// picinfos should be present
Or if you want to avoid working with async/await:
var filesarray = req.files;
Promise.all(filesarray.map(file => {
  return GetFileMetaInfo.filemetainfo(file.path);
})).then(picinfos => {
  // picinfos should be present
});
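One caveat with Promise.all: if reading the metadata fails for any single file, the whole thing rejects and you get no results at all. If partial results are acceptable, a sketch using Promise.allSettled (Node 12.9+), again inside the async callback:

var filesarray = req.files;
const outcomes = await Promise.allSettled(filesarray.map(file => {
  return GetFileMetaInfo.filemetainfo(file.path);
}));
// keep only the files whose metadata could be read
var picinfos = outcomes
  .filter(outcome => outcome.status === 'fulfilled')
  .map(outcome => outcome.value);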

Nodejs: Loop Through File and Parse PDFs using Callback in Sync

I am new to Node, so I am struggling quite a bit with its async nature.
I am trying to create a script that will parse the PDFs inside a directory and output them in txt format in another directory.
To do this, I am using the fs and pdf2json npm packages. I pass the parseData function as a callback to the loopingFiles function. The only problem I am having is the async nature of Node.
It loops through all the files at the same time, and the output ends up as a jumbled mess in the last file's index.
I would like to process this synchronously, so that it waits until the data has finished parsing and been written to the txt file before looping again.
I have tried promises but to no avail. Any help would be much appreciated!
var fs = require('fs'),
    PDFParser = require("pdf2json");

let pdfParser = new PDFParser(this, 1);

var parseData = function(pdf, index) {
  txtFile = "/Users/janet/node/pdf/Destination/".concat(index.toString().concat(".txt"))
  pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);
  pdfParser.loadPDF(pdfFile);
  // Parsing the pdf file in question
  pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
  pdfParser.on("pdfParser_dataReady", pdfData => {
    fs.writeFile(txtFile, pdfParser.getRawTextContent());
  });
};

var loopingFiles = function(callback) {
  fs.readdir("/Users/janet/node/pdf/Source", function (err, files) {
    if (err) {
      console.log(err);
    } else {
      files.forEach(function(file, index) {
        callback(file, index);
      });
    };
  });
};

loopingFiles(parseData);
Something like this?
var fs = require("fs"),
    PDFParser = require("pdf2json");

let pdfParser = new PDFParser(this, 1);

var parseData = function(pdfs, index = 0) {
  // finished
  if (index >= pdfs.length) return;
  let pdf = pdfs[index];
  txtFile = "/Users/janet/node/pdf/Destination/".concat(
    index.toString().concat(".txt")
  );
  pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);
  // Parsing the pdf file in question
  pdfParser.on("pdfParser_dataError", errData => {
    console.error(errData.parserError)
    // not sure if you want to call this here to keep going or not
    parseData(pdfs, index + 1);
  });
  pdfParser.on("pdfParser_dataReady", pdfData => {
    fs.writeFile(txtFile, pdfParser.getRawTextContent(), function() {
      // when we're all done, call this function again, with the index of the next pdf
      parseData(pdfs, index + 1);
    });
  });
  pdfParser.loadPDF(pdfFile);
};

var loopingFiles = function(callback) {
  fs.readdir("/Users/janet/node/pdf/Source", function(err, files) {
    if (err) {
      console.log(err);
    } else {
      callback(files, 0);
    }
  });
};

loopingFiles(parseData);
The main difference is passing the whole array of PDFs to the function together with an index, and only calling the function again with an incremented index once the current file has completed.
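One thing to watch out for with this recursive approach: it registers new "pdfParser_dataError" and "pdfParser_dataReady" listeners on the same pdfParser instance on every call, so handlers from earlier files fire again for later ones. A sketch that sidesteps this by creating a fresh parser per file and wrapping each parse in a promise (the paths are the same hypothetical ones from the question):

const fs = require("fs");
const PDFParser = require("pdf2json");

function parseOnePdf(pdfFile, txtFile) {
  return new Promise((resolve, reject) => {
    const parser = new PDFParser(null, 1); // fresh instance per file; 1 enables raw text, as in the question
    parser.on("pdfParser_dataError", errData => reject(errData.parserError));
    parser.on("pdfParser_dataReady", () => {
      fs.writeFile(txtFile, parser.getRawTextContent(), err => (err ? reject(err) : resolve()));
    });
    parser.loadPDF(pdfFile);
  });
}

async function run() {
  const files = fs.readdirSync("/Users/janet/node/pdf/Source");
  for (const [index, file] of files.entries()) {
    // one file at a time, in order
    await parseOnePdf(
      "/Users/janet/node/pdf/Source/" + file,
      "/Users/janet/node/pdf/Destination/" + index + ".txt"
    );
  }
}

run().catch(console.error);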

How to use ES8 async/await with streams?

In https://stackoverflow.com/a/18658613/779159 there is an example of how to calculate the hash of a file using the built-in crypto library and streams.
var fs = require('fs');
var crypto = require('crypto');

// the file you want to get the hash
var fd = fs.createReadStream('/some/file/name.txt');
var hash = crypto.createHash('sha1');
hash.setEncoding('hex');

fd.on('end', function() {
  hash.end();
  console.log(hash.read()); // the desired sha1sum
});

// read all file and pipe it (write it) to the hash object
fd.pipe(hash);
But is it possible to convert this to use ES8 async/await instead of the callback seen above, while still keeping the efficiency of using streams?
The await keyword only works on promises, not on streams. There are ideas to make an extra stream-like data type that would get its own syntax, but those are highly experimental, if they exist at all, and I won't go into details.
Anyway, your callback is only waiting for the end of the stream, which is a perfect fit for a promise. You'd just have to wrap the stream:
var fd = fs.createReadStream('/some/file/name.txt');
var hash = crypto.createHash('sha1');
hash.setEncoding('hex');
// read all file and pipe it (write it) to the hash object
fd.pipe(hash);

var end = new Promise(function(resolve, reject) {
  hash.on('end', () => resolve(hash.read()));
  fd.on('error', reject); // or something like that. might need to close `hash`
});
There also exists a helper function that does just that in more recent versions of Node.js - pipeline from the stream/promises module:
import { pipeline } from 'node:stream/promises';

const fd = fs.createReadStream('/some/file/name.txt');
const hash = crypto.createHash('sha1');
hash.setEncoding('hex');
// read all file and pipe it (write it) to the hash object
const end = pipeline(fd, hash);
// note: unlike the manual wrapper above, pipeline resolves with no value, so read the digest with hash.read() after awaiting
Now you can await that promise:
(async function() {
  let sha1sum = await end;
  console.log(sha1sum);
}());
If you are using node version >= v10.0.0 then you can use stream.pipeline and util.promisify.
const fs = require('fs');
const crypto = require('crypto');
const util = require('util');
const stream = require('stream');

const pipeline = util.promisify(stream.pipeline);

const hash = crypto.createHash('sha1');
hash.setEncoding('hex');

async function run() {
  await pipeline(
    fs.createReadStream('/some/file/name.txt'),
    hash
  );
  console.log('Pipeline succeeded');
}

run().catch(console.error);
Node v15 now has a promisified pipeline in stream/promises.
This is the cleanest and most official way.
const { pipeline } = require('stream/promises');

async function run() {
  await pipeline(
    fs.createReadStream('archive.tar'),
    zlib.createGzip(),
    fs.createWriteStream('archive.tar.gz')
  );
  console.log('Pipeline succeeded.');
}

run().catch(console.error);
We should all appreciate how much work it does here:
Captures errors in all the streams.
Destroys unfinished streams when an error is raised.
Only returns when the last writable stream is finished.
This pipe mechanism is one of the most powerful features Node.js has. Making it fully async is not easy; now we have it.
Something like this works:
for (var res of fetchResponses) { // node-fetch package responses
  const dest = fs.createWriteStream(filePath, {flags: 'a'});
  totalBytes += Number(res.headers.get('content-length'));
  await new Promise((resolve, reject) => {
    res.body.pipe(dest);
    res.body.on("error", (err) => {
      reject(err);
    });
    dest.on("finish", function() {
      resolve();
    });
  });
}
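Assuming res.body from node-fetch is a standard Node readable stream (it is in node-fetch v2), the same loop could lean on the promisified pipeline instead of the hand-rolled promise; a sketch:

const { pipeline } = require('stream/promises');

for (const res of fetchResponses) {
  totalBytes += Number(res.headers.get('content-length'));
  // pipeline wires up error handling on both streams and resolves when the write finishes
  await pipeline(res.body, fs.createWriteStream(filePath, { flags: 'a' }));
}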
2021 Update:
New example from Node documentation:
async function print(readable) {
  readable.setEncoding('utf8');
  let data = '';
  for await (const chunk of readable) {
    data += chunk;
  }
  console.log(data);
}
see https://nodejs.org/api/stream.html#stream_readable_symbol_asynciterator
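Applied to the original hashing question, async iteration gives a compact version with no explicit events or pipes; a sketch, feeding each chunk to the hash as it arrives:

const fs = require('fs');
const crypto = require('crypto');

async function sha1OfFile(path) {
  const hash = crypto.createHash('sha1');
  // the read stream is an async iterable, so the loop awaits each chunk
  for await (const chunk of fs.createReadStream(path)) {
    hash.update(chunk);
  }
  return hash.digest('hex');
}

sha1OfFile('/some/file/name.txt').then(console.log).catch(console.error);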
I would comment, but don't have enough reputation.
A WORD OF CAUTION:
If you have an application that is passing streams around AND doing async/await, be VERY CAREFUL to connect ALL pipes before you await. You can end up with streams not containing what you thought they did. Here's a minimal example:
const { PassThrough } = require('stream');

async function main() {
  const initialStream = new PassThrough();

  const otherStream = new PassThrough();
  const data = [];
  otherStream.on('data', dat => data.push(dat));
  const resultOtherStreamPromise = new Promise(resolve =>
    otherStream.on('end', () => { resolve(Buffer.concat(data)) }));

  const yetAnotherStream = new PassThrough();
  const data2 = [];
  yetAnotherStream.on('data', dat => data2.push(dat));
  const resultYetAnotherStreamPromise = new Promise(resolve =>
    yetAnotherStream.on('end', () => { resolve(Buffer.concat(data2)) }));

  initialStream.pipe(otherStream);
  initialStream.write('some ');

  await Promise.resolve(); // Completely unrelated await

  initialStream.pipe(yetAnotherStream);
  initialStream.end('data');

  const [resultOtherStream, resultYetAnotherStream] = await Promise.all([
    resultOtherStreamPromise,
    resultYetAnotherStreamPromise,
  ]);

  console.log('other stream:', resultOtherStream.toString());            // other stream: some data
  console.log('yet another stream:', resultYetAnotherStream.toString()); // yet another stream: data
}

main();
I believe this will be helpful for someone:
async function readFile(filename) {
  let records = []
  return new Promise(resolve => {
    fs.createReadStream(filename)
      .on("data", (data) => {
        records.push(data);
      })
      .on("end", () => {
        resolve(records)
      });
  })
}
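One caveat with that last snippet: nothing listens for 'error', so a missing or unreadable file leaves the promise pending forever. A sketch of the same helper with a reject path:

const fs = require('fs');

function readFile(filename) {
  return new Promise((resolve, reject) => {
    const records = [];
    fs.createReadStream(filename)
      .on('data', chunk => records.push(chunk))
      .on('error', reject) // reject instead of hanging when the read fails
      .on('end', () => resolve(records));
  });
}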
