I regularly get stuck on async I/O issues and always seem to come up with clunky solutions. See the snippet below for my current challenge.
Purpose: AWS Lambda function which reads the contents of a DynamoDB table and writes it to a file stored in S3. This Lambda function gets called whenever there is an update to the DynamoDB table.
Problem: see the commented-out section of code in the middle of the function onScan? That handles the case where it takes multiple calls to dynDoc.scan() to deliver the entire contents of the DynamoDB table, since scan results are paginated (about 100 rows per call in my case). Ideally, though, the S3 file would be written once, after the entire contents of the table have been delivered, not on every call to dynDoc.scan() as the code is currently constructed. This is a challenge with the asynchronous I/O to DynamoDB and the onScan callback. In addition, I clear the variable fileContents each time onScan executes because, if you invoke this Lambda function twice within about 5 minutes, the container stays warm and the global fileContents would otherwise accumulate two copies of the table.
One idea I have is to initialize a counter before the initial call to dynDoc.scan(), increment it for each subsequent call to dynDoc.scan(), decrement it after the commented-out section of code, and test for it to be zero before writing out the S3 file and clearing fileContents.
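Roughly, a sketch of what I mean (untested, and it still feels clunky to me):

var pending = 0;
function onScan(err, data) {
    // ... accumulate fileContents from data.Items as above ...
    if (typeof data.LastEvaluatedKey != "undefined") {
        pending++; // another scan call is now outstanding
        dynParamsDoc.ExclusiveStartKey = data.LastEvaluatedKey;
        dynDoc.scan(dynParamsDoc, onScan);
    }
    if (--pending === 0) {
        // all scan calls have completed: write the S3 file, then clear fileContents
    }
}
pending++; // count the initial call
dynDoc.scan(dynParamsDoc, onScan);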
Is there a more elegant solution? Something more idiomatic JavaScript?
Note that Lambda currently supports Node.js version 8.10.
Thank you for looking at this!
'use strict';
var AWS = require("aws-sdk");
AWS.config.update({ region: "us-east-1" });
var s3 = new AWS.S3();
var s3Params = {
Body: "testing",
Bucket: "my-bucket",
Key: "my-file.csv"
};
var dyn = new AWS.DynamoDB();
var dynDoc = new AWS.DynamoDB.DocumentClient();
var dynParamsDoc = { TableName: "MyTable" };
var itemCount = 0;
var fileContents = "";
exports.handler = (event, context, callback) => {
function onScan(err,data) {
if (err) {
console.error("Unable to scan Dynamodb.\nError JSON:",
JSON.stringify(err, null, 2));
} else {
fileContents = ""; // added, because it was not getting cleared
data.Items.forEach((entry) => {
fileContents += entry.ClientName + "," + entry.ClientAbbrev + "\n";
});
// eventually, we should really loop on partial DynamoDB table transfers:
// if (typeof data.LastEvaluatedKey != "undefined") {
// console.log("Scanning for more...");
// dynParamsDoc.ExclusiveStartKey = data.LastEvaluatedKey;
// dynDoc.scan(dynParamsDoc, onScan);
// }
// Save S3 file
s3Params.Body = fileContents;
s3.putObject(s3Params, function(err,data) {
if (err) console.log(err,err.stack);
else console.log(data);
});
};
};
// Now retrieve the entire table from DynamoDB and write it to a file
dynDoc.scan(dynParamsDoc, onScan);
callback(null, "Successfully processed table.");
};
In addition, I clear the variable fileContents each time onScan is executed
That, I think, is the problem. You should not need to clear it, because you should not have used a global (module-scoped, static) variable in the first place. Declare and initialise var fileContents = ""; inside the exports.handler function, not in onScan. With that fixed, I would expect your commented-out approach to work:
var AWS = require("aws-sdk");
AWS.config.update({ region: "us-east-1" });
var s3 = new AWS.S3();
var dyn = new AWS.DynamoDB();
var dynDoc = new AWS.DynamoDB.DocumentClient();
exports.handler = (event, context, callback) => {
var s3Params = {
Body: "testing",
Bucket: "my-bucket",
Key: "my-file.csv"
};
var dynParamsDoc = { TableName: "MyTable" };
var fileContents = "";
function onScan(err,data) {
if (err) {
callback("Unable to scan Dynamodb.\nError JSON:",
JSON.stringify(err, null, 2));
} else {
data.Items.forEach((entry) => {
fileContents += entry.ClientName + "," + entry.ClientAbbrev + "\n";
});
if (typeof data.LastEvaluatedKey != "undefined") {
console.log("Scanning for more...");
dynParamsDoc.ExclusiveStartKey = data.LastEvaluatedKey;
dynDoc.scan(dynParamsDoc, onScan);
} else {
// Save S3 file
s3Params.Body = fileContents;
s3.putObject(s3Params, function(err,data) {
if (err) {
callback(err);
} else {
console.log(data);
callback(null, "Successfully processed table.");
}
});
}
}
}
// Now retrieve the entire table from DynamoDB and write it to a file
dynDoc.scan(dynParamsDoc, onScan);
};
Is there a more elegant solution? Something more idiomatic JavaScript?
Yes, a modern approach would use promises with async/await:
var AWS = require("aws-sdk");
AWS.config.update({ region: "us-east-1" });
var s3 = new AWS.S3();
var dyn = new AWS.DynamoDB();
var dynDoc = new AWS.DynamoDB.DocumentClient();
exports.handler = async (event, context) => {
var dynParamsDoc = { TableName: "MyTable" };
var fileContents = "";
do {
var data = await dynDoc.scan(dynParamsDoc).promise();
for (var entry of data.Items) {
fileContents += entry.ClientName + "," + entry.ClientAbbrev + "\n";
}
dynParamsDoc.ExclusiveStartKey = data.LastEvaluatedKey;
} while (typeof data.LastEvaluatedKey != "undefined");
var s3Params = {
Bucket: "my-bucket",
Key: "my-file.csv",
Body: fileContents,
};
var res = await s3.putObject(s3Params).promise();
console.log(res);
return "Successfully processed table.";
};
Based only on your code (i.e. without being able to assert the general architecture here), you can pass the partial content along to the recursive call:
// one more arg!
function onScan(err, data, memory = []) {
if (err) {
console.error(...);
return callback(err); // see Bergi's comment on your post
}
// add current data to our "global" data
memory.push.apply(memory, data.Items);
// in case there's more...
if (typeof data.LastEvaluatedKey !== "undefined") {
dynParamsDoc.ExclusiveStartKey = data.LastEvaluatedKey;
// ...pass the "global" data to next scan, and stop here
return dynDoc.scan(dynParamsDoc, (err, res) => {
onScan(err, res, memory);
});
}
// if we got here, we have no more data to fetch, so we address S3 now
s3Params.Body = memory.map((row) => {
return `${row.ClientName},${row.ClientAbbrev}`;
}).join("\n") + "\n"; // that last \n to exactly reproduce your behavior
s3.putObject(s3Params, function(err, data) {
if (err) console.log(err, err.stack);
else console.log(data);
callback(err, "Successfully processed table."); // see Bergi's comment on your post
});
}
dynDoc.scan(dynParamsDoc, onScan);
I'm trying to create a Lambda function to generate a .js script (in order to use it with Chart.js).
This script sends a query to a table in DynamoDB and outputs the results to a .js file (which is stored in an S3 bucket).
I've tried for many hours to make it work, but I'm stuck on the classic Node.js problems: callback ordering and variable scope.
Here is the code I used:
var AWS = require('aws-sdk');
AWS.config.update({region: 'eu-west-1'});
var s3 = new AWS.S3();
var tweetValue ;
var neutralValue ;
var destBucket = "twitterappfront1";
var ddb = new AWS.DynamoDB.DocumentClient({apiVersion: '2012-08-10'});
function sentimentVal(inputparams) {
// function resultrequest()
ddb.get(inputparams, function(err, data) {
if (err) {
console.log("Error", err);
} else {
console.log("Success", data.Item);
//Fetch the tweet count from the DynamoDB table and store it in a descriptor
var numtweets = (JSON.parse(JSON.stringify(AWS.DynamoDB.Converter.marshall(data.Item)))).tweets ;
var tweetsObject = Object.getOwnPropertyDescriptor(numtweets, 'N') ;
tweetValue = tweetsObject.value ;
console.log ("test stringify = ", numtweets) ;
console.log (tweetsObject.value) ;
console.log ("Value = ", tweetValue) ;
return tweetValue ;
}
});
}
exports.handler = (event) => {
// Read options from the event.
var paramsNeutral = {
TableName: 'twitterSentiment',
Key: { 'sentiment':'NEUTRAL' }
};
// Call sentimentVal function with paramsNeutral, and setNeutralValue callback function
//
sentimentVal(paramsNeutral, setNeutralValue);
function setNeutralValue (error, tweetValue) {
if (error) console.error('ERROR !', error) ;
else console.log ('callback tweetValue = ', tweetValue) ;
}
};
My problem is that the callback function seems never to be used: I get no console output for "ERROR" or "callback tweetValue =".
And I don't understand how to capture the value from the sentimentVal function. I tried a "return", but I don't know if that is the right way.
Can you please help me?
Thank you
You are not waiting for the DynamoDB call to finish.
Update sentimentVal to return a promise and use async/await:
async function sentimentVal(inputparams) {
try {
// function resultrequest()
const data = await ddb.get(inputparams).promise()
console.log("Success", data.Item);
//Fetch the tweet count from the DynamoDB table and store it in a descriptor
var numtweets = (JSON.parse(JSON.stringify(AWS.DynamoDB.Converter.marshall(data.Item)))).tweets ;
var tweetsObject = Object.getOwnPropertyDescriptor(numtweets, 'N') ;
tweetValue = tweetsObject.value ;
console.log ("test stringify = ", numtweets) ;
console.log (tweetsObject.value) ;
console.log ("Value = ", tweetValue) ;
return tweetValue ;
} catch (err) {
console.log("Error", err);
throw err
}
}
And await it in the handler (note the handler itself must be declared async):
exports.handler = async (event) => {
// Read options from the event.
var paramsNeutral = {
TableName: 'twitterSentiment',
Key: { 'sentiment':'NEUTRAL' }
};
// Call sentimentVal function with paramsNeutral, and setNeutralValue callback function
//
const tweet = await sentimentVal(paramsNeutral, setNeutralValue);
function setNeutralValue (error, tweetValue) {
if (error) console.error('ERROR !', error) ;
else console.log ('callback tweetValue = ', tweetValue) ;
}
};
I'm not sure what setNeutralValue is supposed to do.
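If it's only there to log the result, you can drop it entirely now that sentimentVal returns a promise, and just use the resolved value. A minimal sketch:

exports.handler = async (event) => {
    var paramsNeutral = {
        TableName: 'twitterSentiment',
        Key: { 'sentiment': 'NEUTRAL' }
    };
    try {
        const tweet = await sentimentVal(paramsNeutral);
        console.log('tweetValue = ', tweet);
    } catch (error) {
        console.error('ERROR !', error);
    }
};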
I'm stuck: I loop through an array that receives values from a promise and push the values into a new array, but I need that array to be available outside the forEach.
What I have:
app.post('/submit', function (req, res) {
uploadPics(req, res, function (err) {
if (err instanceof multer.MulterError) {
res.send(JSON.stringify({UploadResult: err.message}));
console.log(err.message + ' ' +'Redirect /home');
} else if (err) {
console.log(err);
} else {
res.send(JSON.stringify({UploadResult: 'Success'}));
var filesarray = req.files;
var picinfos = [];
filesarray.forEach(function(file){
GetFileMetaInfo.filemetainfo(file.path).then(function (metadata){
//Stuck here! I can push values into an array (picinfos), but it is only available inside the forEach, not outside.
})
})
//I need picinfos array here....
}
})
})
How I receive my metadata:
var exif = require('exif-parser');
var fs = require('fs');
exports.filemetainfo = function (filepath) {
return new Promise((resolve) => {
var file = filepath;
var buffer = fs.readFileSync(file);
var parser = exif.create(buffer);
var result = parser.parse();
resolve (result);
}).then(function (metadata){
if (metadata.tags.CreateDate !== undefined){
date = new Date (metadata.tags.CreateDate*1000);
datevalues = [
date.getFullYear(),
date.getMonth()+1,
date.getDate(),
date.getHours(),
date.getMinutes(),
date.getSeconds(),
];
CreateDate = date.getFullYear()+'-'+(date.getMonth()+1)+'-'+date.getDate();
CreateTime = date.getHours()+':'+date.getMinutes()+':'+date.getSeconds();
console.log("CrDate:" +CreateDate, "CrTime:" +CreateTime );
} else {
console.log("No Metadata Creation Infos found in " +filepath);
CreateDate = "";
CreateTime = "";
}
if (metadata.tags.GPSLatitude !== undefined){
GPSLat = metadata.tags.GPSLatitude;
GPSLon = metadata.tags.GPSLongitude;
console.log("GPSLat:" + GPSLat , "GPSLon:" +GPSLon);
}
else {
console.log("No Metadata GPS Infos found in " +filepath)
GPSLat = "";
GPSLon = "";
}
return MetaData = {
GPSLat: GPSLat ,
GPSLon: GPSLon,
CreateDate: CreateDate,
CreateTime: CreateTime,
}
})
}
May I ask someone to give me a hand? How can I make my array available outside the forEach? Thank you very much!
The reason you're getting an empty array at the end of the forEach is that GetFileMetaInfo.filemetainfo() returns a promise, and forEach won't wait for async actions.
You could use async/await with for...of loop to get your desired result.
app.post('/submit', function (req, res) {
uploadPics(req, res, async function (err) { // note async here
if (err instanceof multer.MulterError) {
res.send(JSON.stringify({UploadResult: err.message}));
console.log(err.message + ' ' +'Redirect /home');
} else if (err) {
console.log(err);
} else {
res.send(JSON.stringify({UploadResult: 'Success'}));
var filesarray = req.files;
var picinfos = [];
for(let file of filesarray) {
const metadata = await GetFileMetaInfo.filemetainfo(file.path);
// push metadata into your array here
picinfos.push(metadata);
}
// You will have picinfos here
}
})
})
Although the question has already been answered by Dinesh Pandiyan, there are still some adjustments to be made. The following code in his answer runs sequentially, meaning that each async request is only made after the previous result has resolved.
for(let file of filesarray) {
const metadata = await GetFileMetaInfo.filemetainfo(file.path);
// ^- pauses the execution of the current running code
// push metadata into your array here
picinfos.push(metadata);
}
async call #1 ╌╌await╌╌> async call #2 ╌╌await╌╌> async call #3 ╌╌await╌╌> result
You could make the code concurrent by firing off all async calls first and then waiting until all results have resolved. This can be done by simply changing the following:
// execute all the async functions first, reducing the wait time
for(let file of filesarray) {
const metadata = GetFileMetaInfo.filemetainfo(file.path);
// ^- remove the await
// push metadata into your array here
picinfos.push(metadata);
}
// wait for all results to be resolved
picinfos = await Promise.all(picinfos);
// ^- instead await here
async call #1 ╌╌┐
async call #2 ╌╌┼╌╌await all╌╌> result
async call #3 ╌╌┘
The above could be further simplified by using Array.map() in combination with the already shown Promise.all().
var filesarray = req.files;
var picinfos = await Promise.all(filesarray.map(file => {
return GetFileMetaInfo.filemetainfo(file.path);
}));
// picinfos should be present
Or if you want to avoid working with async/await:
var filesarray = req.files;
Promise.all(filesarray.map(file => {
return GetFileMetaInfo.filemetainfo(file.path);
})).then(picinfos => {
// picinfos should be present
});
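One caveat: Promise.all rejects as soon as any single promise rejects, so if reading one file's metadata can fail, add a catch (or wrap each promise). A minimal sketch:

var filesarray = req.files;
Promise.all(filesarray.map(file => {
    return GetFileMetaInfo.filemetainfo(file.path);
})).then(picinfos => {
    // picinfos should be present
}).catch(err => {
    // a single failed file rejects the whole batch
    console.log(err);
});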
I'm trying to assign a MongoDB document object to a JavaScript variable on a Node.js server. I'm having trouble understanding how to do this with the async behavior of the MongoDB driver for Node. In my main app.js file I have the following:
// MongoDB
//-------------------------------------------------
var DOC = require('./test_mongodb.js');
console.log(typeof(DOC)); //<---------------- this fails
console.log('Works:', JSON.stringify(DOC)); //<------------this fails
The required file test_mongodb.js is as follows
const config = require('../config/mongodb.config.js'); // <--- This just pulls in the ip:port and database name...nothing special
const mongodb = require('mongodb');
var DOC = {}; // <------------------I want to store a document here
// Client
const mongoClient = mongodb.MongoClient;
// Connection URL
const mongoUrl = 'mongodb://' + config.db.host;
console.log(config.db.host);
// Use connect method to connect to the server
mongoClient.connect(mongoUrl, function(err, client) {
if (err) {
console.log('Unable to connect to the mongoDB server. Error:', err);
}
else {
console.log("Connected to mongoDB server");
// Select database
const db = client.db('DATA');
// Get the documents collection
var coll = db.collection('TEST');
//We have a cursor now with our find criteria
var cursor = coll.find({"name": "testing"});
//Lets iterate on the result
var count = 0;
cursor.each(function (err, doc) {
if (err) {
console.log(err);
} else {
console.log('Fetched:', doc);
if (count == 3) {
//console.log(typeof(doc));
//DOC = JSON.parse(JSON.stringify(doc)) ;
//console.log(typeof(DOC))
//console.log('Inside:',DOC);
DOC = doc; // <---------------------- just capture one doc
}
count = count + 1;
}
});
}
// Close connection when done
client.close();
});
console.log(typeof(DOC));
console.log('Inside:',DOC);
module.exports = DOC; // <-------------I want to export the variable here
The output of console.log(typeof(DOC)) in app.js is undefined. Overall, I realize this is a problem with the async behavior, but I don't understand how to correct it. I've read that the solution is to use callback functions, but I don't fully grasp how this is done here with Mongo. In your solution, please give a detailed explanation of how the callbacks are physically working as this is my main confusion.
One other side question...is there a difference between cursor.each and cursor.forEach?
The problem here is that the module.exports assignment is happening before the query is complete.
Your environment probably supports Promises, so you can return one here. Here's how your code could look:
test_mongodb.js
const config = require('../config/mongodb.config.js'); // <--- This just pulls in the ip:port and database name...nothing special
const mongodb = require('mongodb');
var DOC = new Promise(function(resolve, reject) {
// Client
const mongoClient = mongodb.MongoClient;
// Connection URL
const mongoUrl = 'mongodb://' + config.db.host;
console.log(config.db.host);
// Use connect method to connect to the server
mongoClient.connect(mongoUrl, function(err, client) {
if (err) {
console.log('Unable to connect to the mongoDB server. Error:', err);
reject(err); // reject so the caller's error handler fires
} else {
console.log("Connected to mongoDB server");
// Select database
const db = client.db('DATA');
// Get the documents collection
var coll = db.collection('TEST');
//We have a cursor now with our find criteria
var cursor = coll.find({
"name": "testing"
});
//Lets iterate on the result
var count = 0;
cursor.each(function(err, doc) {
if (err) {
console.log(err);
} else if (doc === null) {
// a null doc signals the end of the cursor; safe to close the connection now
client.close();
} else {
console.log('Fetched:', doc);
if (count == 3) {
resolve(doc);
}
count = count + 1;
}
});
}
});
})
module.exports = DOC;
Then wait for the promise to resolve in the parent.
app.js
require('./test_mongodb.js').then(function(DOC) {
console.log(DOC);
}, function(err) {
console.log(err);
});
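As for the side question: in the Node MongoDB driver, cursor.each (deprecated in newer driver versions) invokes its callback once per document with (err, doc) and signals the end of the cursor with a null doc, while cursor.forEach takes an iterator for the documents plus a separate completion callback. That completion callback is also a tidy place to close the connection. A sketch, assuming the same cursor as above:

cursor.forEach(function(doc) {
    console.log('Fetched:', doc);
}, function(err) {
    // runs once, after the iteration finishes (or on error)
    if (err) console.log(err);
    client.close();
});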
I am new to Node, so I am struggling quite a bit with its async nature.
I am trying to create a script that parses the PDFs inside a directory and outputs them in txt format in another directory.
To do this, I am using the fs and pdf2json npm packages. I am passing the parseData function as a callback to the loopingFiles function. The only problem I am having is the async nature of Node.
It loops through all the files at the same time, and the output is then a jumbled mess in the last file index.
I would like to process the files sequentially, so that it waits for the data to finish parsing and be written to the txt file before looping again.
I have tried promises but to no avail. Any help would be much appreciated!
var fs = require('fs'),
PDFParser = require("pdf2json");
let pdfParser = new PDFParser(this,1);
var parseData = function(pdf, index) {
txtFile = "/Users/janet/node/pdf/Destination/".concat(index.toString().concat(".txt"))
pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);
pdfParser.loadPDF(pdfFile);
// Parsing the pdf file in question
pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError) );
pdfParser.on("pdfParser_dataReady", pdfData => {
fs.writeFile(txtFile, pdfParser.getRawTextContent());
});
};
var loopingFiles = function(callback) {
fs.readdir("/Users/janet/node/pdf/Source", function (err, files) {
if (err) {
console.log(err);
} else {
files.forEach( function(file, index) {
callback(file, index);
});
};
});
};
loopingFiles(parseData);
Something like this?
var fs = require("fs"),
PDFParser = require("pdf2json");
var parseData = function(pdfs, index = 0) {
// finished
if (index >= pdfs.length) return;
// create a fresh parser per file so the "on" listeners don't stack up
// across recursive calls
let pdfParser = new PDFParser(this, 1);
let pdf = pdfs[index];
txtFile = "/Users/janet/node/pdf/Destination/".concat(
index.toString().concat(".txt")
);
pdfFile = "/Users/janet/node/pdf/Source/".concat(pdf);
// Parsing the pdf file in question
pdfParser.on("pdfParser_dataError", errData => {
console.error(errData.parserError)
// not sure if you want to call this here to keep going or not
parseData(pdfs, index + 1);
});
pdfParser.on("pdfParser_dataReady", pdfData => {
fs.writeFile(txtFile, pdfParser.getRawTextContent(), function() {
// when we're all done, call this function again, with the index of the next pdf
parseData(pdfs, index + 1);
});
});
pdfParser.loadPDF(pdfFile);
};
var loopingFiles = function(callback) {
fs.readdir("/Users/janet/node/pdf/Source", function(err, files) {
if (err) {
console.log(err);
} else {
callback(files, 0);
}
});
};
loopingFiles(parseData);
The main difference is passing the whole array of PDFs to the function along with an index, and only calling that function again, with an incremented index, once the current file has completed.
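On Node 8 and later you could also promisify each parse and reuse the async/await pattern from the earlier answers. A sketch, assuming pdf2json's event names as used above:

var fs = require("fs"),
    PDFParser = require("pdf2json");

function parseOne(pdfFile, txtFile) {
    return new Promise(function(resolve, reject) {
        // fresh parser per file, so listeners never accumulate
        let pdfParser = new PDFParser(null, 1);
        pdfParser.on("pdfParser_dataError", errData => reject(errData.parserError));
        pdfParser.on("pdfParser_dataReady", () => {
            fs.writeFile(txtFile, pdfParser.getRawTextContent(), err =>
                err ? reject(err) : resolve());
        });
        pdfParser.loadPDF(pdfFile);
    });
}

fs.readdir("/Users/janet/node/pdf/Source", async function(err, files) {
    if (err) return console.log(err);
    for (let [index, file] of files.entries()) {
        // each file finishes parsing and writing before the next begins
        await parseOne("/Users/janet/node/pdf/Source/" + file,
                       "/Users/janet/node/pdf/Destination/" + index + ".txt");
    }
});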
I have a module with a function which generates the value for a variable "stitchedBook". I can see and use this value via a callback.
However, I want to have this value available to me as a module property. How can I achieve this?
Note: I want the output of the _BookStitcher.stitchAllStories function to go into the _BookStitcher.stitchedBook property.
module.exports = _BookStitcher = (function() {
var db = require('../modules/db');
var stitchedBook = {};
var stitchAllStories = function(callback) {
db.dbConnection.smembers("storyIdSet", function (err, reply) {
if (err) throw err;
else {
var storyList = reply;
console.log(storyList);
// start a separate multi command queue
multi = db.dbConnection.multi();
for (var i=0; i<storyList.length; i++) {
multi.hgetall('story/' + String(storyList[i]) + '/properties');
};
// drains multi queue and runs atomically
multi.exec(function (err, replies) {
stitchedBook = replies;
// console.log(stitchedBook);
callback(stitchedBook);
});
};
});
};
return {
stitchedBook : stitchedBook,
stitchAllStories: stitchAllStories
}
})();
EDIT, to add: I know that I can set the value from outside by doing something like this:
_BookStitcher.stitchAllStories(function (reply) {
console.log("Book has been stitched!\n\n")
console.log("the Book is;\n");
console.log(reply);
_BookStitcher.stitchedBook = reply;
console.log("-------------------------------------------------------------------------\n\n\n");
console.log(_BookStitcher.stitchedBook);
});
I was wondering if there was a way of doing it from inside the _BookStitcher module itself.
You could take advantage of how object references work in JavaScript, and assign it to a property:
module.exports = _BookStitcher = (function() {
var db = require('../modules/db');
// CHANGE HERE
var stitched = { book: null };
var stitchAllStories = function(callback) {
db.dbConnection.smembers("storyIdSet", function (err, reply) {
if (err) throw err;
else {
var storyList = reply;
console.log(storyList);
// start a separate multi command queue
multi = db.dbConnection.multi();
for (var i=0; i<storyList.length; i++) {
multi.hgetall('story/' + String(storyList[i]) + '/properties');
};
// drains multi queue and runs atomically
multi.exec(function (err, replies) {
// CHANGE HERE
stitched.book = replies;
// console.log(stitchedBook);
callback(replies);
});
};
});
};
return {
stitched : stitched,
stitchAllStories: stitchAllStories
};
}());
So instead of having it inside _BookStitcher.stitchedBook, you'd have it at _BookStitcher.stitched.book.
But that looks awful, and I'd never use it! You can't know when the value will be available; it's only safe to use it from the callback, when you're sure it's been set.
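If you do want it driven from inside the module, the same trick used throughout this thread applies: have stitchAllStories return a promise (cached, so the stitching only happens once) and let consumers await it, instead of polling a property. A sketch:

module.exports = _BookStitcher = (function() {
    var db = require('../modules/db');
    var stitchedBookPromise = null;

    var stitchAllStories = function() {
        // cache the promise so the book is only stitched once
        if (stitchedBookPromise) return stitchedBookPromise;
        stitchedBookPromise = new Promise(function(resolve, reject) {
            db.dbConnection.smembers("storyIdSet", function(err, reply) {
                if (err) return reject(err);
                var multi = db.dbConnection.multi();
                for (var i = 0; i < reply.length; i++) {
                    multi.hgetall('story/' + String(reply[i]) + '/properties');
                }
                multi.exec(function(err, replies) {
                    if (err) return reject(err);
                    resolve(replies);
                });
            });
        });
        return stitchedBookPromise;
    };

    return { stitchAllStories: stitchAllStories };
})();

Consumers then write _BookStitcher.stitchAllStories().then(book => ...), and nobody ever reads a half-set property.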