How can I build an ETL pipeline script to un-gunzip, extract, transform, save and gunzip files? I am able to get up to un-gunzip, but I am unable to extract, transform, save, and gunzip. I was attempting to follow this tutorial to get me started: https://www.mariokandut.com/transform-data-etl-pipeline-node-js/ One thing that I'm stuck on is how to loop through files after each sequential step. I get an unexpected error SyntaxError: Unexpected end of JSON input during the extract step.
I was able to extract, transform, and save in a separate example. Although I am unable to successfully combine it into this ETL pipeline script.
const fs = require('fs');
const {promises: {readdir, readFile, writeFile}} = require("fs");
var url = require('url');
const zlib = require('zlib');
const input_dir = __dirname + '/input'
const input_unzipped_dir = __dirname + '/input-unzipped'
const output_dir = __dirname + '/output'
async function get_files(dir) {
return await readdir(dir).then(response =>
response
);
}
function read_file(file_path, callback) {
fs.readFile(file_path, 'utf-8', (err, file_data) => {
if (err) {
return callback && callback(err);
}
try {
const object = JSON.parse(file_data);
return callback && callback(null, object);
} catch (err) {
return callback && callback(err);
}
})
}
function transform_JSON(file_data) {
console.log("ts is:", file_data.ts); // => "timestamp"
console.log("u is:", file_data.u); // => "url"
console.log("e is:", file_data.e); // => "event"
console.log(url.parse(file_data.u))
u = url.parse(file_data.u)
const query_map = new Map(Object.entries(file_data.e));
const output = {
timestamp: file_data.ts,
url_object: {
domain: u.host,
path: u.path,
query_object: query_map,
hash: u.hash,
},
ec: file_data.e,
}
const jsonString = JSON.stringify(output)
console.log(jsonString)
return jsonString
}
const orchestrate_etl_pipeline = async () => {
try {
// extract
files = await get_files(input_dir);
console.log(files);
if (!fs.existsSync(input_unzipped_dir)){
fs.mkdirSync(input_unzipped_dir);
}
Promise.all(files.map(filename => {
if (filename.endsWith('.gz')) {
return new Promise((resolve, reject) => {
const fileContents = fs.createReadStream(`${input_dir}/${filename}`);
const writeStream = fs.createWriteStream(`${input_unzipped_dir}/${filename.slice(0, -3)}`);
const unzip = zlib.createGunzip();
fileContents.pipe(unzip).pipe(writeStream).on('finish', (err) => {
if (err) return reject(err);
else resolve();
})
})
}
}))
.then(
console.log('unzip done')
);
// transform
files_unzipped = await get_files(input_unzipped_dir);
Promise.all(files_unzipped.map(filename => {
if (filename.endsWith('.json')) {
read_file(`${input_unzipped_dir}/${filename}`, (err, file_data) => {
if (err) {
console.error(err);
return
}
transform_JSON = transform_JSON(file_data)
console.log(transform_JSON)
})
}
}))
.then(
console.log('transform done')
);
// save file
// gunzip file
} catch (error) {
console.log(error);
}
}
orchestrate_etl_pipeline().then(console.log('etl done'));
Separate transform and save file example:
function jsonReader(file_path, callback) {
fs.readFile(file_path, (err, file_data) => {
if (err) {
return callback && callback(err);
}
try {
const object = JSON.parse(file_data);
return callback && callback(null, object);
} catch (err) {
return callback && callback(err);
}
});
}
jsonReader(`${input_zipped_dir}/t1669976028340.json`, (err, input) => {
if (err) {
console.log(err);
return;
}
console.log("ts is:", input.ts); // => "ts"
console.log("u is:", input.u); // => "u"
console.log("e is:", input.e); // => "e"
console.log(url.parse(input.u))
u = url.parse(input.u)
const query_map = new Map(Object.entries(input.e));
const output = {
timestamp: input.ts,
url_object: {
domain: u.host,
path: u.path,
query_object: query_map,
hash: u.hash,
},
ec: input.e,
}
jsonString = JSON.stringify(output)
console.log(jsonString)
fs.writeFile(`${input_zipped_dir}/t1669976028340.json`, jsonString, err => {
if (err) {
console.log('Error writing file', err)
} else {
console.log('Successfully wrote file')
}
})
})
Related
I'm making a request to my database, I set the functions as asynchronous and to wait, but it still returns me undefined or Promise { pending }
how do I just return it when I have the result?
export const getGerente = async (req, res) => {
var query = "SELECT * FROM inventory;"
const r = await select(query)
console.log(r)
return res.json({message:"teste"})
}
export async function select(query) {
var teste = await client.connect(() =>{
client
.query(query)
.then((resultado) => {
console.log('sucess!!');
return resultado.rows
/*
const rows=resultado.rows
rows.map(x =>{
console.log(x.name)
})*/
})
.catch((erro) => {
console.log("erro: " + erro.message);
})
.then((teste) => {
console.log('Finished execution, exiting now');
process.exit();
});
})
}
result: Promise { pending }
I'm calling her for a request
Your select function is not awaiting the client.connect properly.
Try this for select function -
export async function select(query) {
const promisifiedRows = new Promise((resolve, reject) => {
client.connect((err) => {
if (err) {
reject(err); // err in connecting
} else {
console.log('Connected!');
client.query(query, (err, rows) => {
if (err) {
reject(err); // err while exceuting the query
} else {
resolve(rows);
}
});
}
});
});
const rows = await promisifiedRows();
return rows;
}
I am parsing data from an API call into a text file. However, I wanted to use async-await and break the call below call into 3 separate functions.
#!/usr/bin/env node
const yargs = require("yargs");
const axios = require("axios");
const fs = require("fs");
const options = yargs
.usage("Usage: -n <name>")
.option("n", {
alias: "name",
describe: "Your name",
type: "string",
demandOption: true,
})
.option("s", { alias: "search", describe: "Search Term", type: "string" })
.argv;
const greetings = `Hello ${options.name}!`;
console.log(greetings);
console.log("Here's a random joke for you: ");
const url = options.search
? `https://icanhazdadjoke.com/search?term${escape(options.search)}`
: " https://icanhazdadjoke.com/";
axios.get(url, { headers: { Accept: "application/json" } }).then((res) => {
if (options.search) {
res.data.results.forEach((j) => {
fs.appendFile("jokes.txt", "\n" + j.jokes, (err) => {});
});
if (res.data.results.length === 0) {
console.log("no joke found ðŸ˜");
}
} else {
fs.appendFile("jokes.txt", res.data.joke, (err) => {
if (err) throw err;
console.log("File Updated");
});
}
});
So the above code works absolutely fine and generates the file perfectly, however when I tried to break it into the following below functions, I just get undefined in the text file, I am not sure why this is happening.
const getJoke = async (url) => {
try {
const joke = await axios.get(url, {
headers: { Accept: "application/json" },
});
return joke;
} catch (error) {
console.error(error);
}
};
const parseJokes = (res) => {
if (options.search) {
res.data.results.forEach((j) => {
return `\n ${j.joke}`;
});
if (res.data.results.length === 0) {
console.log("no joke found ðŸ˜");
}
} else {
return res.data.joke;
}
};
const addJokeToFile = async () => {
const result = await getJoke(url)
.then((res) => {
parseJokes(res);
})
.catch((err) => {
console.error(`ERROR: ${err}`);
});
fs.appendFile("jokes.txt", result, (err) => {
console.error(err);
});
};
In the second (functional approach) addJokeToFile method, you are waiting for the promise to be resolved using both ways, await and .then, following modification to the code, might help you get through:
const addJokeToFile = async () => {
getJoke(url)
.then((res) => {
// Aside, we should also return some value from parseJokes function for "no joke found ðŸ˜" case, or return null and put a check here and only append to file when jokeString is not null.
const jokeString = parseJokes(res);
fs.appendFile("jokes.txt", jokeString, (err) => {
console.error(err);
});
})
.catch((err) => {
console.error(`ERROR: ${err}`);
});
};
Try using appendFile from 'fs/promises' so that you can stick with the async/await style. Since getJoke returns a promise I would expect result to be a Promise<string | undefined> depending on if any errors show up earlier in the chain.
const { appendFile } = require('fs/promises');
const addJokeToFile = async () => {
try {
const result = await getJoke(url);
const parsed = parseJokes(result);
await appendFile('jokes.txt', parsed);
} catch (err) {
console.error(err);
}
};
I am still very new to node.js. In my current test project I want to send a confirmation email or other emails, depending on the loaded template. The template is stored in MySQL.
The result I am getting is:
{
"message": {
"error": {},
"foo": "bar"
}
}
So the error bit is empty and I don't know why...
If I reject manually at a different point in the code it works just fine, so the problem is not with the middleware, router or server.js file.
Also I have rejected "Foo: Bar" back, to check which catch block catched the error.
Here is my mailer.js file:
const nodemailer = require('nodemailer');
let conDB;
module.exports = (injectedMySql) => {
conDB = injectedMySql
return {
sendMail: sendMail
}
}
const sendMail = (mail) => {
return new Promise((resolve,reject) => {
loadTemplate(mail.templateId, mail.languageId)
.then(data => {
const mailserver = {
host: "something.com",
port: 465,
secure: true, // use TLS
auth: {
user: "something#something.com",
pass: "PASSWORD"
},
tls: {
// do not fail on invalid certs
rejectUnauthorized: false
}
};
const body = {
from: 'something#something.com',
to: mail.toAdress,
subject: allReplace(data.subject, mail.subjectReplace),
text: allReplace(data.body, mail.textReplace),
html: allReplace(data.html, mail.htmlReplace)
}
// create a nodemailer transporter using smtp
let transporter = nodemailer.createTransport(mailserver)
transporter.sendMail(body)
.then(data => {console.log(data)
resolve(data)
})
.catch(err => {reject("sendMail problem")})
})
.catch(error => {reject({"error": error, "foo": "bar"})})
})
}
function allReplace (str, obj) {
var retStr = str;
for (var x in obj) {
retStr = retStr.replace(new RegExp(x, 'g'), obj[x]);
}
return retStr;
};
const loadTemplate = (mailTemplate, languageId) => {
return new Promise((resolve,reject) => {
if(mailTemplate === null || languageId === null)
reject("nop, something is missing");
else
{
if (typeof conDB.query === "function")
{
conDB.query('SELECT * FROM email_template WHERE language_id = ? AND template_id = ?', [mailTemplate,languageId])
.then(data => {resolve(data)})
.catch(err => {reject("mysql has a problem")})
}
else
{
reject("function is not available");
}
}
})
}
Here is my mysql.js file:
var mysql = require('mysql2/promise');
const databaseConfigs = {
host: 'localhost',
user: 'USERNAME',
password: 'PASSWORD',
database: 'DBNAME'
};
const createID = table => {
return new Promise((resolve,reject) => {
//execute the query to register the user
let query = '';
let id = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
query = `SELECT * FROM ${table} WHERE id = ?`
this.query(query,[table,id])
.then(data => {
console.log(data[0].length)
if(data[0].length==0)
{
resolve(id)
}
else
{
createID(table)
.then(data => {resolve(data)})
.catch(error => {reject(error)})
}
})
.catch(error => {reject(error)})
})
}
async function query (sql,att) {
let connection = await mysql.createConnection(databaseConfigs);
return new Promise( ( resolve, reject ) => {
console.log(`Query: '${sql}'`);
connection.query(sql,att)
.then(data => {resolve(data)})
.catch(error => {reject(error)})
connection.end();
});
}
async function transaction(queries, queryValues) {
if (queries.length !== queryValues.length) {
return Promise.reject(
'Number of provided queries did not match the number of provided query values arrays'
)
}
const connection = await mysql.createConnection(databaseConfigs)
try {
await connection.beginTransaction()
const queryPromises = []
queries.forEach((query, index) => {
queryPromises.push(connection.query(query, queryValues[index]))
})
const results = await Promise.all(queryPromises)
await connection.commit()
await connection.end()
return results
} catch (err) {
await connection.rollback()
await connection.end()
return Promise.reject(err)
}
}
module.exports.transaction = transaction;
module.exports.query = query;
module.exports.createID = createID;
Thanks to you all!
Chris
I cleand up your code a bit. Specially the error handling as you always mask your errors with your Promise.reject("message").
I think what confused you is that you're already using libraries which work with promise (you don't need to wrap those into promises again). Thats quite good as you just can use async/await then.
I hope it helps. If something is unclear just ask.
const nodemailer = require('nodemailer');
let conDB;
module.exports = (injectedMySql) => {
conDB = injectedMySql
return {
sendMail: sendMail
}
}
// your load template function already uses promises no need to wrap it
const sendMail = async mail => {
const data = await loadTemplate(mail.templateId, mail.languageId)
const mailserver = {
host: "something.com",
port: 465,
secure: true, // use TLS
auth: {
user: "something#something.com",
pass: "PASSWORD"
},
tls: {
// do not fail on invalid certs
rejectUnauthorized: false
}
};
const body = {
from: 'something#something.com',
to: mail.toAdress,
subject: allReplace(data.subject, mail.subjectReplace),
text: allReplace(data.body, mail.textReplace),
html: allReplace(data.html, mail.htmlReplace)
}
// create a nodemailer transporter using smtp
let transporter = nodemailer.createTransport(mailserver)
try {
// Return the value of sendmail
return await transporter.sendMail(body);
} catch (err) {
// handle error or throw it. I'll throw as you rejected the Promise here it.
// this part will actually help you as you now can see the correct error instead of your rejected "foo bar" erro object
throw err;
}
}
function allReplace(str, obj) {
var retStr = str;
for (var x in obj) {
retStr = retStr.replace(new RegExp(x, 'g'), obj[x]);
}
return retStr;
};
const loadTemplate = async (mailTemplate, languageId) => {
if (mailTemplate === null || languageId === null)
throw new Error("nop, something is missing");
else {
if (typeof conDB.query === "function") {
try {
const data = await conDB.query('SELECT * FROM email_template WHERE language_id = ? AND template_id = ?', [mailTemplate, languageId]);
} catch (err) {
// it's better to use the real error you always hide the real reason why something went wrong with your promise reject :).
throw err;
}
}
else {
throw new error("function is not available");
}
}
}
.
var mysql = require('mysql2/promise');
const databaseConfigs = {
host: 'localhost',
user: 'USERNAME',
password: 'PASSWORD',
database: 'DBNAME'
};
const createID = async table => {
// use GUID? https://www.npmjs.com/package/guid
let id = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
let query = `SELECT * FROM ${table} WHERE id = ?`
try {
data = await this.query(query, [table, id]);
} catch (error) {
// as we throw the error in query we got to catch it here
// handle it or throw it (I throw it because I can't handle it ;).)
throw error;
}
console.log(data[0].length)
if (data[0].length == 0) {
return id;
} else {
return await createID(table);
}
}
const query = async (sql, att) => {
let connection = await mysql.createConnection(databaseConfigs);
console.log(`Query: '${sql}'`);
try {
const data = await connection.query(sql, att);
return data;
} catch (error) {
// Handle error or throw it again
// you rejected the promise so i throw it here
throw error;
} finally {
connection.end();
}
}
// I changed it to make it the same as the other functions from this
// async function transaction(queries, queryValues) { to
const transaction = async (queries, queryValues) => {
if (queries.length !== queryValues.length) {
// just throw an error
throw new Error('Number of provided queries did not match the number of provided query values arrays');
}
const connection = await mysql.createConnection(databaseConfigs)
try {
await connection.beginTransaction()
const queryPromises = []
queries.forEach((query, index) => {
queryPromises.push(connection.query(query, queryValues[index]))
})
const results = await Promise.all(queryPromises)
await connection.commit()
await connection.end()
return results
} catch (err) {
await connection.rollback()
await connection.end()
// this is not needed
// return Promise.reject(err)
// if you don't want to handle it here just throw the error
throw err;
}
}
module.exports.transaction = transaction;
module.exports.query = query;
module.exports.createID = createID;
I'm using a snippet example for Amazon Athena just to test inserting some data. I can't tell why it isn't working and CloudWatch logs does not show any output when the statement execution is completed. Even when I change it to a simple select statement I can't see any output. I know the query, database and table is fine, because when I test it using the Athena query editor it executes without a problem.
module.exports.dlr = async event => {
let awsFileCreds = {
accessKeyId: "XXX",
secretAccessKey: "XXX"
};
let creds = new AWS.Credentials(awsFileCreds);
AWS.config.credentials = creds;
let client = new AWS.Athena({ region: "eu-west-1" });
let q = Queue((id, cb) => {
startPolling(id)
.then(data => {
return cb(null, data);
})
.catch(err => {
console.log("Failed to poll query: ", err);
return cb(err);
});
}, 5);
const sql = "INSERT INTO delivery_receipts (status, eventid, mcc, mnc, msgcount, msisdn, received, userreference) VALUES ('TestDLR', 345345, 4353, '5345435', 234, '345754', 234, '8833')"
makeQuery(sql)
.then(data => {
console.log("Row Count: ", data.length);
console.log("DATA: ", data);
})
.catch(e => {
console.log("ERROR: ", e);
});
function makeQuery(sql) {
return new Promise((resolve, reject) => {
let params = {
QueryString: sql,
ResultConfiguration: { OutputLocation: ATHENA_OUTPUT_LOCATION },
QueryExecutionContext: { Database: ATHENA_DB }
};
client.startQueryExecution(params, (err, results) => {
if (err) return reject(err);
q.push(results.QueryExecutionId, (err, qid) => {
if (err) return reject(err);
return buildResults(qid)
.then(data => {
return resolve(data);
})
.catch(err => {
return reject(err);
});
});
});
});
}
function buildResults(query_id, max, page) {
let max_num_results = max ? max : RESULT_SIZE;
let page_token = page ? page : undefined;
return new Promise((resolve, reject) => {
let params = {
QueryExecutionId: query_id,
MaxResults: max_num_results,
NextToken: page_token
};
let dataBlob = [];
go(params);
function go(param) {
getResults(param)
.then(res => {
dataBlob = _.concat(dataBlob, res.list);
if (res.next) {
param.NextToken = res.next;
return go(param);
} else return resolve(dataBlob);
})
.catch(err => {
return reject(err);
});
}
function getResults() {
return new Promise((resolve, reject) => {
client.getQueryResults(params, (err, data) => {
if (err) return reject(err);
var list = [];
let header = buildHeader(
data.ResultSet.ResultSetMetadata.ColumnInfo
);
let top_row = _.map(_.head(data.ResultSet.Rows).Data, n => {
return n.VarCharValue;
});
let resultSet =
_.difference(header, top_row).length > 0
? data.ResultSet.Rows
: _.drop(data.ResultSet.Rows);
resultSet.forEach(item => {
list.push(
_.zipObject(
header,
_.map(item.Data, n => {
return n.VarCharValue;
})
)
);
});
return resolve({
next: "NextToken" in data ? data.NextToken : undefined,
list: list
});
});
});
}
});
}
function startPolling(id) {
return new Promise((resolve, reject) => {
function poll(id) {
client.getQueryExecution({ QueryExecutionId: id }, (err, data) => {
if (err) return reject(err);
if (data.QueryExecution.Status.State === "SUCCEEDED")
return resolve(id);
else if (
["FAILED", "CANCELLED"].includes(data.QueryExecution.Status.State)
)
return reject(
new Error(`Query ${data.QueryExecution.Status.State}`)
);
else {
setTimeout(poll, POLL_INTERVAL, id);
}
});
}
poll(id);
});
}
function buildHeader(columns) {
return _.map(columns, i => {
return i.Name;
});
}
return { message: 'Go Serverless v1.0! Your function executed successfully!', event };
};
Figured it out. Using aws lambda events with athena is easy using the athena-express package. You can specify your configuration and query the athena database like you normally would with significantly less code than what's provided in the amazon athena nodejs example.
This is the code I used to achieve a result:
"use strict";
const AthenaExpress = require("athena-express"),
aws = require("aws-sdk");
const athenaExpressConfig = {
aws,
db: "messaging",
getStats: true
};
const athenaExpress = new AthenaExpress(athenaExpressConfig);
exports.handler = async event => {
const sqlQuery = "SELECT * FROM delivery_receipts LIMIT 3";
try {
let results = await athenaExpress.query(sqlQuery);
return results;
} catch (error) {
return error;
}
};
I would like to call use async waterfall function within a async series function. looks like parallel functions are executed but final callback function of async series does not seem to be executed. I get the results of the instances of the function which is called with different arguments in series function but cannot have the line executed for some reason.
➜ lib git:(jumpstart-compare) ✗ node aws-ecs.compare.js
Data 1f701a9754eb22ce8f0dcdb4c1b0b366a51ade9a
Data 4cc27bcc2a8482478ac2e5c0cf3ac1babe153374
var AWS = require('aws-sdk');
const async = require('async')
const _ = require('lodash');
AWS.config.update({
region: 'us-east-1'
});
const ecs = new AWS.ECS();
getClusterSha = (clustername,app,callback) => {
const ListServices = (callback) => {
let params = {
cluster: clustername
}
//console.log('list tasks executed')
ecs.listServices(params, (err, data) => {
if (err) {
callback(err)
} else {
let dataObj = {
data: data,
cluster: clustername
}
callback(null,dataObj)
}
})
}
const getService = (arg1, callback) => {
let appname = app
arg1.cluster
let finaldata = arg1.data.serviceArns.filter((elem)=>{
if(elem.indexOf(appname) != -1){
return elem
}
});
//console.log('finaldata: ',finaldata)
if(finaldata.length > 0){
callback(null,finaldata.toString().split('/')[1])
}else{
callback('No app with name: '+appname+' found!')
}
}
const describeService = (arg2, callback) => {
let params = {
services: [arg2],
cluster: clustername
}
ecs.describeServices(params, (err, data) => {
if (err) {
callback(err)
} else {
// console.log(data)
callback(null,data.services[0].taskDefinition.split('/')[1])
}
})
}
const describeTaskDef = (arg3, callback) => {
let params = {
taskDefinition: arg3
}
ecs.describeTaskDefinition(params, (err, data) => {
if (err) {
callback(err)
} else {
//console.log(data.taskDefinition.containerDefinitions[0].image.split('/')[1].split(':')[1])
finaldata = data.taskDefinition.containerDefinitions[0]
.image.split('/')[1]
.split(':')[1]
callback(null,finaldata)
}
})
}
// const githubCall = (arg4,callback) => {
// console.log('https://github.com/Jumpstart-Auto'+'/'+app+'/commit/'+arg4)
// callback(null,'https://github.com/Jumpstart-Auto'+'/'+app+'/commit/'+arg4)
// }
async.waterfall([
ListServices,
getService,
describeService,
describeTaskDef,
], (err, data) => {
if (err) {
console.log('Error', err)
callback(err)
} else {
console.log('Data', data)
callback(null,data)
}
})
}
compareSha = (clustername1,clustername2,app) => {
async.series([
getClusterSha(clustername1,app,(data)=>{return data}),
getClusterSha(clustername2,app,(data)=>{return data})
], (err,result)=>{
console.log(err,result)
})
}
compareSha('dev','staging','jamobot',function(err,data){
console.log(data)
})
//module.exports = getShaCluster
changing async.series to following fixed the problem.
async.waterfall([
ListServices,
getService,
describeService,
describeTaskDef,
], (err, data) => {
if (err) {
console.log('Error', err)
callback(err)
} else {
console.log('Data', data)
callback(null,data)
}
})
}
compareSha = (clustername1,clustername2,app,cb) => {
async.series([
function(callback){
getClusterSha(clustername1,app,callback)
},
function(callback){
getClusterSha(clustername2,app,callback)
},
], (err,result)=>{
if(err){
cb(err)
}else{
cb(null,result)
}
})