Better way to write CSV from Parquet in JavaScript

I am converting from Parquet to CSV using JavaScript.
The example below works, but I am storing the array of values read from Parquet in memory, in records.
The Parquet library uses an AsyncIterator, while the CSV library uses the Node Stream API.
I would like to know how to implement a more elegant solution that leverages streams and reduces the memory footprint. Thanks in advance.
Libraries:
Parquet: https://github.com/ironSource/parquetjs
CSV: https://csv.js.org/
import pts from 'parquets'
let { ParquetSchema, ParquetWriter, ParquetReader } = pts
import * as fs from 'fs'
import stringify from 'csv-stringify'

// declare a schema for the `PI` table
let schema = new ParquetSchema({
  Source: { type: 'UTF8' },
  TagID: { type: 'UTF8' },
  Timestamp: { type: 'TIMESTAMP_MILLIS' },
  Value: { type: 'DOUBLE' },
});

const WriterParquet = async () => {
  // create a new ParquetWriter that writes to 'pi.parquet'
  let writer = await ParquetWriter.openFile(schema, 'pi.parquet')
  // append a few rows to the file
  await writer.appendRow({ Source: 'PI/NO-SVG-PISRV01', TagID: 'OGP8TI198Z.PV', Timestamp: new Date(), Value: 410 })
  await writer.appendRow({ Source: 'PI/NO-SVG-PISRV01', TagID: 'OGP8TI198Z.PV', Timestamp: new Date(), Value: 420 })
  await writer.close()
}

const WriterCSV = async () => {
  // create a new ParquetReader that reads from 'pi.parquet'
  let reader = await ParquetReader.openFile('pi.parquet')
  // create a new cursor
  let cursor = reader.getCursor()
  // read all records from the file and print them
  let records = []
  let record = null;
  while (record = await cursor.next()) {
    console.log(record)
    records.push(record)
  }
  await reader.close()
  // write to CSV
  stringify(records, {
    header: true
  }, function (err, output) {
    console.log(output)
    fs.writeFile('./pi.csv', output, () => {});
  })
}

const Main = async () => {
  console.log('writing parquet...')
  await WriterParquet()
  console.log('reading parquet and writing csv...')
  await WriterCSV()
}

Main()

Instead of using the cursor, I used Readable.from(reader) to create a readable stream; after that it was easy to pipe it into csv-stringify:
const WriterCSV = async () => {
  // create a new ParquetReader that reads from 'pi.parquet'
  let reader = await ParquetReader.openFile('pi.parquet')
  // stream all records from the file into the CSV stringifier
  const readStream = Readable.from(reader)
  readStream.pipe(
    stringify({
      header: true,
      columns: {
        Source: 'Source',
        TagID: 'TagID',
        Timestamp: 'Timestamp',
        Value: 'Value'
      }
    }, function (error, output) {
      fs.writeFile('./pi.csv', output, () => {});
    }))
  readStream.on('end', async function () {
    await reader.close();
  });
}
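
Note that the version above still collects the complete CSV output in the stringify callback before writing it out. If you want to keep memory usage flat end to end, one option is to pipe straight into a file write stream with stream.pipeline. A minimal sketch, assuming the same pi.parquet file and that the reader is async-iterable as above:

import { Readable, pipeline } from 'stream'
import * as fs from 'fs'
import stringify from 'csv-stringify'
import pts from 'parquets'

const { ParquetReader } = pts

const WriterCSVStreamed = async () => {
  const reader = await ParquetReader.openFile('pi.parquet')
  await new Promise((resolve, reject) => {
    pipeline(
      Readable.from(reader),            // iterate Parquet records as a stream
      stringify({ header: true }),      // transform each record into a CSV row
      fs.createWriteStream('./pi.csv'), // flush rows to disk as they arrive
      (err) => (err ? reject(err) : resolve())
    )
  })
  await reader.close()
}

Only one chunk of records is held in memory at a time, and errors from any stage surface through the pipeline callback.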

Related

How to read multiple JSON files using fs and a bulk request

I'm using the Elasticsearch search engine with my React app. I was reading one file at the backend, as you can see in the code, and it works perfectly, but now I want to read three different JSON files into three different indexes using the "fs" package and a bulk request. Can you please help me?
The code:
// Start reading the json file
fs.readFile("DocRes.json", { encoding: "utf-8" }, function (err, data) {
  if (err) {
    throw err;
  }
  // Build up a giant bulk request for elasticsearch.
  bulk_request = data.split("\n").reduce(function (bulk_request, line) {
    var obj, ncar;
    try {
      obj = JSON.parse(line);
    } catch (e) {
      console.log("Done reading 1");
      return bulk_request;
    }
    // Rework the data slightly
    ncar = {
      id: obj.id,
      name: obj.name,
      summary: obj.summary,
      image: obj.image,
      approvetool: obj.approvetool,
      num: obj.num,
      date: obj.date,
    };
    bulk_request.push({
      index: { _index: "ncar_index", _type: "ncar", _id: ncar.id },
    });
    bulk_request.push(ncar);
    return bulk_request;
  }, []);
  // A little voodoo to simulate synchronous insert
  var busy = false;
  var callback = function (err, resp) {
    if (err) {
      console.log(err);
    }
    busy = false;
  };
  // Recursively whittle away at bulk_request, 1000 at a time.
  var perhaps_insert = function () {
    if (!busy) {
      busy = true;
      client.bulk(
        {
          body: bulk_request.slice(0, 1000),
        },
        callback
      );
      bulk_request = bulk_request.slice(1000);
      console.log(bulk_request.length);
    }
    if (bulk_request.length > 0) {
      setTimeout(perhaps_insert, 100);
    } else {
      console.log("Inserted all records.");
    }
  };
  perhaps_insert();
});
You can create a promise for each file read and feed the results to the Elasticsearch bulk_request.
const path = require('path');
const fsPromises = require('fs').promises;

const files = ['filename1', 'filename2'];

// read a single file, returning a promise for its contents
const fetchFile = (filename) => {
  const filePath = path.join(__dirname, filename); // make sure the path is correct
  return fsPromises.readFile(filePath);
};

const results = files.map((fileName) => fetchFile(fileName));

Promise.all(results)
  .then((data) => console.log(data))
  .catch((e) => console.log(e));
Once you get the data from all the promises, pass it to Elasticsearch.
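
For example, a sketch of how the resolved file contents could be fed into bulk requests for different indexes. The file names, index names, and the buildBulkRequest helper are hypothetical; client is the Elasticsearch client from the question and fetchFile is the helper above:

// hypothetical mapping of files to target indexes
const sources = [
  { file: 'DocRes.json', index: 'ncar_index' },
  { file: 'OtherDocs.json', index: 'other_index' },
];

// turn one file's newline-delimited JSON into bulk request entries
function buildBulkRequest(data, indexName) {
  return data.toString().split('\n').reduce((bulk, line) => {
    let obj;
    try {
      obj = JSON.parse(line);
    } catch (e) {
      return bulk; // skip blank or partial lines
    }
    bulk.push({ index: { _index: indexName, _id: obj.id } });
    bulk.push(obj);
    return bulk;
  }, []);
}

Promise.all(sources.map((s) => fetchFile(s.file)))
  .then((contents) =>
    Promise.all(
      contents.map((data, i) =>
        client.bulk({ body: buildBulkRequest(data, sources[i].index) })
      )
    )
  )
  .then(() => console.log('Inserted all records.'))
  .catch((e) => console.log(e));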

Writing to a JSON file writes but doesn't save it

Is it normal to write to a JSON file but not have it saved? If so, how do you actually save it once the write is done? After I write to the JSON file and read it again... it comes back unchanged...
module.exports = {
  data: async function () {
    const fs = require('fs');
    let obj;
    fs.readFile('test.json', 'utf8', function readFileCallBack(err, data) {
      if (err) console.log(err);
      else if (data !== undefined) {
        obj = JSON.parse(data);
        obj.table.push({ id: 1, name: 'test' });
        json = JSON.stringify(obj);
        console.log(obj); //{ table: [{id:1, name: 'test'}] }
        fs.writeFile('test.json', json, readFileCallBack);
      }
    });
    fs.readFile('test.json', 'utf8', function readFileCallBack(err, data) {
      if (err) console.log(err);
      else if (data !== undefined) {
        obj = JSON.parse(data);
        console.log(obj); //{ table: [] }
      }
    });
  }
}
My JSON File:
{
  "table": [
  ]
}
Is there something like localStorage.setItem() I need to use to actually save it?
Since JavaScript I/O is asynchronous, both of your fs.readFile calls are executed in parallel, so the second read happens before the write has finished. What you want is to update the data that is in the JSON file. To handle asynchronous code you can use callbacks or promises. Here is an example using promises (async/await):
const fs = require("fs");
const util = require("util");
const defaultObj = {
table: []
};
const readFilePromise = util.promisify(fs.readFile);
const writeFilePromise = util.promisify(fs.writeFile);
async function readFromJSON() {
const data = await readFilePromise('test.json', { encoding: "utf-8" });
return data.toString("utf-8");
}
function writeToJSON(json) {
return writeFilePromise('test.json', json);
}
async function getData() {
let data = await readFromJSON(); // read from JSON file
const obj = data ? JSON.parse(data) : defaultObj;
obj.table.push({ id: 1, name: 'test' });
const json = JSON.stringify(obj);
await writeToJSON(json); // update the data
return obj;
}
module.exports = {
data: getData
};
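
For completeness, a minimal usage sketch of the exported data function; the require path is only a placeholder for wherever the module above is saved:

// hypothetical consumer file
const { data } = require('./data-module'); // placeholder path to the module above

data()
  .then((obj) => console.log('updated JSON:', obj)) // { table: [ { id: 1, name: 'test' } ] }
  .catch((err) => console.error(err));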

Get a list of objects from an S3 bucket (MinIO or Amazon) using a promise

I am trying to get a list of object names from an S3 bucket using the MinIO JavaScript API (https://docs.min.io/docs/javascript-client-api-reference#listObjectsV2). The API returns a stream. However, I always get an empty list.
An example of the dataStream output is:
{
  name: 'sample-mp4-file-1.mp4',
  lastModified: 2020-10-14T02:35:38.308Z,
  etag: '5021b3b7c402468d5b018a8b4a2b448a',
  size: 10546620
}
{
  name: 'sample-mp4-file-2.mp4',
  lastModified: 2020-10-14T15:54:44.672Z,
  etag: '5021b3b7c402468d5b018a8b4a2b448a',
  size: 10546620
}
My function:
public async listFiles(
  bucketName: string,
  prefix?: string
): Promise<string[]> {
  const objectsList = [];
  await minioClient.listObjectsV2(bucketName, "", true, "", function(
    err,
    dataStream
  ) {
    if (err) {
      console.log("Error listFiles: ", err);
      return;
    }
    console.log("Succesfully get data");
    dataStream.on("data", function(obj) {
      objectsList.push(obj.name);
    });
    dataStream.on("error", function(e) {
      console.log(e);
    });
    dataStream.on("end", function(e) {
      console.log("Total number of objects: ", objectsList.length);
    });
  });
  return objectsList;
}
The expected output is a list of object names: [sample-mp4-file-1.mp4, sample-mp4-file-2.mp4]
According to the documentation, listObjectsV2() returns a stream, not a promise. Therefore, await returns immediately, before objectsList contains anything.
The API you're using has to support promises if you want to await them.
You could work around this by doing something like this:
const objectsList = await new Promise((resolve, reject) => {
  const objectsListTemp = [];
  const stream = minioClient.listObjectsV2(bucketName, '', true, '');
  stream.on('data', obj => objectsListTemp.push(obj.name));
  stream.on('error', reject);
  stream.on('end', () => {
    resolve(objectsListTemp);
  });
});
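
Wrapped up as a reusable function, it could look roughly like this. A sketch: the endpoint and credentials are placeholders for your own MinIO/S3 configuration.

const Minio = require('minio');

// placeholder client configuration: replace with your own endpoint and credentials
const minioClient = new Minio.Client({
  endPoint: 'play.min.io',
  accessKey: 'YOUR-ACCESS-KEY',
  secretKey: 'YOUR-SECRET-KEY',
});

// collect the stream's events into a promise that resolves with the object names
function listFiles(bucketName, prefix = '') {
  return new Promise((resolve, reject) => {
    const objectsList = [];
    const stream = minioClient.listObjectsV2(bucketName, prefix, true, '');
    stream.on('data', (obj) => objectsList.push(obj.name));
    stream.on('error', reject);
    stream.on('end', () => resolve(objectsList));
  });
}

// usage: listFiles('my-bucket').then((names) => console.log(names));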

Delay the forEach loop in Node.js

I hope you are well. I am getting data from one API and sending it to the Shopify store API. It works, but it only enters some products: while the API is busy with indexes 0, 1, 2, indexes 3, 4, ... 10 are bypassed. So I think I should delay the forEach loop by 10-15 seconds. Please help me do this. I have tried many times with setTimeout etc., but the forEach loop structure is difficult for me as a new person. Please check the code below. Thanks.
const request = require('request');
const { json } = require('express');
const { Parser } = require('json2csv');
const fastcsv = require('fast-csv');
//const csv = require('csv-parser');
const fs = require('fs');
const { privateDecrypt } = require('crypto');
const { time } = require('console');

const fields = ['Vendor', 'Price', 'SKU', 'error'];
const opts = { fields };

const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const csvWriter = createCsvWriter({
  path: 'C:/Users/IT City/Desktop/arslan.csv',
  header: [
    { id: 'Vendor', title: 'Vendor' },
    { id: 'Price', title: 'Price' },
    { id: 'SKU', title: 'SKU' },
    { id: 'error', title: 'error' },
  ]
});

let new_products = {
  product_final: {
    Vendor: String,
    Price: String,
    SKU: String,
    Error: String,
  }
};

////////// First API from which I am getting the data
const options = {
  method: 'GET',
  url: 'https://api.dexi.io/runs/f58f7795-c11a-478c-b670-c7ae5df8677b/latest/result',
  headers: {
    accept: 'application/json',
    json: true,
    'x-dexiio-access': '9d56e967dfXXXXXXX725e234b311655c96',
    'x-dexiio-account': '5e597feb-99axxxxxxxx-37f1315723ab'
  }
};

products = ['Vendor', 'Price', 'SKU', 'Error']
let product = {};

request(options, function (error, response, body) {
  if (error) throw new Error(error);
  pro = JSON.parse(body);
  ///// Looping through each item from the first API and sending the data to Shopify. But it's entering only 10-12 products
  //// as the handle is busy entering the product.
  /// I have to delay the forEach loop 10, 15 seconds
  pro.rows.forEach(
    row => {
      for (let z = 0; z < row.length; z++) {
        product[pro.headers[z]] = row[z];
        product_final2[pro.headers[z]] = row[z];
      }
      productssdata.push(product_final2)
      products.push(product)
      var Price = product.Price;
      var SKU = product.SKU;
      var Vendor = product.Vendor;
      var body_html = "THISFSDFSDFSDFSDFSFSDF";
      let new_products = {
        product: {
          title: Vendor,
          body_html: Price,
          vendor: Vendor,
          product_type: SKU,
          tags: Price
        }
      };
      const options = {
        method: 'POST',
        url: 'https://0abcfsdfsdf4bb6532f3b#amjad.myshopify.com/admin/api/2020-07/products.json',
        headers: {
          accept: 'application/json',
          'apiKey': '07649cABSDCCSD8ffbae7af02',
          'password': 'sSDCSDFDF',
        },
        body: new_products,
        json: true,
      };
      request(options, function (error, response, body) {
        if (error) throw new Error(error);
        console.log(body)
      });
    }
  );
});
You don't need to use setTimeout() to delay the loop; that's what async and await are for. Let me share an example of how to use await inside a forEach loop.
Step 1: return a promise from a function and await it until it completes.
const wait = async (props) => {
  return new Promise((resolve, reject) => {
    return resolve(Math.random());
  })
}

const x = [1, 2, 3, 4]
x.forEach(async (number) => {
  const num = await wait();
  console.log('start')
  console.log(num);
  console.log('end');
})
Request is deprecated.
It can't be used with await anyway, which makes it inconvenient. There used to be another module, request-promise, that wrapped request and returned a Promise so one could await it, but it is also deprecated.
For these reasons, use Axios or Fetch instead.
You can't use await (or a delay) inside a .forEach() loop, but you can in a for loop.
You think you need to delay the calls because they are asynchronous, but in reality you should simply await each call. Delaying calls with an arbitrary timeout is a dirty workaround.
In the end, you can do something like this:
const axios = require('axios');

(async () => {
  let options = {
    url: "http://......"
  };
  // Add a try/catch block around this to manage errors
  const response = await axios(options);
  const pro = response.data;
  for (let row of pro.rows) {
    options = {
      url: "http://some.other.url"
    };
    // Each call is made one by one and awaited in order
    const rowResponse = await axios(options);
  }
})()
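
If, on top of awaiting each call, you still need to slow the requests down (for example to stay under Shopify's API rate limits), a small sleep helper can be awaited between iterations. A sketch, where the 500 ms delay and the stand-in rows array are only illustrative:

// resolve after the given number of milliseconds
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

(async () => {
  const rows = [1, 2, 3, 4]; // stand-in for pro.rows from the first API call
  for (const row of rows) {
    console.log('sending row to Shopify:', row); // the awaited POST request would go here
    await sleep(500);                            // wait 500 ms before the next iteration
  }
})();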
setTimeout(() => pro.rows.forEach((row) => {...}), 10000)
This runs the forEach once, after 10 seconds. The arrow-function wrapper is needed; without it, the forEach would execute immediately and setTimeout would receive its return value instead of a function.

Parse streamed chunk data into JSON

Hi, I'm trying to display data chunk by chunk, since I'm receiving it in chunks.
For example, let us assume the data is something like this:
data: {
  user: [
    {
      name: 'a',
      bankAccounts: ['123', '234', '567'],
      address: ['some address', 'some other address', 'some more addres']
    },
    {
      name: 'b',
      bankAccounts: ['1233', '2334', '5637'],
      address: ['some address1', 'some other address1', 'some more addres1']
    },
    {
      name: 'c',
      bankAccounts: ['123355', '233455', '563700'],
      address: ['some address12', 'some other address12', 'some more addres12']
    },
  ]
}
But the chunks I'm receiving look something like this:
1st chunk: "data: user: [ {name: a"
2nd chunk: "bankAccounts: ['123', '234', '567'],"
3rd chunk: "address: ['some address', 'some other address', 'some more addres']"
and so on.
I'm receiving the chunked data in a form that can't be converted into JSON, since each chunk is incomplete.
How can I stream this data into the UI?
Any ideas?
My code for fetching the streamed data:
fetch('some url which stream data')
  // Retrieve its body as ReadableStream
  .then(response => {
    const reader = response.body.getReader();
    let decoder = new TextDecoder();
    return new ReadableStream({
      start(controller) {
        return pump();
        function pump() {
          return reader.read().then(({ done, value }) => {
            // When no more data needs to be consumed, close the stream
            let newData = decoder.decode(value, { stream: !done });
            console.log(newData);
            if (done) {
              controller.close();
              return;
            }
            // Enqueue the next data chunk into our target stream
            controller.enqueue(value);
            return pump();
          });
        }
      }
    })
  })
  .then(stream => new Response(stream))
  .then(response => {
    console.log('response', response)
  })
I know that generators are not very commonly used, but I feel they would be perfect for streaming the data in this task.
async function* streamAsyncIterator(stream) {
  const reader = stream.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    yield decoder.decode(value, { stream: !done });
  }
  reader.releaseLock();
}

fetch('https://httpbin.org/stream/1')
  .then(async response => {
    let str = "";
    for await (const value of streamAsyncIterator(response.body))
      str += value;
    return JSON.parse(str);
  })
  .then(response => {
    console.log('response', response)
  })
However, it seems what you want is to parse partially complete JSON, which can be achieved in a variety of ways, for instance with the npm library partial-json-parser:
import partialParse from 'partial-json-parser';

fetch('https://httpbin.org/stream/1')
  .then(async response => {
    let str = "";
    for await (const value of streamAsyncIterator(response.body)) {
      str += value;
      functionUpdatingYourUi(partialParse(str));
    }
    return JSON.parse(str);
  })
  .then(response => {
    console.log('response', response)
  })
You can pass a string (starting with an empty string) to your pump function and keep appending to it while chunks arrive. At the end, when terminating the recursion, return the parsed data.
fetch('some url which stream data')
  // Retrieve its body as ReadableStream
  .then(response => {
    const reader = response.body.getReader();
    let decoder = new TextDecoder();
    return new ReadableStream({
      start(controller) {
        return pump('');
        function pump(str) {
          return reader.read().then(({ done, value }) => {
            // When no more data needs to be consumed, close the stream
            str += decoder.decode(value, { stream: !done });
            console.log(str);
            if (done) {
              controller.close();
              return JSON.parse(str);
            }
            // Enqueue the next data chunk into our target stream
            controller.enqueue(value);
            return pump(str);
          });
        }
      }
    })
  })
  .then(stream => new Response(stream))
  .then(response => {
    console.log('response', response)
  })
See this thread for a more complete discussion and more complete examples from Damian Nadales.
If you are expecting each chunk to be complete JSON, which is not at all guaranteed, you may decode your chunked value (of type Uint8Array) into UTF-8 using TextDecoder.decode, then parse the JSON using JSON.parse. For example:
var num = JSON.parse(
  new TextDecoder("utf-8").decode(result.value)
);
