Stream array of remote files to Amazon S3 in Node.js - javascript

I have an array of URLs to files that I want to upload to an Amazon S3 bucket. There are 2916 URLs in the array and the files have a combined size of 361MB.
I'm trying to accomplish this using streams to avoid using too much memory. My solution works in the sense that all 2916 files get uploaded, but (at least some of) the uploads seem to be incomplete, as the total size of the uploaded files varies between 200MB and 361MB on each run.
// Relevant code below (part of a larger function)
/* Used dependencies and setup:
const request = require('request');
const AWS = require('aws-sdk');
const stream = require('stream');
AWS.config.loadFromPath('config.json');
const s3 = new AWS.S3();
*/
function uploadStream(path, resolve) {
  const pass = new stream.PassThrough();
  const params = { Bucket: 'xxx', Key: path, Body: pass };
  s3.upload(params, (err, data) => resolve());
  return pass;
}

function saveAssets(basePath, assets) {
  const promises = [];
  assets.map(a => {
    const url = a.$.url;
    const key = a.$.path.substr(1);
    const localPromise = new Promise(
      (res, rej) => request.get(url).pipe(uploadStream(key, res))
    );
    promises.push(localPromise);
  });
  return Promise.all(promises);
}

saveAssets(basePath, assets).then(() => console.log("Done!"));
It's a bit messy with the promises, but I need to be able to tell when all files have been uploaded, and this part seems to work well at least (it writes "Done!" after ~25 secs when all promises are resolved).
I am new to streams so feel free to bash me if I approach this the wrong way ;-) Really hope I can get some pointers!

It seems I was trying to complete too many requests at once. Using async.eachLimit I now limit my code to a maximum of 50 concurrent requests, which is the sweet spot for me in terms of the trade-off between execution time, memory consumption and stability (all of the downloads complete every time!).
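For anyone curious what that looks like in practice, here is a minimal sketch of the async.eachLimit pattern using the same request/stream/s3 setup from the question; the limit of 50 and the error handling are illustrative, not the exact code I ended up with:
const async = require('async');
// request, stream and s3 are the same objects set up in the snippet above.

function saveAssetsLimited(assets, done) {
  // Run at most 50 uploads at any one time; eachLimit only starts the next
  // asset once one of the in-flight iteratees calls its callback.
  async.eachLimit(assets, 50, (a, cb) => {
    const url = a.$.url;
    const key = a.$.path.substr(1);
    const pass = new stream.PassThrough();
    // cb fires once this single upload has fully completed (or failed).
    s3.upload({ Bucket: 'xxx', Key: key, Body: pass }, err => cb(err));
    request.get(url).pipe(pass);
  }, err => {
    // Runs after every asset has been processed, or on the first error.
    if (err) return console.error(err);
    done();
  });
}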

Related

React Native Expo - FileSystem readAsStringAsync Byte Allocation Failed (Out of Memory)

I am creating an Android app using React Native with Expo modules (FileSystem and Expo AV) to record a local video using the phone's camera, then I send the encoded base64 video to the server.
The code to send the base64 string looks like this:
const encodeBase64 = async () => {
  const fileUri = videoUri;
  const options = {
    encoding: FileSystem.EncodingType.Base64,
  };
  let result = await FileSystem.readAsStringAsync(fileUri, options);
  return result;
};

const upload = async () => {
  const base64 = await encodeBase64(videoUri);
  const result = await myAPI(base64);
};
It works on my phone (Oppo A3s), but on another phone such as a Samsung A51 it gives a memory allocation error (byte allocation failed / out of memory).
How can I solve this problem?
This is a memory error; every phone has a different amount of memory available.
You can use a chunked buffer: split your base64 data on the client, post it to the server in pieces, and combine the pieces back together on the server.
ex: client => chunk buffer => 1024*100 (size)
server => combine (array of client's data)
Good luck. If you have any questions, please contact me and I will help anytime.
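A rough sketch of that idea on the client (the chunk size of 1024*100, the endpoint URL, and the field names are assumptions for illustration, not part of the original answer):
const CHUNK_SIZE = 1024 * 100; // ~100 KB per request, as suggested above

async function uploadInChunks(base64, uploadId) {
  const total = Math.ceil(base64.length / CHUNK_SIZE);
  for (let i = 0; i < total; i++) {
    const chunk = base64.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE);
    // The server stores the chunks for this uploadId in order and, once it
    // has received `total` of them, joins them back into one base64 string.
    await fetch('https://example.com/upload-chunk', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ uploadId, index: i, total, chunk }),
    });
  }
}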

How to download a huge list of remote files using the Node.js http get method without running into errors

I'm trying to download a list of files generated by an internal processing system via the HTTP get method in Node.js. For a single file or for a few files it works fine, and there is an answer for that already here on Stack Overflow. However, the problem occurs when you try to download a huge list of files with async requests; the system simply times out and throws an error.
So it's more of a scalability issue. The best way would be to download the files one by one, or a few files in one go, and then move on to the next batch, but I'm not sure how to do that. Here is the code I have so far, which works fine for a few files, but in this case I have ~850 files (a few MBs each) and it does not work:
const https = require("http");
var fs = require('fs');
//list of files
var file_list = [];
file_list.push('http://www.sample.com/file1');
file_list.push('http://www.sample.com/file2');
file_list.push('http://www.sample.com/file3');
.
.
.
file_list.push('http://www.sample.com/file850');
file_list.forEach(single_file => {
const file = fs.createWriteStream('files/'+single_file ); //saving under files folder
https.get(single_file, response => {
var stream = response.pipe(single_file);
stream.on("finish", function() {
console.log("done");
});
});
});
It runs fine for a few files, creates a lot of empty files in the files folder, and then throws this error:
events.js:288
throw er; // Unhandled 'error' event
^
Error: connect ETIMEDOUT 192.168.76.86:80
at TCPConnectWrap.afterConnect [as oncomplete] (net.js:1137:16)
Emitted 'error' event on ClientRequest instance at:
at Socket.socketErrorListener (_http_client.js:426:9)
at Socket.emit (events.js:311:20)
at emitErrorNT (internal/streams/destroy.js:92:8)
at emitErrorAndCloseNT (internal/streams/destroy.js:60:3)
at processTicksAndRejections (internal/process/task_queues.js:84:21) {
errno: 'ETIMEDOUT',
code: 'ETIMEDOUT',
syscall: 'connect',
address: '192.168.76.86',
port: 80
}
It seems to put a huge load on the network; downloading these one by one, or a few at a time, might work better. Please suggest the best scalable solution if possible. Thanks.
The issue is that you're loading them all at the same time, essentially DDoSing the server. You need to limit the number of concurrent downloads and work through the list as a queue.
Here is a simplified example of what that might look like (untested).
const MAX_THREADS = 3;
const https = require("http");
const fs = require("fs");

const threads = [];

//list of files
const file_list = [];

file_list.push("http://www.sample.com/file1");
file_list.push("http://www.sample.com/file2");
file_list.push("http://www.sample.com/file3");
// ...
file_list.push("http://www.sample.com/file850");

const getFile = (single_file, callback) => {
  const file = fs.createWriteStream("files/" + single_file); //saving under files folder
  https.get(single_file, (response) => {
    const stream = response.pipe(file); // pipe into the write stream, not the URL string
    stream.on("finish", function () {
      console.log("done");
      callback(single_file);
    });
  });
};

const process = () => {
  if (!file_list.length) return;
  const file = file_list.shift(); // take the next URL off the front of the list
  getFile(file, process); // the loop
};

while (threads.length < MAX_THREADS) {
  const thread = "w" + threads.length;
  threads.push(thread);
  process();
}
You don't even need the worker array; a plain for loop to start them would be enough, but you could push an object into the thread pool and use it to keep stats and handle advanced features like retries or throttling.
You're sending a zillion requests to the target server all at once. This will massively load the target server and will consume a lot of your local resources as you try to handle all the responses.
The simplest scheme for this is to send one request, when you get the response, send the next and so on. This would only ever have one request in flight at the same time.
You can typically improve throughput by managing a small number of requests in flight at the same time (perhaps 3-5).
And, if the target server implements rate limiting, then you may have to slow down the pace of requests you send to it (no more than N per 60 seconds).
There are lots of ways to do this. Here are pointers to some functions that implement a few of them.
mapConcurrent() here and pMap() here: These let you iterate an array, sending requests to a host, while ensuring that you only ever have N requests in flight at the same time, where you decide the value of N.
rateLimitMap() here: Lets you manage how many requests per second are sent.
I would personally do something like this:
// currentIndex is the index of the next file to fetch
let currentIndex = 0;
// numWorkers is the maximum number of simultaneous downloads
const numWorkers = 10;
// promises holds each of our workers' promises
const promises = [];

// getNextFile will download the next file, and after finishing, will
// then download the next file in the list, until all files have been
// downloaded
const getNextFile = (resolve) => {
  if (currentIndex >= file_list.length) return resolve();
  const currentFile = file_list[currentIndex];
  // increment index so any other worker will not get the same file.
  currentIndex++;
  const file = fs.createWriteStream('files/' + currentFile);
  https.get(currentFile, response => {
    const stream = response.pipe(file);
    stream.on("finish", function() {
      if (currentIndex === file_list.length) {
        resolve();
      } else {
        getNextFile(resolve);
      }
    });
  });
};

for (let i = 0; i < numWorkers; i++) {
  promises.push(new Promise((resolve, reject) => {
    getNextFile(resolve);
  }));
}

Promise.all(promises).then(() => console.log('All files complete'));

How do I reassemble a chunked POST request with an MP3 blob in the payload using Express.js?

I'm building a speech-to-audio web app that takes mic input, converts the recording to an MP3 (using the mic-recorder-to-mp3 NPM package), and then sends it to the Node.js/Express server side for storage and to pass along as a subsequent POST request to the speech-to-text API (rev.ai).
The recording works fine on the UI: I have the recording playing in an <audio> tag, it sounds fine, and it is the full-length recording:
stopBtn.addEventListener("click", () => {
  recorder
    .stop()
    .getMp3().then(([buffer, blob]) => {
      let newBlob = new Blob(buffer);
      recordedAudio.src = URL.createObjectURL(blob);
      recordedAudio.controls = true;
      sendData(blob);
    }).catch((e) => {
      console.log(e);
    });
});

function sendData(blob) {
  let fd = new FormData();
  fd.append('audio', blob);
  fetch('/audio', {
    headers: { Accept: "application/json", "Transfer-Encoding": "chunked" },
    method: "POST", body: fd
  });
}
Now, at first in my server-side express route I was seeing multiple requests coming through per recording and thought it was an error that I could sort out later, so I put a quick boolean check to see if the request was already being processed and if so just res.end() back to the UI.
This was all good and fine until I realized that only the first 4 seconds of the recording were being saved. This 4-second recording saved fine as an MP3 on the server side, plays correctly when opened in a music app, and was transcribed correctly by rev.ai, but still it was only 4 seconds.
I realized that the audio blob was being sent in chunks to the server and each chunk was part of the multiple requests I was seeing. So then I started looking into how to reassemble the chunks into one audio blob that can be saved as an MP3 and parsed correctly as audio by rev.ai, but nothing I've tried so far has worked. Here is my latest attempt:
app.post("/audio", async (req, res) => {
let audioBlobs = [];
let audioContent;
let filename = `narr-${Date.now()}.mp3`;
//let processed = false;
req.on('readable', async () => {
//if(!processed){
//processed = true;
//let audioChunk = await req.read();
//}
while(null !== (audioChunk = await req.read())) {
console.log("adding chunk")
audioBlobs.push(audioChunk);
}
});
req.on("end", () => {
audioContent = audioBlobs.join('');
fs.writeFile(`./audio/${filename}`, audioContent, async function(err) {
if (err) {
console.log("an error occurred");
console.error(err);
res.end();
}
const stream = fs.createReadStream(`./audio/${filename}`);
let job = await client.submitJobAudioData(stream, filename, {}).then(data => {
waitForRevProcessing(data.id);
}).catch(e => {
console.log("caught an error");
console.log(e);
});
res.end();
})
});
});
The blob is saved on the server-side with this code, but it's not playable in a music app and rev.ai rejects the recording as it does not interpret the blob as an audio file.
Something about the way I'm reassembling the chunks is corrupting the integrity of the MP3 format.
I'm thinking this could be for a few reasons:
The chunks could be arriving on the server side out of order, although that wouldn't make a whole lot of sense, considering that when I had the boolean check in place it was seemingly saving the first chunk and not a mid-stream chunk
The last chunk is being left "open" or there's some metadata that's missing or padding that's messing with the encoding
These might not be the correct events to listen to for starting/ending the assembly
I'm hoping that Express/the http node module have something built-in to automatically handle this and I'm doing this manual reassembly unnecessarily - I was pretty surprised there was nothing off-the-shelf in Express to handle this, but maybe it's not as common a use case as I imagined?
Any help that can be offered would be greatly appreciated.
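For what it's worth, the body of that POST is multipart/form-data (it was built with FormData on the client), so the raw chunks read off the request also contain the multipart boundaries and headers, which on its own will corrupt the MP3. One common approach is to let a multipart parser such as multer reassemble the file; a minimal sketch under that assumption (the route and field name mirror the question, the storage paths are illustrative):
const express = require("express");
const multer = require("multer");
const fs = require("fs");

const app = express();
// multer collects the whole multipart upload and writes the file to disk.
const upload = multer({ dest: "./audio" });

// "audio" must match the field name used in fd.append('audio', blob) on the client.
app.post("/audio", upload.single("audio"), (req, res) => {
  const filename = `narr-${Date.now()}.mp3`;
  // multer stores the file under a random temporary name; give it an .mp3
  // extension before handing it to the transcription step.
  fs.renameSync(req.file.path, `./audio/${filename}`);
  res.json({ file: filename });
});

app.listen(3000);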

Most efficient way to download array of remote file URLs in Node?

I'm working on a Node project where I have an array of files such as
var urls = ["http://web.site/file1.iso", "https://web.site/file2.pdf", "https://web.site/file3.docx", ...];
I'm looking to download those files locally in the most efficient way possible. There could be as many as several dozen URLs in this array... Is there a good library that would help me abstract this out? I need something that I can call with the array and the desired local directory that will follow redirects, work with http & https, intelligently limit simultaneous downloads, etc.
node-fetch is a lovely little library that brings fetch capability to node. Since fetch returns a promise, managing parallel downloads is simple. Here's an example:
const fetch = require('node-fetch')
const fs = require('fs')

// You can expand this array to include as many urls as required
const urls = ['http://web.site/file1.iso', 'https://web.site/file2.pdf']

// Here we map the list of urls -> a list of fetch requests
const requests = urls.map(url => fetch(url))

// Now we wait for all the requests to resolve and then save them locally
Promise.all(requests).then(files => {
  files.forEach(file => {
    file.body.pipe(fs.createWriteStream('PATH/FILE_NAME.EXT'))
  })
})
Alternatively, you could write each file as it resolves:
const fetch = require('node-fetch')
const fs = require('fs')

const urls = ['http://web.site/file1.iso', 'https://web.site/file2.pdf']

urls.map(file => {
  fetch(file).then(response => {
    response.body.pipe(fs.createWriteStream('DIRECTORY_NAME/' + file))
  })
})
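The question also asks about intelligently limiting simultaneous downloads, which neither snippet above does. One way to bolt that on is to work through the array in small batches; the batch size of 5 and the file-naming rule below are arbitrary choices, not something node-fetch provides:
const fetch = require('node-fetch')
const fs = require('fs')

async function downloadAll(urls, dir, limit = 5) {
  // Process `limit` urls at a time instead of opening every connection at once.
  for (let i = 0; i < urls.length; i += limit) {
    const batch = urls.slice(i, i + limit)
    await Promise.all(batch.map(async url => {
      const response = await fetch(url)
      const name = url.split('/').pop() // derive a local file name from the url
      await new Promise((resolve, reject) => {
        const out = fs.createWriteStream(dir + '/' + name)
        response.body.pipe(out)
        out.on('finish', resolve)
        out.on('error', reject)
      })
    }))
  }
}

// Usage with the urls array from above:
downloadAll(urls, 'downloads').then(() => console.log('all downloads complete'))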

How to download a large S3 file with AWS Lambda (javascript)

I have been struggling to get the following code to work properly. I am using the serverless framework and it works beautifully when invoked locally. I have a whole workflow where I take the downloaded file and process it before uploading back to S3. Ideally I would like to process line-by-line, but small steps first!
I can download a small file from S3 with no problems, but the file I am handling is much larger and shouldn't be in memory.
const AWS = require("aws-sdk");
const S3 = new AWS.S3();
const fs = require("fs");
module.exports = async function (event, context, callback) {
let destWriteStream = fs.createWriteStream("/tmp/a.csv");
let s3Input = { Bucket : "somebucket", Key : "somekey.csv" };
await new Promise((resolve, reject) => {
let s3Stream = S3.getObject(s3Input)
.createReadStream()
.on("close", function() {
resolve("/tmp/a.csv");
})
.pipe( destWriteStream );
}).then(d => {
console.log(d);
});
callback( null, "good" );
};
I have tried many different ways of doing this, including Promises. I am running Node 8.10. It is just not working when run as a Lambda; it simply times out.
My issue is that I see very little complete examples. Any help would be appreciated.
I managed to get this working. Lambda really does not do a good job with some of its error reporting.
The issue was that the bucket I was downloading from was in a different region than the one the Lambda was hosted in. Apparently this does not make a difference when running it locally.
So ... for others that may tread here ... check your bucket locations relative to your Lambda region.
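For anyone hitting the same thing, the client can also be pointed at the bucket's region explicitly when it is constructed; the region string below is just an example and should match where the bucket actually lives:
const AWS = require("aws-sdk");
// Explicitly target the region the bucket lives in, rather than the Lambda's default.
const S3 = new AWS.S3({ region: "eu-west-1" });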
