youtube-dl errors in a JavaScript script

I have a JavaScript script that uses youtube-dl to download ~800 caption files. I'm getting timeout errors and the files aren't downloading; it seems like the loop fires requests faster than my computer can handle. I need help pausing until each download finishes before starting the next one.
It does work when the videos array holds only a small amount of data.
var json2csv = require('json2csv');
var fs = require('fs');
var youtubedl = require('youtube-dl');
// var fields = ["youtube_id", "title", "date", "duration", "captioned", "views"];
var videos = [
// More objects ~800+
];
for (var i = 0; i < videos.length; i++) {
  var v = videos[i];
  var url = 'https://youtu.be/';
  var options = {};
  if (v["captioned"] == 'No') {
    url = url + v["youtube_id"];
    console.log(url);
    options = {
      auto: true,
      all: false,
      lang: 'en',
      cwd: __dirname + "/auto_generated_captions",
    };
    youtubedl.getSubs(url, options, function(err, files) {
      console.log("did i get here?");
      if (err) throw err;
      console.log('subtitle files downloaded:', files);
    });
  }
}

You are right: you are downloading too much data at the same time. Try to control the concurrency with a promise library like Bluebird:
var json2csv = require('json2csv');
var fs = require('fs');
var youtubedl = require('youtube-dl');
var promise = require('bluebird');
// var fields = ["youtube_id", "title", "date", "duration", "captioned", "views"];
var videos = [
// More objects ~800+
];
promise
  .map(videos, function (v) {
    var url = 'https://youtu.be/';
    var options = {};
    if (v["captioned"] == 'No') {
      url = url + v["youtube_id"];
      console.log(url);
      options = {
        auto: true,
        all: false,
        lang: 'en',
        cwd: __dirname + "/auto_generated_captions",
      };
      return new Promise(function (resolve, reject) {
        youtubedl.getSubs(url, options, function (err, files) {
          console.log("did i get here?");
          if (err) {
            reject(err);
          } else {
            console.log('subtitle files downloaded:', files);
            resolve(files);
          }
        });
      });
    } else {
      // nothing to download for this video, so resolve immediately
      return promise.resolve(null);
    }
  }, { concurrency: 5 })
  .then(function (results) {
    console.log(results);
  });
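If strictly sequential downloads are acceptable (one at a time, as the question describes), a plain promise wrapper without Bluebird also works. This is a minimal sketch assuming the same youtubedl.getSubs signature and videos array shown above; the getSubsAsync and downloadAll helper names are made up for illustration:
function getSubsAsync(url, options) {
  // Wrap the callback-style API in a promise so it can be awaited.
  return new Promise(function (resolve, reject) {
    youtubedl.getSubs(url, options, function (err, files) {
      if (err) reject(err);
      else resolve(files);
    });
  });
}

async function downloadAll() {
  for (var i = 0; i < videos.length; i++) {
    var v = videos[i];
    if (v["captioned"] !== 'No') continue;
    var url = 'https://youtu.be/' + v["youtube_id"];
    var options = {
      auto: true,
      all: false,
      lang: 'en',
      cwd: __dirname + "/auto_generated_captions",
    };
    try {
      // await pauses the loop until this download finishes.
      var files = await getSubsAsync(url, options);
      console.log('subtitle files downloaded:', files);
    } catch (err) {
      console.error('failed for', url, err);
    }
  }
}

downloadAll();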

Related

How to convert Excel data to JSON format in Node.js?

I am a beginner in Node.js. I was tasked to read an Excel file and convert it into JSON format so that it can be stored in a MongoDB database.
excel2.js file:
const XlsxStreamReader = require("xlsx-stream-reader");
var fs = require('fs');
const config = require('./excelpush.json');
const db = require('./dbmanager.js');

var finalList = [];
var singlePerson = {};

class ExcelReader {
  readFile() {
    var workBookReader = new XlsxStreamReader();

    workBookReader.on('error', function (error) {
      throw (error);
    });

    workBookReader.on('worksheet', function (workSheetReader) {
      if (workSheetReader.id > 1) {
        workSheetReader.skip();
        return;
      }
      var isFirstLine = true;
      var headerIndex = [];
      workSheetReader.on('row', function (row) {
        if (isFirstLine) {
          headerIndex = row.values.slice(1);
        } else if (!isFirstLine) {
          let rowValues = row.values.slice(1);
          let valueIndex = 0;
          headerIndex.forEach(currentval => {
            singlePerson[currentval] = rowValues[valueIndex];
            valueIndex++;
          });
          finalList.push(singlePerson);
        }
        isFirstLine = false;
      });
      workSheetReader.on('end', function () {
      });
      workSheetReader.process();
    });

    workBookReader.on('end', function () {
      //console.log(finalList);
      console.log('finished!');
    });

    fs.createReadStream(config.filePath).pipe(workBookReader);
  }
}

excelReader = new ExcelReader();
excelReader.readFile();
db.connectDb();
db.insertDb(finalList);
dbmanager.js file:
var mongoose = require('mongoose');
const config = require('./db.json');

function connectDb() {
  mongoose.connect(`mongodb://${config.dbConfig.host}:${config.dbConfig.port}/${config.dbConfig.dbName}`);
  mongoose.connection.once('open', function () {
    console.log('Connection has been made');
  }).on('error', function (error) {
    console.log('error is:' + error);
  });
}

function insertDb(list) {
  var myschema = new mongoose.Schema({}, { strict: false });
  var obj = mongoose.model('myschema', myschema, `${config.dbConfig.collectionName}`);
  var obj1 = new obj(list);
  obj1.save();
}

module.exports = {
  connectDb, insertDb
}
db.json file:
{
  "dbConfig": {
    "host": "localhost",
    "port": "27017",
    "dbName": "PRACTICE",
    "collectionName": "Excel"
  }
}
excelpush.json file:
{
  "filePath": "D:\\grv_tracx_updt (1).xlsx"
}
Here excel2.js takes the Excel file path from excelpush.json, reads the file, and stores the rows as an array of objects in the finalList variable.
The dbmanager.js file contains the database connection and insertion code.
I am not able to store the data in the database: the code executes without errors, but the data is not stored in MongoDB.
Note: the Excel file is large.
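One likely cause, judging from the code above, is that readFile() streams the workbook asynchronously while db.insertDb(finalList) runs immediately, before the workbook 'end' event fires, so finalList is still empty when it is inserted. A second issue is that the same singlePerson object is reused for every row, so finalList ends up holding many references to the last row. Below is a minimal sketch of deferring the insert until the stream has finished and building a fresh object per row; it assumes the same xlsx-stream-reader setup and dbmanager.js module, and the onFinished callback parameter is an addition for illustration:
class ExcelReader {
  readFile(onFinished) {  // onFinished is a callback added for this sketch
    var workBookReader = new XlsxStreamReader();
    workBookReader.on('error', function (error) { throw error; });
    workBookReader.on('worksheet', function (workSheetReader) {
      if (workSheetReader.id > 1) { workSheetReader.skip(); return; }
      var isFirstLine = true;
      var headerIndex = [];
      workSheetReader.on('row', function (row) {
        if (isFirstLine) {
          headerIndex = row.values.slice(1);
        } else {
          var person = {};  // fresh object per row
          row.values.slice(1).forEach(function (value, i) {
            person[headerIndex[i]] = value;
          });
          finalList.push(person);
        }
        isFirstLine = false;
      });
      workSheetReader.process();
    });
    workBookReader.on('end', function () {
      // The stream is done; only now is finalList fully populated.
      onFinished(finalList);
    });
    fs.createReadStream(config.filePath).pipe(workBookReader);
  }
}

var excelReader = new ExcelReader();
db.connectDb();
excelReader.readFile(function (list) {
  db.insertDb(list);  // insert only after reading has finished
});
Separately, insertDb passes the whole array to a single model instance; Mongoose's Model.insertMany(list) is usually a better fit for inserting many rows, though that is outside this sketch.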

Unable to successfully upload images to S3 and then view them

I'm trying to upload images to an S3 bucket as part of the application.
index.js
function upImg(req) {
  if(req.files.img) {
    var img = req.files.image;
    var name = Math.round(Math.random()*10000).toString(); // Returns a random 5 digit number
    if(myDB.uploadImg(img, name)) {
      return name;
    } else {
      return "";
    }
  } else {
    return "";
  }
}

app.post('/newEV*', isLoggedIn, function(req, res) {
  var myURL = req.path.replace('/newEV', '');
  var imgPath = upImg(req);
  fetch(myURL).then(function (events){
    var myID;
    var x = 0;
    while(!myID) {
      if(!events[x]) {
        myID = x;
      } else {
        x++;
      }
    }
    myDB.newEvent(myURL, req.body.name, req.body.desc, req.body.loc, imgPath, req.body.link, req.body.cap, req.body.date, req.body.time, myID, events);
    res.redirect('/edit' + myURL);
  });
});
myDB file
function signs3(file, name) {
  devs3();
  const s3 = new aws.S3();
  const s3Params = {
    Body: file,
    Bucket: S3_BUCKET,
    Key: name
  };
  s3.putObject(s3Params, function(err, data) {
    if(err) {
      throw err;
    } else {
      console.log("Data from putObject:" + JSON.stringify(data));
    }
  });
}

module.exports = {
  uploadImg : function(file, name) {
    var nName = "imgs/" + name;
    console.log(nName);
    signs3(file, nName);
    return true;
  }
}
I know that the signs3 function works because I can use it in other parts of my application to upload JSON files. Whenever I post to the URL, weirdly enough I can see the 'Data from putObject' output in the console, but what I can't see is the nName. I don't understand this, as the console.log(nName) line should run before the other one. When I go to look at the bucket, the image hasn't been uploaded (despite me getting an ETag in the console), and the page does not display it (I know this part also works, because it can display images already uploaded to the bucket).
You want to do something like this, listening for events on the AWS.Request object returned when you call putObject.
const req = s3.putObject(s3Params);
req.on('success', res => {
  console.log('upload complete!');
});
req.on('error', res => {
  console.error(res.error);
});
req.send();
Why does this appear to work differently for small files (JSON files) and large files (images)? Because the large files take longer to upload.
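Because upImg returns before the S3 request has completed, the route can redirect while the object is still uploading (or after it has silently failed). A minimal sketch of one way to wait for the upload, assuming the same aws-sdk v2 client, devs3() helper and S3_BUCKET from above; having uploadImg return a promise is an assumption for illustration, not the original API:
// myDB file: resolve only once S3 has acknowledged the object
module.exports = {
  uploadImg: function(file, name) {
    var nName = "imgs/" + name;
    console.log(nName);
    devs3();
    const s3 = new aws.S3();
    // aws-sdk v2 requests expose .promise(), which settles on success or error
    return s3.putObject({ Body: file, Bucket: S3_BUCKET, Key: nName }).promise();
  }
};

// index.js: only redirect after the upload promise resolves
app.post('/newEV*', isLoggedIn, function(req, res) {
  var myURL = req.path.replace('/newEV', '');
  var name = Math.round(Math.random() * 10000).toString();
  myDB.uploadImg(req.files.image, name)
    .then(function() {
      return fetch(myURL);  // same event lookup as before
    })
    .then(function(events) {
      // ... create the event with `name` as the image path, as in the original handler ...
      res.redirect('/edit' + myURL);
    })
    .catch(function(err) {
      console.error('upload failed:', err);
      res.sendStatus(500);
    });
});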

Uploading multiple images with Formidable and Jimp

I am trying to upload some images to my node backend with formidable middleware.
I want to upload multiple files from the same form. Each file will be stored in the media/temp_files folder, then processed by Jimp into multiple resolutions and stored in a path named after its resolution; finally, all images should be passed on to the next middleware. But I can't get it to work: after uploading both images to the temp folder, it only processes and returns the first image.
I wrote this middleware for formidable:
var fs = require('fs');
var path = require('path');
var Jimp = require('jimp');
var mkdirp = require('mkdirp');
const formidable = require('formidable');

function parse(opts) {
  return (req, res, next) => {
    const form = new formidable.IncomingForm({
      uploadDir: 'media/temp_images',
      keepExtensions: true,
    });
    Object.assign(form, opts);

    form.on('file', function(field, file) {
      var date = new Date();
      var newName = date.getTime() + '.' + file.type.split('/')[1];
      var newPath = form.uploadDir + '/' + newName;
      fs.rename(file.path, newPath);
      file.path = newPath;
    });

    form.parse(req, (err, fields, files) => {
      var processImage = function(image, options) {
        options.sizes.map((item) => {
          var clone = image.clone();
          var uploadPath = path.join(options.folder, 'w' + item.width);
          var filepath = path.join(uploadPath, options.name);
          !fs.existsSync(uploadPath) && mkdirp.sync(uploadPath);
          clone.resize(item.width, item.height).write(filepath);
        });
      };

      var promises = [];
      for (const file in files) {
        var options = {
          sizes: [{ width: 100, height: 200 }, { width: 200, height: 400 }, { width: 600, height: 1200 }],
          folder: path.join('media/images', file.toLowerCase()),
          name: files[file].name,
        };
        var promise = new Promise((resolve, reject) => {
          Jimp.read(files[file].path)
            .then((image) => {
              processImage(image, options);
            })
            .catch((err) => {
              console.error(err);
            });
        });
        promises.push(promise);
      }
      Promise.all(promises);

      if (err) {
        next(err);
        return;
      }
      Object.assign(req, { fields, files });
      next();
    });
  };
}

module.exports = parse;
exports.parse = parse;
I put it in a file, require it with var formidableMiddleware = require('./formidableMiddleware'), and then call app.use(formidableMiddleware()), and that's all.
The problem is that this logic returns only the first item, and I can't make it return both of them. I think it is related to the way I'm using the Promises, but I can't find the bug.
Any advice will be welcome!!!
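One thing that stands out in the middleware above is that the Promise executor never calls resolve or reject, and Promise.all(promises) is never awaited before next() runs, so the request continues before Jimp has finished. Below is a minimal sketch of how that block inside form.parse could be wired instead, keeping the same processImage helper; this is a suggested rework under those assumptions, not a confirmed fix:
const promises = [];
for (const file in files) {
  const options = {
    sizes: [{ width: 100, height: 200 }, { width: 200, height: 400 }, { width: 600, height: 1200 }],
    folder: path.join('media/images', file.toLowerCase()),
    name: files[file].name,
  };
  promises.push(
    Jimp.read(files[file].path).then((image) => {
      processImage(image, options);  // resize and write every configured size
      return files[file];            // the resolved value carries the processed file
    })
  );
}

Promise.all(promises)
  .then(() => {
    Object.assign(req, { fields, files });
    next();                          // continue only after every image was processed
  })
  .catch(next);                      // hand the first failure to Express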

Get all JSON files from a Node folder and find a specific attribute inside

I've got a folder in my Node app with several JSON files (there can be more than 10), and for validation purposes I need to read them, find a specific property, and throw an error if that property occurs in more than one JSON file. What is the best way to do this in terms of performance and efficiency?
For example, my folder is called plugins, and all the JSONs are built like the following:
json1
{
  "action": [
    {
      "delete": {
        "path": "deleteFile",
        "providedAction": "Del"
      }
    },
    {
      "update": {
        "path": "updateFile",
        "providedAction": "UPD"
      }
    }
  ]
}
The next file is valid, since its providedAction = "Add" does not exist in any other JSON.
json2
{
  "action": [
    {
      "add": {
        "path": "addFile",
        "providedAction": "Add"
      }
    }
  ]
}
The next file is not valid, since its providedAction = "UPD" already exists in json1.
JSON 3
{
  "action": [
    {
      "update": {
        "path": "updateFile",
        "providedAction": "UPD"
      }
    }
  ]
}
I need to verify that only one JSON has the action "Del"; if more than one JSON has it, an error should be thrown. What is the recommended way to do this?
Ok, here is the code. If you don't understand something, let me know and I will be glad to help you!
var glob = require("glob");
var fs = require("fs");

var _inArray = function(needle, haystack) {
  for(var k in haystack) {
    if(haystack[k] === needle) {
      return true;
    }
  }
  return false;
}

glob("json/*.json", function(err, files) { // read the folder, or nested folders if you want: e.g. json/**/*.json
  if(err) {
    console.log("cannot read the folder, something went wrong with glob", err);
  }
  var matters = [];
  files.forEach(function(file) {
    fs.readFile(file, 'utf8', function (err, data) { // Read each file
      if(err) {
        console.log("cannot read the file, something went wrong with the file", err);
      }
      var obj = JSON.parse(data);
      obj.action.forEach(function(crud) {
        for(var k in crud) {
          if(_inArray(crud[k].providedAction, matters)) {
            // do your magic HERE
            console.log("duplicate found!");
            // you want to return here and cut the flow, there is no point in keeping reading files.
            break;
          }
          matters.push(crud[k].providedAction);
        }
      });
    });
  });
});
JSON 1:
{"action": [
{
"delete": {
"path": "deleteFile",
"providedAction": "Del"
}
},
{
"update": {
"path": "updateFile",
"providedAction": "UPD"
}
}
]
}
JSON 2:
{
  "action": [
    {
      "add": {
        "path": "addFile",
        "providedAction": "Add"
      }
    }
  ]
}
JSON 3:
{
  "action": [
    {
      "update": {
        "path": "updateFile",
        "providedAction": "UPD"
      }
    }
  ]
}
Not the prettiest code I've written, but here it is:
// Require the nodejs file system library
var fs = require('fs');

var path = '/usr/local/var/jsons';
var delCounter = 0;

// Readdir reads a path and gives an array of filenames
// to the callback handleFiles.
fs.readdir(path, handleFiles);

function handleFiles (err, files) {
  if (err) throw err;
  var i;
  var jsonFilePattern = /\.json$/i;
  var fileName;
  var filePath;
  // Tells fs to read a utf-8 file.
  var fileReadOptions = {
    'encoding': 'utf-8'
  };
  for (i = 0; i < files.length; ++i) {
    fileName = files[i];
    // Check if the file has a .json extension
    if (fileName.match(jsonFilePattern)) {
      filePath = path + '/' + fileName;
      // Open the file as utf-8 and call handleJsonFile back
      // when done reading.
      fs.readFile(filePath, fileReadOptions, handleJsonFile);
    }
  }
}

function handleJsonFile (err, data) {
  if (err) throw err;
  var dataObject = JSON.parse(data);
  var i;
  var action;
  // Loop through all possible actions.
  for (i = 0; i < dataObject.action.length; ++i) {
    action = dataObject.action[i];
    if (action.delete &&
        action.delete.providedAction &&
        action.delete.providedAction === 'Del')
    {
      // If there is a 'Del', add it to the counter.
      ++delCounter;
    }
  }
  if (delCounter > 1) {
    throw new Error('Jsons not valid.');
  }
}
Something like this perhaps? Enjoy!!!
npm install glob
JSON 1
module.exports = {
  "action": [{
    "delete": {
      "path": "deleteFile",
      "action": "Del"
    }
  }]
}
CODE
(function() {
  var glob = require("glob");
  glob("path/to/*.js", function(er, files) {
    if(er) return;
    var x = 0;
    files.forEach(function(file) {
      require(file)['action'].forEach(function(act) {
        if(act.delete && act.delete.action == "Del") x++;
      });
    });
    if(x > 1) throw new Error("more than one 'Del' action"); // or something, ja!
  });
})();
It's 5am and I haven't slept, so sorry if I make mistakes; I only want to show you the way... this is not for copy-paste!! xD
Using modern syntax, reduce and spread could be of great help here:
const fs = require('fs');
const path = require('path');

const dir = './plugins'; // folder containing the JSON files
const files = fs.readdirSync(dir);

files.reduce((acc, curr) => {
  const file = JSON.parse(fs.readFileSync(path.join(dir, curr), 'utf8'));
  const merged = { ...acc, ...file };
  // Check for destructive merging: if merging lost keys, two files shared a key.
  if (Object.keys(file).length + Object.keys(acc).length > Object.keys(merged).length) {
    throw Error('Destructive merge of JSON files.');
  }
  return merged;
}, {});
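Since every file in the question shares the top-level "action" key, a variant that reduces over the providedAction values instead may be closer to what is being validated. A minimal sketch assuming the action array shape shown in the question; the folder name and error message are illustrative:
const fs = require('fs');
const path = require('path');

const dir = './plugins'; // adjust to the folder holding the JSON files

// Collect every providedAction across all files, failing on the first duplicate.
fs.readdirSync(dir)
  .filter((name) => name.endsWith('.json'))
  .reduce((seen, name) => {
    const json = JSON.parse(fs.readFileSync(path.join(dir, name), 'utf8'));
    json.action.forEach((entry) => {
      Object.values(entry).forEach(({ providedAction }) => {
        if (seen.has(providedAction)) {
          throw new Error(`Duplicate providedAction "${providedAction}" in ${name}`);
        }
        seen.add(providedAction);
      });
    });
    return seen;
  }, new Set());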
const fs = require('fs');
const path = require('path');

// dir path that contains all your json files
const dirPath = './something/something';

const files = fs.readdirSync(dirPath);
const arr = [];
files.forEach((val, i) => {
  const file = JSON.parse(fs.readFileSync(path.join(dirPath, val), 'utf8'));
  arr.push(file);
});

if (arr.length === files.length) {
  console.log(arr);
}

NodeJS memory usage

I am playing with Node.js and for this purpose created an email extractor. When I make multiple HTTP requests, the node.exe memory usage in the Windows Task Manager keeps increasing. I understand that Node needs more memory to process the requests, but what I noticed is that this memory usage does not come down even after all requests have been successfully processed.
When I start Node.js it consumes about 35,000K of memory, but after about 80-100 requests this goes up to 50,000K and stays there.
Here is my simple email extractor module:
var request = require('request'),
    cheerio = require('cheerio'),
    async = require('async'),
    urlHelper = require('url');

function Extractor(config) {
  this.baseUrl = config.url;
  this.parsedUrl = urlHelper.parse(config.url);
  this.urls = [];
  this.emails = [];
}

Extractor.prototype.getEmails = function getEmails(html) {
  var foundEmails = html.match(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi) || [];
  if(foundEmails.length) this.emails = this.emails.concat(foundEmails);
}

Extractor.prototype.extract = function extract(html) {
  var $ = cheerio.load(html),
      that = this;
  if($('body')){
    this.getEmails($('body').html());
  }
  if(!this.emails.length){
    $("a[href^='http://" + this.parsedUrl.host + "'], a[href^='https://" + this.parsedUrl.host + "'], a[href^='/'], a[href^='./'], a[href^='../']").each(function(k, v) {
      that.urls.push(urlHelper.resolve(that.baseUrl, $(v).attr('href')));
    });
  }
};

/**
 * Process the base URL
 */
Extractor.prototype.processBase = function processBase(next) {
  request(this.baseUrl, function(err, response, body) {
    return next(err, body);
  });
}

/**
 * Process the internal pages
 */
Extractor.prototype.processInternal = function processInternal(cb) {
  var that = this;
  async.whilst(
    // while this condition returns true
    function () { return that.emails.length === 0 && that.urls.length > 0; },
    // do this
    function (callback) {
      request(that.urls.shift(), function (err, response, body) {
        var $ = cheerio.load(body);
        if($(body)){
          that.getEmails($('body').html());
        }
        callback(); // async internal, needs to be called after we are done with our thing
      });
    },
    // call this if any errors occur. An error also stops the series
    // this is also called on successful completion of the series
    function (err) {
      cb(that);
    }
  );
}

Extractor.prototype.process = function process(next) {
  var that = this;
  this.processBase(function(err, html) {
    if(err) {
      console.log(err);
    } else {
      that.extract(html);
      if(!that.emails.length) {
        that.processInternal(function(res) {
          return next(null, that);
        });
      }
    }
  });
}

module.exports = Extractor;
And here is how I call it:
var express = require('express');
var router = express.Router();
var Extractor = require('../services/Extractor');

router.get('/', function(req, res) {
  res.json({msg: 'okay'});
  var extractor = new Extractor({url: 'http://lior-197784.use1-2.nitrousbox.com:4000/crawl'});
  extractor.process(function(err, res) {});
});

module.exports = router;
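One way to tell whether this is a real leak or just V8 holding on to memory it has already allocated is to log process.memoryUsage() alongside what Task Manager shows; the rss figure often stays high after a burst of requests even when heapUsed drops back down. A small sketch of such an endpoint, assuming it is added to the same Express router above (the /memory path is made up for illustration):
router.get('/memory', function(req, res) {
  var usage = process.memoryUsage();
  res.json({
    rss: Math.round(usage.rss / 1024) + 'K',              // roughly what Task Manager reports
    heapTotal: Math.round(usage.heapTotal / 1024) + 'K',  // memory V8 has reserved for the heap
    heapUsed: Math.round(usage.heapUsed / 1024) + 'K'     // memory actually in use by JS objects
  });
});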
