Express and Mongodb insert same data multiple times - javascript

I am quite new to Express and MongoDB. The project that I am working on requires me to:
Take an object that contains multiple URLs
Download the content of each URL and save it to cloud storage
Generate a link for each file saved
Save these links to MongoDB as individual documents
The incoming object looks something like this:
{
    "id": 12345678,
    "attachments": [
        { "original_url": "https://example.com/1.png" },
        { "original_url": "https://example.com/2.png" },
        { "original_url": "https://example.com/3.png" }
    ]
}
The end goal is to have 3 separate documents like this saved in MongoDB:
{
    "id": 87654321,
    "some_other_data": "etc",
    "new_url": "https://mycloudstorage.com/name_1.png"
}
I have a simple loop like this:
for (var i = 0; i < original_data.attachments.length; i++) {
    var new_url = "https://example.com/" + i + ".png";
    download(original_url, new_url, function(new_url) {
        console.log(new_url);
        new_data.new_url = new_url;
        save_new_url_to_mongodb(new_data);
    });
}
and the saving function looks like this:
function save_new_url_to_mongodb(data, cb) {
    getCollection(collection, function (err, collection) {
        if (err) {
            return cb(err);
        }
        collection.insert(data, {w: 1, ordered: false}, function (err, result) {
            if (err) {
                return cb(err);
            }
            var item = fromMongo(result.ops);
            cb(null, item);
        });
    });
}
var download = function(original_url, new_url, callback) {
    request.head(original_url, function(err, res, body) {
        if (res === undefined) {
            console.log(err);
        } else {
            var localUrlStream = request(original_url);
            var file = bucket.file(new_url);
            var remoteWriteStream = file.createWriteStream();
            var stream = localUrlStream.pipe(remoteWriteStream);
            stream.on('error', function (err) {
                next(err);
            });
            stream.on('finish', function() {
                callback(new_url);
            });
        }
    });
};
The downloading part is fine; I get 3 different image files in my cloud storage, and the console.log also prints 3 different new URLs.
The problem is that the newly saved MongoDB documents all have the same new_url. And sometimes, if there are more original_url entries in the original data, some of the new documents fail to save.
Thanks a lot

It's a scoping issue in your assignment of new_url inside the for loop. See: JavaScript closure inside loops – simple practical example
A solution is to use Array.prototype.forEach, which inherently avoids the scoping issue, since each iteration creates a new closure for the callback:
original_data.attachments.forEach(function(attachment, i) {
    var new_url = "https://example.com/" + i + ".png";
    download(attachment.original_url, new_url, function(new_url) {
        console.log(new_url);
        // build a fresh object per iteration as well; mutating one shared
        // new_data object would reintroduce the same duplicated-URL problem
        var new_data = { new_url: new_url };
        save_new_url_to_mongodb(new_data);
    });
});
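If your Node version supports ES6, let gives you the same per-iteration scoping without changing the loop shape. A minimal sketch (the document fields and the target file name are assumptions based on the question):
for (let i = 0; i < original_data.attachments.length; i++) {
    // let creates a new binding per iteration, so each callback
    // closes over its own new_url
    let new_url = "name_" + i + ".png";
    download(original_data.attachments[i].original_url, new_url, function(new_url) {
        // a fresh document per iteration, never a shared mutable object
        save_new_url_to_mongodb({ id: original_data.id, new_url: new_url }, function(err) {
            if (err) console.log(err);
        });
    });
}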

Related

Scraping Cheerio - Getting only element[k]

Problem: with this code I get a list of every match of a given element (h1 here).
I would like to get only match number k of this element (where k can vary).
Ultimately, I would like to get, say, element1[i] and element2[j], where element1 and element2 can each be a class, an id, etc.,
and then concatenate element1, 2, ..., n into a JSON file.
I think this is a very basic question, but I am quite new to jQuery...
'use strict';
var url = "http://www.website.html",
    rech = "h1";

// Import the dependencies
var cheerio = require("cheerio"),
    req = require("tinyreq");

// Define the scrape function
function scrape(url, data, cb) {
    // 1. Create the request
    req(url, (err, body) => {
        if (err) { return cb(err); }
        // 2. Parse the HTML
        let $ = cheerio.load(body),
            pageData = {};
        // 3. Extract the data
        Object.keys(data).forEach(k => {
            pageData[k] = $(data[k]).text();
        });
        // Send the data in the callback
        cb(null, pageData);
    });
}

// Extract some data from my website
scrape(url, {
    rech
}, (err, data) => {
    console.log(err || data);
});
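Cheerio supports jQuery's .eq(), which narrows a matched set to the single element at a given index. A minimal sketch along the lines of the scrape function above (scrapeNth and the k parameter are illustrative, not part of the original code):
// Like scrape(), but keeps only match number k of each selector
function scrapeNth(url, data, k, cb) {
    req(url, (err, body) => {
        if (err) { return cb(err); }
        let $ = cheerio.load(body),
            pageData = {};
        Object.keys(data).forEach(key => {
            // .eq(k) selects the k-th matched element (0-based)
            pageData[key] = $(data[key]).eq(k).text();
        });
        cb(null, pageData);
    });
}
// e.g. the text of the third h1 only
scrapeNth(url, { rech: "h1" }, 2, (err, data) => {
    console.log(err || data);
});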

Select using recursivity to iterate node js

I'm iterating through a MySQL database with 20000 rows. The problem is that for each row I need to call a URL, get its content, and update the database. How do I wait for the whole process to finish before continuing the iteration? And how can I make it faster, e.g. do two or three at a time? Thanks.
var query = connection.query('SELECT * from product where product.product_description = "0" ', function(err, rows, fields) {
    kontador = 1;
    if (!err) {
        var url = rows[0].url;
        url = url.replace('../..', '');
        //console.log(url);
        id = rows[0].id;
        url = 'http://example.com' + url;
        doCall(url, id, kontador, function(response) {
            console.log(response, kontador);
            if (response && kontador <= rows.length) {
                var url = rows[kontador].url;
                url = url.replace('../..', '');
                id = rows[kontador].id;
                url = 'http://www.example2.com' + url;
                //console.log(id);
                doCall(url, id, kontador, doCall);
                kontador += 1;
            }
        });
    } else {
        console.log('Error while performing Query.');
    }
});
function doCall(urlToCall, id, kontador, callback) {
    request({'url': urlToCall}, function(error, response, html) {
        //console.log('inside');
        //console.log(error);
        if (!error) {
            var $ = cheerio.load(html);
            $('#content').filter(function() {
                var data = $(this);
                data = data.find('p');
                // console.log('-');
                // console.log(data.html());
                var queryy = connection.query(' UPDATE product SET product_description = "' + data.html() + '" WHERE id = ' + id, function(err, rows, fields) {
                    if (!err) {
                        console.log('updated! ');
                        return callback(true);
                    } else {
                        console.log('error sql!');
                    }
                });
                //process.exit();
            });
        }
    });
}
In order to orchestrate the async behavior of your application (what can run in parallel, whether there should be throttling, ...), you should use an existing library such as:
async - https://www.npmjs.com/package/async - if you prefer Node.js callback style
bluebird - http://bluebirdjs.com/docs/getting-started.html - if you prefer promises
highland - https://www.npmjs.com/package/highland - somewhat hybrid, stream-like
There are many other libraries that can help you build complex async call graphs.
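For example, the async package can cap how many rows are processed concurrently. A rough sketch reusing doCall from the question (note that doCall as written only calls back on success, so in practice you would want it to call back on errors too):
var async = require('async');
connection.query('SELECT * from product where product.product_description = "0" ', function(err, rows, fields) {
    if (err) { return console.log('Error while performing Query.'); }
    // at most 2 doCall()s are in flight at any one time
    async.eachLimit(rows, 2, function(row, done) {
        var url = 'http://example.com' + row.url.replace('../..', '');
        doCall(url, row.id, 0, function(response) {
            done(); // this row is finished; async starts the next one
        });
    }, function(err) {
        console.log('all rows processed');
    });
});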

NodeJS - Given a full path of a directory, how to get a list of all subdirectories and check if a file exists here asynchronously?

For example, given '/Users/John/Desktop/FooApp',
I would like to get a list such as:
['/Users/John/Desktop/FooApp',
'/Users/John/Desktop/FooApp/Folder1',
'/Users/John/Desktop/FooApp/Folder2',
'/Users/John/Desktop/FooApp/Folder2/folderA',
'/Users/John/Desktop/FooApp/Folder3',
'/Users/John/Desktop/FooApp/Folder3/folderX',
'/Users/John/Desktop/FooApp/Folder3/folderX/folderY',
'/Users/John/Desktop/FooApp/Folder3/folderX/folderY/folderZ',
'/Users/John/Desktop/FooApp/Folder3/folderX/folderY2'
]
I require this list to search through all the directories and check for the existence of a file. The user inputs a folder, and I basically perform a check similar to a finder in an OS. I am planning to check fs.exists(subdir + '/mylib.dll') on all the subdirectories. Is there any neat way to do that?
I have converted an answer to a similar question here, where the search was performed for files instead of directories. I also used async to check whether the file exists. I found out that fs.exists is about to be deprecated, and decided to go with fs.open instead.
Anyway, here is the snippet:
var fs = require('fs');
var getDirs = function(dir, cb) {
    var dirs = [dir];
    fs.readdir(dir, function(err, list) {
        if (err) return cb(err);
        var pending = list.length;
        if (!pending) return cb(null, dirs);
        list.forEach(function(subpath) {
            var subdir = dir + '/' + subpath;
            fs.stat(subdir, function(err, stat) {
                if (err) return cb(err);
                if (stat && stat.isDirectory()) {
                    getDirs(subdir, function(err, res) {
                        if (err) return cb(err);
                        // res already includes subdir itself as its first
                        // element, so there is no need to push it separately
                        dirs = dirs.concat(res);
                        if (!--pending) cb(null, dirs);
                    });
                } else {
                    if (!--pending) cb(null, dirs);
                }
            });
        });
    });
};
One can then use it as:
var async = require('async');
getDirs('/Users/John/Desktop/FooApp', function(err, list) {
    if (err) return console.log('Error retrieving sub-directories');
    async.detect(list, function(dir, cb) {
        fs.open(dir + '/mylib.dll', 'r', function(err, fd) {
            if (err) cb(false);
            else cb(true);
        });
    }, function(dir) {
        if (!dir) return console.log('File Not Found');
        /* Do something with the found file ... */
    });
});
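As an aside, since fs.exists is deprecated: fs.access is the usual replacement for a plain existence check, and unlike the fs.open calls above it never opens a file descriptor that would need closing. A minimal sketch:
var fs = require('fs');
// true if the path exists (and is visible to the process); no fd is opened
function fileExists(path, cb) {
    fs.access(path, function(err) {
        cb(!err);
    });
}
fileExists('/Users/John/Desktop/FooApp/Folder1/mylib.dll', function(exists) {
    console.log(exists ? 'found' : 'not found');
});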

Getting values within a nest for-loop - node.js, javascript, redis, Q library

I am trying to extract values out of a nested for-loop. My loop takes values from Redis, and I want to add these values to an array variable called "info".
The important bit is the for-loop.
app.get('/query', function(req, res) {
    var info = [];
    redisClient.keys("todo:*", function(err, data) {
        if (err) return console.log(err);
        for (var i = 0, len = data.length; i < len; i++) {
            var id = data[i];
            var listItem, author, goodness;
            redisClient.hgetall(data[i], function(err, obj) {
                listItem = obj.listItem;
                author = obj.author;
                goodness = {
                    id: id,
                    listItem: listItem,
                    author: author
                };
                info.push(goodness);
                console.log("In here: " + info);
            });
            console.log("Out here: " + info);
        }
        console.log("Way out here: " + info);
    });
    console.log("WAY THE HECK OUT HERE: " + info);
});
Basically, I want the values in the variable goodness to be pushed to an array variable called info. When I execute the code, the info array gets filled up here:
console.log("In here: " + info);
but I have not found a way to get the info array to hold those values outside of the redisClient.hgetall() callback. I have tried return statements to no avail, though as a beginning programmer there is a decent chance I'm not using them properly.
NOTE: I have tried to follow the guidance in the original answer, and I must be doing something wrong, or the solution given wasn't good enough, or both. I have added the Q library to my project and tried to find a solution. Here is my current code:
app.get('/query', function(req, res) {
    var redisKeys = Q.nbind(redisClient.keys, redisClient);
    var redisHGetAll = Q.nbind(redisClient.hgetall, redisClient);
    var id, listItem, author;
    var info = [];
    redisKeys('todo:*').then(function (data) {
        console.log("First: " + data);
        var QAll = Q.all(data.map(processKeysFunc(info)));
        console.log("Reading within this: " + data);
        console.log("Next within this: " + QAll);
    }, function (err) {
        if (err) return console.log(err);
    }).then(function () {
        console.log("Finally: " + data);
    })();
    function processKeysFunc(array) {
        return function (key) {
            console.log("This is the key: " + key);
            return redisHGetall(key).then(function (obj) {
                console.log("This is the obj: " + obj);
                array.push({
                    id: obj.id,
                    listItem: obj.listItem,
                    author: obj.author
                });
            });
        };
    }
});
And this is what I get within my console.log:
First: todo:281f973d-6ffd-403b-a0f4-9e8958caed35,todo:7ed8c557-0f15-4555-9119-6777e1c952e8,todo:eb8dbee1-92ca-450e-8248-ad78548cd754,todo:712e8d27-bf9b-46f0-bfdd-c53ef7d14441,todo:301dd91a-2b65-4b87-b129-a5ad569e38e5,todo:720d98b8-bdec-446d-a178-fb7e264522aa,todo:d200c6cf-2ee5-443b-b7dc-d245c16899c8,todo:8169e9af-0204-42c8-9ddf-3b00f7161b11
This is the key: todo:281f973d-6ffd-403b-a0f4-9e8958caed35
Node.js is generally non-blocking; this is why callbacks are used. The callback passed to .hgetall will only execute once the data from Redis has been fully received. The rest of the code around it executes immediately and does not wait for the data from Redis. In fact, since the .hgetall call involves IO, the callback will always run after the surrounding code has executed.
You have a few options, including using promises (https://github.com/kriskowal/q).
I'll suggest a solution that should be comprehensible given the current structure of your code:
app.get('/query', function(req, res) {
    var info = [];
    var completed = 0;
    redisClient.keys("todo:*", function(err, data) {
        if (err) return console.log(err);
        // forEach scopes id per iteration, so each hgetall callback
        // sees the key it was started for
        data.forEach(function(id) {
            redisClient.hgetall(id, function(err, obj) {
                completed++;
                if (err) return console.log(err);
                var goodness = {
                    id: id,
                    listItem: obj.listItem,
                    author: obj.author
                };
                info.push(goodness);
                console.log("In here: " + info);
                if (completed === data.length) {
                    // Do something with info here, all work has finished
                }
            });
        });
    });
});
The key bit is the new variable completed, which tracks how many of the hgetall callbacks have returned; once it equals data.length, all of the work has finished. (If data can be empty, the loop body never runs and the check never fires, so guard for that case.)
I would highly recommend using promises instead, though. Something like:
var Q = require('q');
var redisKeys = Q.nbind(redisClient.keys, redisClient);
var redisHGetall = Q.nbind(redisClient.hgetall, redisClient);

app.get('/query', function(req, res) {
    var info = [];
    redisKeys('todo:*').then(function (data) {
        return Q.all(data.map(processKeysFunc(info)));
    }, function (err) { /* handle error */ }).then(function () {
        console.log('Complete info array=%j', info);
        res.setHeader('Content-Type', 'application/json');
        res.end(JSON.stringify(info));
    });
});

function processKeysFunc(array) {
    return function (key) {
        return redisHGetall(key).then(function (obj) {
            var goodness = {
                id: key,
                listItem: obj.listItem,
                author: obj.author
            };
            array.push(goodness);
        });
    };
}
processKeysFunc returns a function so that you can cleanly pass the info array around without needing to define every function inline. If you prefer inline, that works too though.
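On Node 8+ the same shape works without Q, using util.promisify and native promises. A sketch under the same assumptions (the redisClient and Express app from above):
const util = require('util');
const redisKeys = util.promisify(redisClient.keys).bind(redisClient);
const redisHGetall = util.promisify(redisClient.hgetall).bind(redisClient);

app.get('/query', function(req, res) {
    redisKeys('todo:*')
        .then(keys => Promise.all(keys.map(key =>
            // one hgetall per key; Promise.all preserves key order
            redisHGetall(key).then(obj => ({
                id: key,
                listItem: obj.listItem,
                author: obj.author
            }))
        )))
        .then(info => res.json(info))
        .catch(err => {
            console.log(err);
            res.status(500).end();
        });
});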

Nodejs mongoose Mass/Batch update from file

So I have a CSV file containing my information, and I need to do a mass add/update:
exports.add_questions_from_file = function (file_path, surveyid, callback) {
    var U = [{}];
    fs.readFile(file_path, 'utf8', function(err, data) {
        if (err) {
            console.log(err);
            callback(err, null);
        } else {
            console.log(data);
            d = data.split(/\r\n|\n/);
            for (x = 0; x < d.length; x++) {
                line = d[x].split(',');
                if (line[0] == "") { return; }
                RQuestion.add_by_line(line, function (err, question) {
                    U.push({id: question.id});
                    console.log(U);
                });
            }
        }
    });
    Survey.update({_id: surveyid}, {$push: {"SurveyQuestions": U}}, function (err, numAffected, rawResponse) {
        console.log(rawResponse);
        RET = {"module": "survey", "operation": "add", "status": "OK"};
        callback(RET);
    });
};
But even though I'm using callback functions, the update always seems to happen with the same object; even the console.log here:
U.push({id:question.id});
console.log(U);
prints the same object (even though all the others were created).
Am I doing something wrong?
I see a few issues.
First, for:
if (line[0] == "") {return};
Don't you mean to use break or continue instead? Otherwise the entire function quits as soon as there is a blank line anywhere in the file. This is very important, because then Survey.update won't get called either.
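That is, inside the loop the guard should be something like:
if (line[0] == "") { continue; } // skip blank lines instead of returning out of the whole function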
Second: I assume that RQuestion.add_by_line and Survey.update do something async, like updating a database. Your code needs to be restructured to wait for those async operations to complete before moving on to the next step. I'd recommend an npm package named async for that:
fs.readFile(file_path, 'utf8', function(err, data) {
    if (err) {
        console.log(err);
        callback(err, null);
    } else {
        d = data.split(/\r\n|\n/);
        async.map(d, function(line, callback) {
            // this function is called for each line
            RQuestion.add_by_line(line.split(','), function (err, question) {
                callback(err, {id: question.id});
            });
        }, function(err, results) {
            // this function is called when all of the items are done
            console.log("done with async");
            console.dir(results);
            Survey.update({_id: surveyid}, {$push: {"SurveyQuestions": results}}, function (err, numAffected, rawResponse) {
                console.log(rawResponse);
                RET = {"module": "survey", "operation": "add", "status": "OK"};
                callback(RET);
            });
        });
    }
});
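One more detail worth double-checking: $push with a plain array appends the whole array as a single element of SurveyQuestions. If each question id should become its own entry, use the $each modifier:
Survey.update(
    {_id: surveyid},
    // $each unwinds the array so every {id: ...} is pushed individually
    {$push: {"SurveyQuestions": {$each: results}}},
    function (err, numAffected, rawResponse) {
        console.log(rawResponse);
        callback({"module": "survey", "operation": "add", "status": "OK"});
    }
);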
