Mongoose Cursor: HTTP bulk request from collection - javascript

I have a problem that involves RxJS and bulk HTTP requests against a huge collection (1M+ docs).
I have the following code, with quite simple logic: I push all the docs from the collection into the allPlayers array and make bulk HTTP requests to the API, 20 at a time (I guess you understand why it's limited). The code works fine, but I think it's time to refactor it from this:
const cursor = players_db.find(query).lean().cursor();
cursor.on('data', function (player) { allPlayers.push(player); });
cursor.on('end', function () {
  logger.log('warn', `S,${allPlayers.length}`);
  from(allPlayers).pipe(
    mergeMap(player => getPlayer(player.name, player.realm), 20),
  ).subscribe({
    next: player => console.log(`${player.name}#${player.realm}`),
    error: error => console.error(error),
    complete: () => console.timeEnd(`${updatePlayer.name}`),
  });
});
Right now I'm using find with a cursor and batchSize, but if I understand it correctly (checked via .length, and according to this question: {mongoose cursor batchSize}), batchSize is just an optimization; it doesn't return me arrays of X docs.
So what should I do now, and which RxJS operator should I choose?
For example, I could form arrays of the necessary length (like 20) and hand them to RxJS as I did before. But I guess there should be another way, where I could use RxJS inside this for/await loop:
const players = await players_db.find(query).lean().cursor({ batchSize: 10 });
for (let player = await players.next(); player != null; player = await players.next()) {
  // do something via RxJS inside the for loop
}
I also found this question {Best way to query all documents from a mongodb collection in a reactive way w/out flooding RAM}, which is also relevant to my problem; I understand its logic, but not its syntax. I also know that the cursor variable isn't a doc, so I can't do anything useful with it directly. Or actually, could I?
RxJS's bufferCount is quite an interesting operator.
https://gist.github.com/wellcaffeinated/f908094998edf54dc5840c8c3ad734d3 — a probable solution?
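For illustration, here is a minimal sketch of how bufferCount could batch documents streamed straight from the cursor. It assumes RxJS 7 (whose from() accepts async iterables) and a Mongoose version whose QueryCursor is async-iterable; getPlayer is the same function as in the snippet above:

const { from } = require('rxjs');
const { bufferCount, concatMap } = require('rxjs/operators');

const cursor = players_db.find(query).lean().cursor();

from(cursor).pipe(
  bufferCount(20),        // gather streamed docs into arrays of 20
  concatMap(batch =>      // run one batch of requests at a time
    Promise.all(batch.map(p => getPlayer(p.name, p.realm)))
  ),
).subscribe({
  next: results => console.log(`batch of ${results.length} done`),
  error: err => console.error(err),
  complete: () => console.log('all players processed'),
});

Note that concatMap keeps only one batch of HTTP requests in flight at a time; mergeMap with a concurrency argument would allow several batches in parallel.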

So, in the end I found that RxJS isn't needed (though it can be used) for this case.
The solution was quite simple, using just a MongoDB cursor:
async function BulkRequest(bulkSize = 10) {
  try {
    let BulkRequest_Array = [];
    const cursor = collection_db.find({}).lean().cursor({ batchSize: bulkSize });
    cursor.on('data', async (doc) => {
      BulkRequest_Array.push(/* any function or axios instance */);
      if (BulkRequest_Array.length >= bulkSize) {
        // stop the stream while the current batch is in flight
        cursor.pause();
        console.time(`========================`);
        await Promise.all(BulkRequest_Array);
        BulkRequest_Array.length = 0;
        cursor.resume();
        console.timeEnd(`========================`);
      }
    });
  } catch (e) {
    console.error(e);
  }
}
BulkRequest();

Related

Insert only if unique using knex

I am writing a Node.js script that accepts an array of objects representing different devices recording different measurements. I would like to store information about each device in a PostgreSQL database using Knex.js, but only for objects representing a new/unique device. Validating device_id before inserting works as long as the same device shows up in different POST requests; but when the same device shows up twice
in the same POST request, the asynchronous nature of the program seems to cause the validation to run before the insertion has completed.
I've tried making the script call two separate async/await functions (one to validate and the other to actually insert), but I'm not sure that's the easiest approach or that I did it right, since it failed anyway.
app.post('/testsite', (req, res) => {
  const data = req.body.measurements;
  for (let i = 0; i < data.length; i++) {
    database.select("device_id").from("device_info").where("device_id", data[i].device_id)
      .then(MatchList => {
        console.log(MatchList);
        if (MatchList.length === 0) {
          return database('device_info')
            .returning('device_id')
            .insert({
              device_id: data[i].device_id,
              device_name: data[i].device_name,
              site_id: data[i].site_id,
              site_name: data[i].site_name
            })
            .then((newdevice) => {
              console.log('inserted device id', newdevice);
            });
        }
        return;
      });
  }
});
I expect it to not insert when the validation fails, but it seems like the validation never fails even when it should and I get this error:
Unhandled rejection error: duplicate key value violates unique constraint "device_info_pkey"
I'm not sure if the issue is due to the asynchronous nature of knex, or if it's something else. However, I rewrote your code with async/await syntax to make it more readable. I'm also just checking whether device comes back as undefined instead of as an array, since I added .first(). Could you check whether device gets console logged when you call this function?
app.post('/testsite', async (req, res) => {
  const data = req.body.measurements;
  for (let i = 0; i < data.length; i++) {
    const device = await database
      .select('*')
      .from('device_info')
      .where('device_id', data[i].device_id)
      .first();
    console.log(device);
    if (!device) {
      // no existing row, so this device is new: insert it
      const newDevice = await database
        .from('device_info')
        .insert({
          device_id: data[i].device_id,
          device_name: data[i].device_name,
          site_id: data[i].site_id,
          site_name: data[i].site_name
        })
        .returning('*');
      return newDevice;
    } else {
      return device;
    }
  }
});
I was finally able to solve the issue. Essentially, using async/await on the search and insert was not enough, as the for loop would still move on to the next element. To get around this I made the POST request call an async function.
app.post('/testsite', (req, res) => {
  const data = req.body.measurements;
  postfxn(data);
});
This function awaits every iteration of the for loop.
async function postfxn(data) {
  for (let i = 0; i < data.length; i++) {
    await insertdevice(data[i]);
  }
}
The insertdevice function then uses async/await to search for the device and insert it only if it is not already in the database, as suggested in technogeek1996's answer; a sketch of what it might look like follows below.
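Since the original post doesn't include the body of insertdevice, here is a minimal sketch of it based on that description, assuming the same database Knex instance and device_info columns as above:

async function insertdevice(device) {
  // look for an existing row with this device_id;
  // .first() yields undefined when there is no match
  const existing = await database('device_info')
    .where('device_id', device.device_id)
    .first();
  if (existing) return; // device already known, skip the insert

  await database('device_info').insert({
    device_id: device.device_id,
    device_name: device.device_name,
    site_id: device.site_id,
    site_name: device.site_name
  });
}

On PostgreSQL you could also let the unique constraint do this work instead of checking first, e.g. with Knex's .onConflict('device_id').ignore() in newer Knex versions, which makes the insert race-proof even across concurrent requests.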
Hope this helps others with similar issues

Mongoose inserting same data three times instead of iterating to next data

I am trying to seed the following data to my MongoDB server:
const userRole = {
  role: 'user',
  permissions: ['readPost', 'commentPost', 'votePost']
}
const authorRole = {
  role: 'author',
  permissions: ['readPost', 'createPost', 'editPostSelf', 'commentPost',
    'votePost']
}
const adminRole = {
  role: 'admin',
  permissions: ['readPost', 'createPost', 'editPost', 'commentPost',
    'votePost', 'approvePost', 'approveAccount']
}
const data = [
  {
    model: 'roles',
    documents: [
      userRole, authorRole, adminRole
    ]
  }
]
When I try to iterate through this object/array and insert the data into the database, I end up with three copies of adminRole instead of the three individual roles. I feel very foolish for being unable to figure out why this is happening.
My code to actually iterate through the object and seed it is the following, and I know it's reaching every value, since I've done console.log testing and can see all the data properly:
for (i in data) {
  m = data[i]
  const Model = mongoose.model(m.model)
  for (j in m.documents) {
    var obj = m.documents[j]
    Model.findOne({ 'role': obj.role }, (error, result) => {
      if (error) console.error('An error occurred.')
      else if (!result) {
        Model.create(obj, (error) => {
          if (error) console.error('Error seeding. ' + error)
          console.log('Data has been seeded: ' + obj)
        })
      }
    })
  }
}
Update:
Here is the solution I came up with after reading everyone's responses. Two private functions generate Promise objects, one for checking whether the data exists and one for inserting it, and then all the promises are fulfilled with Promise.all.
// Stores all promises to be resolved
var deletionPromises = []
var insertionPromises = []

// Fetch the model via its name string from mongoose
const Model = mongoose.model(data.model)

// For each object in the 'documents' field of the main object
data.documents.forEach((item) => {
  deletionPromises.push(promiseDeletion(Model, item))
  insertionPromises.push(promiseInsertion(Model, item))
})
console.log('Promises have been pushed.')

// We need to fulfil the deletion promises before the insertion promises.
Promise.all(deletionPromises).then(() => {
  return Promise.all(insertionPromises).catch(() => {})
}).catch(() => {})
I won't include both promiseDeletion and promiseInsertion as they're functionally the same.
const promiseDeletion = function (model, item) {
  console.log('Promise Deletion ' + item.role)
  return new Promise((resolve, reject) => {
    model.findOneAndDelete(item, (error) => {
      if (error) reject()
      else resolve()
    })
  })
}
Update 2: You should ignore my previous update. I've modified the posted result a bit, but even then, half of the time the roles are deleted and not inserted. It's very random as to when it will actually insert the roles into the server. I'm very confused and frustrated at this point.
You ran into a very common problem when using JavaScript: you shouldn't define (async) functions in a regular for(-in) loop. What happens is that while you loop through the three values, the first async findOne is called. Since your code is async, Node.js does not wait for it to finish before continuing to the next loop iteration, counting up to the third value, here the admin role.
Now, since you defined your callbacks inside the loop, by the time the first async call is over, the for loop has already advanced to the last value, which is why admin is inserted three times.
To avoid this, you can move the async functions out of the loop to force a call by value rather than by reference. Still, this can bring up a lot of other problems, so I'd recommend you rather have a look at promises and how to chain them (e.g. put all the mongoose promises in an array and then await them using Promise.all), or use the more modern async/await syntax together with a for-of loop, which allows for both easy readability and sequential async instructions (see the sketch below).
Check this very similar question: Calling an asynchronous function within a for loop in JavaScript
Note: for-of is sometimes discussed as being performance-heavy, so check whether this applies to your use case.
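For illustration, here is a minimal sketch of that async/await + for-of approach applied to the seeding code from the question (same data array and mongoose models; this sketch is mine, not code from the original post):

async function seed() {
  for (const group of data) {
    const Model = mongoose.model(group.model)
    for (const doc of group.documents) {
      // each lookup completes before the loop advances, so `doc`
      // can never be overwritten by a later iteration
      const existing = await Model.findOne({ role: doc.role }).exec()
      if (!existing) {
        await Model.create(doc)
        console.log('Data has been seeded: ' + doc.role)
      }
    }
  }
}

seed().catch(console.error)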
Using async functions in loops can cause some problems.
You should change the way you work with findOne so that it behaves sequentially.
First you need to make your function async, and then use findOne like so:
async function myFunction() {
  // exec() fires the query and returns a promise, which await can handle
  let res = await Model.findOne({ 'role': obj.role }).exec();
  // do what you need to do here with the result...
}

Asynchronous loop inside an asynchronous loop: a good idea?

I have some JavaScript code which does the following:
Read a .txt file and fill up an array of objects
Loop through these items
Loop through an array of links inside each of these items and make a request using nightmarejs
Write the result to SQL Server
My code is like this:
const Nightmare = require('nightmare');
const fs = require('fs');
const async = require('async');
const sql = require('mssql');

var links = recuperarLinks();

function recuperarLinks() {
  // Read the txt file and return an array
}

const bigFunction = () => {
  var aparelho = '';
  async.eachSeries(links, async function (link) {
    console.log('Zip Code: ' + link.zipCode);
    async.eachSeries(link.links, async function (url) {
      console.log('URL: ' + url);
      try {
        await nightmare.goto(link2)
          .evaluate(function () {
            // return some elements
          })
          .end()
          .then(function (result) {
            // adjust the result
            dadosAjustados.forEach(function (obj) {
              // save the data
              saveDatabase(obj, link.cep);
            });
          });
      } catch (e) {
        console.error(e);
      }
    }, function (err) {
      console.log('Erro: ');
      console.log(err);
    });
  }, function (erro) {
    if (erro) {
      console.log('Erro: ');
      console.log(erro);
    }
  });
};
async function salvarBanco(dados, cep) {
  const pool = new sql.ConnectionPool({
    user: 'sa',
    password: 'xxx',
    server: 'xxx',
    database: 'xxx'
  });
  pool.connect().then(function () {
    const request = new sql.Request(pool);
    const insert = "some insert";
    request.query(insert).then(function (recordset) {
      console.log('Dado inserido');
      pool.close();
    }).catch(function (err) {
      console.log(err);
      pool.close();
    });
  }).catch(function (err) {
    console.log(err);
  });
}
lerArquivo();
It works fine, but I find this async loop inside another async loop to be a hack of some sort.
My output is something like this:
Fetching Data from cep 1
Fetching Data from url 1
Fetching Data from cep 2
Fetching Data from url 2
Fetching Data from cep 3
Fetching Data from url 3
Then it starts making the requests. Is there a better (and possibly correct) way of doing this?
If you want to serialize your calls to nightmare.goto() and simplify your code, which is what you seem to be trying to do with await, then you can avoid mixing the callback-based async library with promises and accomplish your goal by using only promises, like this:
async function bigFunction() {
  var aparelho = '';
  for (let link of links) {
    for (let url of link.links) {
      try {
        let result = await nightmare.goto(url).evaluate(function () {
          // return some elements
        }).end();
        // adjust the result
        await Promise.all(dadosAjustados.map(obj => saveDatabase(obj, link.cep)));
      } catch (e) {
        // log error and continue processing
        console.error(e);
      }
    }
  }
}
Asynchronous loop inside an asynchronous loop: a good idea?
It's perfectly fine, and sometimes necessary, to nest loops that involve asynchronous operations. But the loops have to be designed carefully, both to work appropriately and to be clean, readable, and maintainable code. Your bigFunction() does not seem to me to be either, with its mix of async coding styles.
It works fine, but I find this async loop inside another async loop to be a hack of some sort.
If I were teaching a junior programmer or doing a code review for code from any level of developer, I would never allow code that mixes promises and the callback-based async library. It mixes two completely different programming styles for both control flow and error handling, and all you get is a very hard-to-understand mess. Pick one model or the other; don't mix. Personally, it seems to me that the future of the language is promises, for both async control flow and error propagation, so that's what I would use.
Note: This appears to be pseudo-code, because some references in it are unused or undefined, such as result and dadosAjustados. So you will have to adapt this concept to your real code. In the future, we can offer much more complete answers, and often suggest improvements you're not even aware of, if you include your real code rather than abbreviated pseudo-code.

javascript - how to use promises in firebase?

Lately I've been stuck on a problem that I don't know how to solve. I asked this question, and after some effort we found that Firebase works differently with promises than with normal requests, and I couldn't use them properly.
As explained in that question, I'm filling an array with some information from Firebase, and I need to call another method once I'm sure the array is filled, in other words once I'm sure the call to Firebase has finished.
This is my code as I'm using it now:
var user_pref = firebase.database().ref('/users/' + self.api.email_id + '/preferenze/');
var ref = firebase.database().ref('/tag/');
var userTags = [];
var self1 = self;
user_pref.once('value', function (preferenze) {
  preferenze.forEach(function (t) {
    ref.once('value', function (tags) {
      tags.forEach(function (t1) {
        if (t.key == t1.key) {
          console.log("Found " + t1.key);
          userTags.push(t1.key);
        }
        return false;
      });
    });
    return false;
  });
}).then(async a => {
  await this.sleep(1000); // ----> WORKAROUND
  console.log("Out");
  this.myTags = userTags;
  this.findPoiByTag(this.myTags); // method I have to call when finished
});
I'm using this horrible workaround with sleep to make sure the code outside is executed after the code inside. Without it, it prints "Out" first and then all the "Found" lines from the loop. I've tried using promises in every way, but it still doesn't work. Looking at the docs here, I couldn't find anything that would help me.
That's indeed pretty bad.
This should be closer to what you need:
var userTags = [];
var self1 = self;
user_pref.once('value', (preferenze) => {
  var promises = [];
  preferenze.forEach((t) => {
    promises.push(ref.child(t.key).once('value'));
  });
  Promise.all(promises).then((snapshots) => {
    snapshots.forEach((snapshot) => {
      if (snapshot.exists()) {
        userTags.push(snapshot.key);
      }
    });
    this.myTags = userTags;
    this.findPoiByTag(this.myTags); // method I have to call when finished
  });
});
What this does differently:
It loads each preference key with a direct lookup (removing the need for a deeply nested loop that was loading way too much data).
It puts each tag load into an array of promises.
It then calls your function after all the promises have resolved.
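If you prefer async/await over .then() chains, a minimal sketch of the same logic could look like this (my sketch, assuming the same user_pref and ref references as above; once('value') returns a promise when called without a callback):

async function loadUserTags() {
  const preferenze = await user_pref.once('value');
  const promises = [];
  preferenze.forEach((t) => {
    promises.push(ref.child(t.key).once('value'));
  });
  // wait for every per-key lookup before building the result
  const snapshots = await Promise.all(promises);
  return snapshots.filter(s => s.exists()).map(s => s.key);
}

// usage (inside the same component context as the original code):
// loadUserTags().then(tags => { this.myTags = tags; this.findPoiByTag(tags); });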

Is it possible to asynchronously collect items from a generator into an array?

I'm playing around with writing a web service using Node.js/Express which generates some objects based on templates and then returns the generated data. I'm using Bluebird promises to manage all the async logic. After stripping out all the unimportant stuff, my code looks something like this[1].
My problem is that the core logic can block for several seconds if the requested number of output elements is large. Since I've been playing with ES6 for this project, my first thought was to factor the element creation out into a generator[2]. However, the only way I can find to get all the results from this generator is Array.from, which doesn't help with the blocking.
I've played around with .map, .all, .coroutine, and a couple of other things, in an attempt to asynchronously collect the results from the generator, but I haven't had any luck. Is there a nice way to do this with Bluebird? (Or perhaps a better way of doing it altogether?)
Native ES6 Promise.all can take an iterator and give back an array of values, but V8 doesn't support this yet. Also, in my experimentation with polyfills/Firefox, it seems to be synchronous.
This is a not-too-common operation, so I don't care much about absolute performance. I just want to avoid blocking the event queue, and I would prefer a nice, easy-to-read and maintainable solution.
[1]:
let Bluebird = require('bluebird');
let templates = ...; // logic to load data templates

function createRandomElementFromRandomTemplate(templates) {
  let el;
  // synchronous work that can take a couple of milliseconds...
  return el;
};

api.createRandomElements = function (req, res) {
  let numEls = req.params.numEls;
  Bluebird.resolve(templates)
    .then(templates => {
      let elements = [];
      // numEls could potentially be several thousand
      for (let i = 0; i < numEls; ++i) {
        elements.push(createRandomElementFromRandomTemplate(templates));
      }
      return elements;
    })
    .then(elements => {
      res.json(elements);
    })
    .error(err => {
      res.status(500).json(err);
    });
}
[2]:
function* generateRandomElementsFromRandomTemplate(templates, numEls) {
  for (let i = 0; i < numEls; ++i) {
    let el;
    // synchronous work that can take a couple of milliseconds...
    yield el;
  }
}

api.createRandomElements = function (req, res) {
  let numEls = req.params.numEls;
  Bluebird.resolve(templates)
    .then(templates => {
      // this still blocks
      return Array.from(generateRandomElementsFromRandomTemplate(templates, numEls));
    })
    .then(elements => {
      res.json(elements);
    })
    .error(err => {
      res.status(500).json(err);
    });
}
Here's a halfway-decent solution I found after looking more closely at Bluebird's .map(), as Benjamin suggested. I still have the feeling I'm missing something, though.
The main reason I started with Bluebird was Mongoose, so I left a bit of that in for a more realistic sample.
let Bluebird = require('bluebird');
let mongoose = require('mongoose');
Bluebird.promisifyAll(mongoose);

const Template = mongoose.models.Template,
      UserPref = mongoose.models.UserPref;

// just a normal function that generates one element with a random choice of template
function createRandomElementFromRandomTemplate(templates, userPrefs) {
  let el;
  // synchronous work that can take a couple of milliseconds...
  return el;
}

api.generate = function (req, res) {
  let userId = req.params.userId;
  let numRecords = req.params.numRecords;
  let data;
  Bluebird.props({
    userprefs: UserPref.findOneAsync({ userId: userId }),
    templates: Template.findAsync({})
  })
    .then(_data => {
      data = _data;
      // use a sparse array to convince .map() to loop the desired number of times
      return Array(numRecords);
    })
    .map(() => {
      // ignore the parameter map passes in - we're using the exact same data in each iteration
      // generate one item each time and let Bluebird collect them into an array
      // I think this could work just as easily with a coroutine
      // (note: this is the Bluebird 2.x delay(value, ms) argument order)
      return Bluebird.delay(createRandomElementFromRandomTemplate(data.templates, data.userprefs), 0);
    }, { concurrency: 5 })
    .then(generated => {
      return Generated.createAsync(generated);
    })
    .then(results => {
      res.json(results);
    })
    .catch(err => {
      console.log(err);
      res.status(500);
    });
};
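For completeness, here is an alternative sketch that avoids Bluebird entirely: a small helper (my own, not from the answers above) that drains the generator in fixed-size chunks and yields back to the event loop between chunks via setImmediate, so other requests can be served while the elements are generated:

function collectAsync(gen, chunkSize = 100) {
  return new Promise((resolve, reject) => {
    const results = [];
    function step() {
      try {
        for (let i = 0; i < chunkSize; ++i) {
          const { value, done } = gen.next();
          if (done) return resolve(results);
          results.push(value);
        }
        // chunk finished but the generator isn't done: let the event loop breathe
        setImmediate(step);
      } catch (err) {
        reject(err);
      }
    }
    step();
  });
}

// usage with the generator from [2]:
// collectAsync(generateRandomElementsFromRandomTemplate(templates, numEls))
//   .then(elements => res.json(elements));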
