I have a Node.js/Express server and I'm trying to merge and sort already-sorted results from multiple MongoDB collections in order to create a sorted CSV file. The way I achieve this requires that I keep the MongoDB cursors alive (no timeout) until I read/exhaust all the data, or until an error occurs, in which case I have to close them manually. It seems to work when there aren't many data points. However, when the Mongo queries request data for one year, for example, I get the following Mongo error at some point after almost half an hour: Cursor not found: cursor id: 59427962835.
Promises are Bluebird promises. The code is written in TypeScript.
import * as _ from 'lodash';
import * as moment from 'moment-timezone';
function findNative(db, collection, spec={}) {
const {query, fields, sort, limit, skip, hint, timeout=true} = spec;
// internal function that gets a connection from the connection pool
// returns promise with connection
return ensureConnection(db)
.then(connection => {
const cursor = connection.collection(collection).find(
query || {},
{fields, sort, limit, skip, hint, timeout});
// For sorted queries we have to limit batchSize
// see https://jira.mongodb.org/browse/SERVER-14228
if (connection.serverConfig.capabilities().maxWireVersion == 0 && sort && !limit) {
cursor.batchSize(0);
}
return cursor;
});
}
function getMongoStream(col, startdate, enddate) {
return findNative('testDb', col, {
query: { t: { $gte: startdate, $lte: enddate }},
sort: { t: 1 },
fields: { i: 0, _id: 0 },
timeout: false
});
}
async function fetchNextCursorData(cursor) {
const hasMore = await cursor.hasNext();
console.log(hasMore, cursor.cursorState.cursorId.toString());
return hasMore ? cursor.next() : Promise.resolve(null);
}
function findEarliestDate(buffer: any[]): [string, number[]] {
let earliestDateMS;
const indices = _(buffer)
.map(x => x && x.t.getTime())
.forEach(t => {
// make sure timestamp is defined
// buffer also contains null values
if(t && (!earliestDateMS || (earliestDateMS && t < earliestDateMS))) {
earliestDateMS = t;
}
})
.reduce((acc, t, i) => {
if(t === earliestDateMS) {
acc.push(i);
}
return acc;
}, []);
return [moment(earliestDateMS).utc().format('YYYY-MM-DD HH:mm:ss.SSS'), indices];
}
function closeAllCursors(cursors: any[]) {
const openCursors = cursors
.filter(c => !c.isClosed());
openCursors.forEach(c => c.close());
}
async function csvData(req, res) {
const collections: string[] = req.swagger.params.collections.value.split(',').sort(),
sources: string[] = req.swagger.params.sources.value.split(',').sort(),
startdate = new Date(Number(req.swagger.params.startdate.value)),
enddate = new Date(Number(req.swagger.params.enddate.value));
const filename = `${moment.utc().format('YYYY-MM-DD_HH:mm')}.csv`;
res.set({
'Content-Type': 'text/csv',
'Content-Disposition': `attachment; filename="${filename}"`
});
res.write('Date UTC,' + sources.join(',') + '\n');
const colPromises = collections.map(col => getMongoStream(col, startdate, enddate));
let cursorsMap: { [rec: string]: any; };
try {
let buffer = [], dateCSVBuffer: any[] = _.fill(Array(sources.length), '');
// fetch first doc from all cursors
const cursors = await Promise.all(colPromises);
cursorsMap = _.zipObject<any>(collections, cursors);
let docs = await Promise.all(cursors.map(fetchNextCursorData));
// initial request made for all collections
let requestedIdx = _.range(0, collections.length);
while(true) {
docs.forEach((doc, i) => {
buffer[requestedIdx[i]] = doc;
});
// null indicates that cursor won't return more data =>
// all cursors are exhausted
if(buffer.every(d => d === null)) {
break;
}
const [date, indices] = findEarliestDate(buffer);
requestedIdx = indices;
indices.forEach(idx => {
// update csv buffer
const {data} = buffer[idx];
Object.keys(data)
.forEach(ch => {
const sourceIndex = sources.indexOf(ch);
if(sourceIndex > -1) {
dateCSVBuffer[sourceIndex] = data[ch];
}
});
// remove doc from buffer
buffer[idx] = null;
});
// send csv string
dateCSVBuffer.unshift(date);
res.write(dateCSVBuffer.join(',') + '\n');
// empty buffer
dateCSVBuffer = dateCSVBuffer.map(() => '');
// request new entry from cursors
const nextDocPromises = indices
.map(idx => cursorsMap[collections[idx]])
.map(fetchNextCursorData);
docs = await Promise.all(nextDocPromises);
}
// end data stream
res.end();
} catch(err) {
// make sure to close all cursors
// will catch all nested promise errors
closeAllCursors(_.values(cursorsMap));
console.error(err);
res.status(500).json(err);
}
}
The MongoDB connection is created with the following options:
{
auto_reconnect: true,
poolSize: 30,
connectTimeoutMS: 90000
}
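The ensureConnection helper itself isn't shown; a minimal sketch of what it might look like, assuming the options above are passed straight to MongoClient.connect and one Db handle is cached per database name (the URL and the caching strategy are assumptions, not the actual implementation):
import { MongoClient } from 'mongodb';
// cache one Db handle per logical database name
const connections: { [db: string]: any } = {};
function ensureConnection(db: string) {
  if (connections[db]) {
    return Promise.resolve(connections[db]);
  }
  // placeholder URL; the options object is the one quoted above
  return MongoClient.connect(`mongodb://localhost:27017/${db}`, {
    auto_reconnect: true,
    poolSize: 30,
    connectTimeoutMS: 90000
  }).then(connection => {
    connections[db] = connection;
    return connection;
  });
}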
Could the problem be that I keep the cursor references in the map, so they are never updated, and by the time I call cursor.hasNext() the cursor is already dead? I also tried checking cursor.isClosed(), but it always returns false.
The MongoDB driver is "mongodb": "2.2.15" and the queries are run against a v3.0 database.
EDIT: I did a small count test to see how many documents had been processed by the time the program crashed.
The 3 cursors (the test case requested data from only 3 collections) have the following counts and ids:
3097531 '59427962835'
31190333 '53750510295'
32007475 '101213786015'
The last document processed by the cursor with id '59427962835' was number 4101, so it was not even close to finishing.
It turns out that setting the timeout option on the find query doesn't work. I had to use the noCursorTimeout flag instead, like so:
const cursor = connection.collection(collection)
.find(query || {}, {fields, sort, limit, skip, hint})
.addCursorFlag('noCursorTimeout', !timeout);
I'm making a scraper that will grab every Uniswap pair and save it to an array using the Graph API.
My problem occurs when I make my 7th request to the API.
Initially, I thought I was being rate limited because I was fetching 1000 tokens at a time, but after adding a 10 second wait between calls and decreasing the fetched tokens from 1000 to 10, it still stops on the 7th loop.
The script works perfectly until this point.
const axios = require('axios');
const fs = require('fs');
async function getTokens(skip) {
try {
const query = `
query tokens($skip: Int!) {
tokens(first: 10, skip: $skip) {
id
name
symbol
}
}
`;
const variables = {
skip: skip
};
const headers = {
"Content-Type": "application/json"
};
const { data } = await axios.post("https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v3", {
query,
variables
}, {
headers
});
return data.data.tokens;
} catch (err) {
console.error(err);
return []
}
}
async function saveTokens(tokens) {
try {
await fs.promises.writeFile("uniTokens.json", JSON.stringify(tokens), { flag: "w" });
} catch (err) {
console.error(err);
}
}
async function main() {
let skip = 0;
let tokens = [];
const retrievedIds = new Set();
while (true) {
const newTokens = await getTokens(skip);
if (newTokens.length === 0) {
console.log("Reached end of tokens, finishing up...");
break;
}
// Only save tokens that haven't been retrieved before
const newIds = new Set(newTokens.map(token => token.id));
newIds.forEach(id => {
if (!retrievedIds.has(id)) {
tokens.push(newTokens.find(token => token.id === id));
retrievedIds.add(id);
}
});
console.log(`Retrieved ${tokens.length} tokens`);
await saveTokens(tokens);
skip += 1000;
// delay the next request by 10 seconds
//await new Promise(resolve => setTimeout(resolve, 10000));
}
}
main();
This is the error that it produces:
TypeError: Cannot read properties of undefined (reading 'tokens')
at getTokens (/root/unipairs/uni:31:26)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async main (/root/unipairs/uni:52:27)
Reached end of tokens, finishing up...
I'm a bit confused.
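For what it's worth, the stack trace points at the line that reads data.data.tokens, which means the GraphQL body came back without a data field (such responses usually carry an errors field instead). A small guard inside getTokens, sketched here with the names from the snippet above, would surface the actual response instead of throwing the TypeError:
const response = await axios.post("https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v3", {
  query,
  variables
}, { headers });
// when the API rejects the request, response.data.data is undefined and
// response.data.errors describes why; log it instead of crashing
if (!response.data.data) {
  console.error("GraphQL error:", JSON.stringify(response.data.errors || response.data));
  return [];
}
return response.data.data.tokens;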
I am sending emails with nodemailer, and every time I send one I run some validations to manage the attachment upload limit. If the total size exceeds the established limit, the service splits the email and sends several emails with the same subject and body, each with its own attachments.
Every time this happens, _.chunk splits the PDF array into smaller chunks. Before that, though, a method prepares the attachments: it fetches some information from the API to build the PDF buffer and put it into the content of the emails.
What I want to do now is search, within the array produced by the step that runs before the files are split, for the elements that are equal to the array holding the fetched information, and if they are equal, execute the corresponding instruction.
I will explain with an example:
If getAmount.pdfBuffer === attachmentMap
// doAction console.log('Equals')
Even though I tried to do this, I couldn't get it to work. I don't know if it's because a getAmount array is generated for each chunk of attachments. What do you think I'm doing wrong?
async sendEmail(
{
para: to,
asunto: subject,
plantilla: template,
contexto: context,
}: CorreoInfoDto,
attachments: EmailAttachment[],
driveConfig: OAuthGoogleConfig
) {
const totalSize: number = this.getSizeFromAttachments(attachments);
const chunkSplit = Math.floor(isNaN(totalSize) ? 1 : totalSize / this.LIMIT_ATTACHMENTS) + 1;
const attachmentsChunk: any[][] = _.chunk(attachments, chunkSplit);
if ((totalSize > this.LIMIT_ATTACHMENTS) && attachmentsChunk?.length >= 1) {
await Promise.all(
attachmentsChunk?.map(async (attachment: EmailAttachment[], index) => {
console.log('attachment', attachment)
if (this.getSizeFromAttachments(attachment) > this.LIMIT_ATTACHMENTS) {
const result: GenerateDriveLinkResponse[] = await Promise.all(
attachment?.map(item => {
const file = new GoogleDriveUploadFile({
name: item?.filename,
mimeType: MimeTypesEnum.PDF,
body: item?.content
});
return this.uploadFilesService.uploadToDrive(driveConfig, file) as any;
})
)
const texto = result?.map((item, index) => {
console.log('item', item?.webViewLink);
console.log('index', index);
return new SolicitudXLinkDrive({
texto: attachment[index].filename,
link: item?.webViewLink
})
});
context.links = texto;
const link = `(${index + 1}/${attachmentsChunk?.length - 1})`;
const newContext = {
getCurrent: link,
...context
}
const prepareEmail = this.prepareEmail({
para: to,
asunto: ` ${subject} (${index + 1}/${attachmentsChunk?.length})`,
plantilla: template,
contexto: newContext,
}, []);
return prepareEmail
} else {
// this.getCantidad = `(${index + 1}/${attachmentsChunk?.length - 1})`;
console.log('getCantidad', this.getAmount);
const attachmentMap = attachment.map(element => element.content);
this.getAmount.forEach(element => {
if (element.pdfBuffer === attachmentMap) {
console.log('do action');
}
})
const link = ` (${index + 1}/${attachmentsChunk?.length - 1})`;
const newContext = {
getCurrent: link,
...context
}
return this.prepareEmail({
para: to,
asunto: ` ${subject} (Correo ${index + 1}/${attachmentsChunk?.length - 1})`,
plantilla: template,
contexto: newContext,
}, attachment);
}
})
);
} else {
await this.prepareEmail(
{
para: to,
asunto: ` ${subject}`,
plantilla: template,
contexto: context,
},
attachments,
);
}
}
async prepareEmail(
{
para: to,
asunto: subject,
plantilla: template,
contexto: context,
}: CorreoInfoDto,
attachments: EmailAttachment[],
) {
return await this.mailerService.sendMail({
to,
from: `${process.env.SENDER_NAME} <${process.env.EMAIL_USER}>`,
subject,
template,
attachments: attachments,
context: context,
});
}
async sendEmails(correos: EnvioMultiplesCorreosDto) {
let pdf = null;
let info: ConfiguracionDocument = null;
let GDriveConfig: ConfiguracionDocument = null;
let logo: ConfiguracionDocument = null;
let forContext = {};
const documents = Array.isArray(correos.documento_id) ? correos.documento_id : [correos.documento_id];
const solicitudes = await this.solicitudesService.findByIds(documents);
const nombresPacientes = solicitudes.reduce((acc, cv) => {
acc[cv.correlativo_solicitud] = cv['info_paciente']?.nombre_paciente;
return acc;
}, {});
await Promise.all([
await this.getPdf(correos.tipo_reporte, correos.documento_id, correos?.dividir_archivos).then(data => { pdf = data; }),
await this.configuracionesService.findByCodes([
ConfigKeys.TEXTO_CORREO_MUESTRA,
ConfigKeys[process.env.DRIVE_CONFIG_API],
ConfigKeys.LOGO_FIRMA_PATMED
]).then(data => {
info = data[0];
GDriveConfig = data[1];
logo = data[2];
})
]);
forContext = this.configuracionesService.castValorObjectToObject(info?.valor_object)
const attachmentPrepare = this.prepareAttachments(pdf as any, nombresPacientes);
await this.sendEmail(
{
para: correos.para,
asunto: correos.asunto,
plantilla: 'muestras',
contexto: {
cuerpo: correos.cuerpo,
titulo: forContext[EnvioCorreoMuestraEnum.titulo],
direccion: forContext[EnvioCorreoMuestraEnum.direccion],
movil: forContext[EnvioCorreoMuestraEnum.movil],
pbx: forContext[EnvioCorreoMuestraEnum.pbx],
email: forContext[EnvioCorreoMuestraEnum.email],
logo: logo?.valor,
},
},
attachmentPrepare,
this.configuracionesService.castValorObjectToObject(GDriveConfig?.valor_object) as any,
);
const usuario = new UsuarioBitacoraSolicitudTDTO();
usuario.createFromUserRequest(this.sharedService.getUserFromRequest());
solicitudes.forEach((solicitud) => {
const actual = new BitacoraSolicitudDTO();
actual.createFromSolicitudDocument(solicitud);
const newBitacora = new CrearBitacoraSolicitudDTO();
newBitacora.createNewItem(null, actual, actual, usuario, AccionesBitacora.EmailEnviado);
this.bitacoraSolicitudesService.create(newBitacora);
});
}
prepareAttachments(item: BufferCorrelativosDTO | BufferXSolicitudDTO[], nombresPacientes: { [key: string]: string }) {
if (this.sharedService.isAnArray(item)) {
const castItem: BufferXSolicitudDTO[] = item as any;
this.getCantidad = castItem;
return castItem?.map((s) => {
const namePatient = nombresPacientes[s.correlativo_solicitud];
return new EmailAttachment().setFromBufferXSolicitudDTO(s, namePatient, 'pdf');
});
} else {
return [new EmailAttachment().setFromBufferCorrelativosDTO(item as any, 'pdf')];
}
}
Thank you very much for your attention, I appreciate it. Cheers
You could try using lodash, as it has the _.intersectionBy and _.intersectionWith functions, which should allow you to compare two arrays and filter out the common values.
There are some good examples here:
How to get intersection with lodash?
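For instance, a sketch along those lines for the else branch in sendEmail above, assuming both this.getAmount[].pdfBuffer and the attachment contents are Node Buffers:
// keep only the pdfBuffers that also appear, byte-for-byte, in this chunk's attachments
const attachmentContents = attachment.map(element => element.content);
const matching = _.intersectionWith(
  this.getAmount.map(item => item.pdfBuffer),
  attachmentContents,
  (a, b) => Buffer.isBuffer(a) && Buffer.isBuffer(b) && a.equals(b)
);
if (matching.length) {
  console.log('do action');
}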
I'm having some performance issues with my Parse queries, using Parse SDK.
On some requests, I have to get resources related to another one. E.g.:
I get all elements from ClassA
for each ClassA element, I have to find all ClassB and ClassC elements which contain a pointer to the relevant ClassA entry.
Right now, to sum up:
I query all my ClassA elements
I loop over the results and, for each one, create a Promise (Parse query) to get the ClassB elements and another Promise (Parse query) to get the ClassC elements
My issue is that this solution makes a lot of Parse queries, and therefore a lot of Promises to wait for. On my local environment everything works fine: the whole request takes 1s or less (it's quite heavy, but it only does 45 Parse queries in the current case, based on my Class entries).
On my production server, with the same code, the same data and the same Node version, the same request takes 30s or more (=> timeout).
This is the relevant part of the code (it's old and messy):
router.get('/:userId/company/customers', (req, res, next) => {
if('company' in req.jwtData.data) {
const company = req.jwtData.data.company;
const query = new Parse.Query('Cards');
const Companies = Parse.Object.extend('Companies');
const currentCompany = new Companies({id: company.objectId});
query.equalTo('company', currentCompany).find().then((cards) => {
if(cards.length){
const customersArrayId = cards.map(card => card.toJSON().user.objectId);
const usersQuery = new Parse.Query('_User').containedIn('objectId', customersArrayId).find({ useMasterKey: true });
usersQuery.then(customersResponse => {
const customers = [];
const customersPromises = [];
if(customersResponse.length) {
for (let index = 0; index < customersResponse.length; index++) {
let customer = {
...customersResponse[index].toJSON(),
...customersResponse[index].attributes
};
const customerPromises = [];
const customerId = customer.objectId;
const stamps = new Parse.Query('Stamps').equalTo('user', new UserModel({objectId: customerId})).equalTo('company', currentCompany).limit(CONSTANTS.QUERY_MAX_LIMIT).find().then((stamps) => {
return stamps;
}).catch(error => {
res.json({
success: false,
error
});
});
const cards = new Parse.Query('Cards').equalTo('user', new UserModel({objectId: customerId})).equalTo('company', currentCompany).limit(CONSTANTS.QUERY_MAX_LIMIT).find().then((cards) => {
return cards;
}).catch(error => {
res.json({
success: false,
error
});
});
customers.push(customer);
customerPromises.push(stamps);
customerPromises.push(cards);
customersPromises.push(customerPromises);
}
if(customersPromises.length) {
const allPromises = customersPromises.map(customerP => Promise.all(customerP));
Promise.all(allPromises).then((customerPromiseResponses) => {
console.log('allPromises done, mapping all users data...');
for (let index = 0; index < customerPromiseResponses.length; index++) {
const customerResponseData = customerPromiseResponses[index];
const stamps = customerResponseData[0];
const cards = customerResponseData[1];
const companyEmailAllowed = () => {
let check = false;
if(customers[index].hasOwnProperty('companiesNewsletterAgreements')) {
check = customers[index].companiesNewsletterAgreements.indexOf(company.objectId) > -1;
}
return check;
};
customers[index] = {
...customers[index],
email: companyEmailAllowed() ? customers[index].email : null,
stamps,
cards,
}
}
res.json({
success: true,
data: customers
});
}).catch(error => {
res.json({
success: false,
error
});
});
}
else {
res.json({
success: true,
data: customers
});
}
} else {
res.json({
success: true,
data: customers
});
}
});
} else {
res.json({
success: true,
data: []
});
}
});
} else {
res.json({
success: false,
error: "No company found."
});
}
});
My question is: is it possible to get the same data in a more performant way (fewer Parse queries, fewer Promises)?
I actually found a solution. Instead of starting a Parse query for each result I find, I request the whole set of elements I need in the first place, using containedIn.
Then I filter the results to assign the elements to their customer.
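In practice the reworked version looks roughly like this (a sketch, inside an async handler, reusing the names from the code above; it is not the exact production code):
// build pointers for every customer found in the first query
const userPointers = customersArrayId.map(id => new UserModel({ objectId: id }));
const [allStamps, allCards] = await Promise.all([
  new Parse.Query('Stamps')
    .equalTo('company', currentCompany)
    .containedIn('user', userPointers)
    .limit(CONSTANTS.QUERY_MAX_LIMIT)
    .find({ useMasterKey: true }),
  new Parse.Query('Cards')
    .equalTo('company', currentCompany)
    .containedIn('user', userPointers)
    .limit(CONSTANTS.QUERY_MAX_LIMIT)
    .find({ useMasterKey: true })
]);
// group both result sets by the id of the user they point to
const groupByUser = (objects) => objects.reduce((acc, obj) => {
  const userId = obj.get('user').id;
  (acc[userId] = acc[userId] || []).push(obj);
  return acc;
}, {});
const stampsByUser = groupByUser(allStamps);
const cardsByUser = groupByUser(allCards);
// then, per customer: stamps = stampsByUser[customer.objectId] || [], cards = cardsByUser[customer.objectId] || []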
I'm trying to query my business network using buildQuery, but it always returns an empty array.
My code is as follows.
This is the connection.js file:
module.exports = {
BusinessNetworkConnection : require('composer-client').BusinessNetworkConnection,
cardName : '',
connection: {},
connect : function() {
var cardType = { type: 'composer-wallet-filesystem' }
this.connection = new this.BusinessNetworkConnection(cardType);
return this.connection.connect(this.cardName);
},
disconnect : function(callback) {
this.connection.disconnect();
}
};
This is my query.js file, which is invoked to get the results:
const connection = require('./connection');
const getContacts = async (cardName,companyID) => {
connection.cardName = cardName;
try {
await connection.connect();
main();
} catch (error) {
main(error);
}
async function main(error) {
if (error) { return new Error("Ops Error: ",error) };
const statement = 'SELECT org.finance.einvoice.participant.Company WHERE (participantId == _$companyID)'
const query = await connection.connection.buildQuery(statement);
const company = await connection.connection.query(query, { companyID }).catch(err => {return new Error(err)});
await connection.connection.disconnect().catch(err => new Error(err));
console.log(company);
return company;
};
};
module.exports = {
getContacts
};
The expected behavior of getContacts() is to return an asset from the business network, but it actually returns an empty array.
Current versions: composer-cli 0.20, composer-playground 0.20, composer-client 0.20, composer-common 0.20 and fabric-dev-server 1.2.
I found the solution to this issue.
I was using a card that was not allowed to perform queries. When I used the admin card, it returned results.
The other way is to allow participants to issue queries in the permissions.acl file.
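For example, a rule along these lines in permissions.acl (a sketch; the namespace is taken from the query above) gives Company participants READ access to Company records, which is what the query needs in order to return them:
rule CompaniesCanQueryCompanies {
    description: "Allow Company participants to read (and therefore query) Company records"
    participant: "org.finance.einvoice.participant.Company"
    operation: READ
    resource: "org.finance.einvoice.participant.Company"
    action: ALLOW
}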
I have the following piece of code. As is, with a couple of lines commented out, it works as expected: I subscribe to a stream, do some processing and stream the data to the client. However, if I uncomment those lines, my stream is always empty, i.e. count in getEntryQueryStream is always 0. I suspect it has to do with the fact that I subscribe to the stream too late and thus miss all the values.
// a wrapper of the mongodb driver => returns rxjs streams
import * as imongo from 'imongo';
import * as Rx from 'rx';
import * as _ from 'lodash';
import {elasticClient} from '../helpers/elasticClient';
const {ObjectId} = imongo;
function searchElastic({query, sort}, limit) {
const body = {
size: 1,
query,
_source: { excludes: ['logbookType', 'editable', 'availabilityTag'] },
sort
};
// keep the search results "scrollable" for 30 secs
const scroll = '30s';
let count = 0;
return Rx.Observable
.fromPromise(elasticClient.search({ index: 'data', body, scroll }))
.concatMap(({_scroll_id, hits: {hits}}) => {
const subject = new Rx.Subject();
// subject needs to be subscribed to before adding new values
// and therefore completing the stream => execute in next tick
setImmediate(() => {
if(hits.length) {
// initial data
subject.onNext(hits[0]._source);
// code that breaks
//if(limit && ++count === limit) {
//subject.onCompleted();
//return;
//}
const handleDoc = (err, res) => {
if(err) {
subject.onError(err);
return;
}
const {_scroll_id, hits: {hits}} = res;
if(!hits.length) {
subject.onCompleted();
} else {
subject.onNext(hits[0]._source);
// code that breaks
//if(limit && ++count === limit) {
//subject.onCompleted();
//return;
//}
setImmediate(() =>
elasticClient.scroll({scroll, scrollId: _scroll_id},
handleDoc));
}
};
setImmediate(() =>
elasticClient.scroll({scroll, scrollId: _scroll_id},
handleDoc));
} else {
subject.onCompleted();
}
});
return subject.asObservable();
});
}
function getElasticQuery(searchString, filter) {
const query = _.cloneDeep(filter);
query.query.filtered.filter.bool.must.push({
query: {
query_string: {
query: searchString
}
}
});
return _.extend({}, query);
}
function fetchAncestors(ancestorIds, ancestors, format) {
return imongo.find('session', 'sparse_data', {
query: { _id: { $in: ancestorIds.map(x => ObjectId(x)) } },
fields: { name: 1, type: 1 }
})
.map(entry => {
entry.id = entry._id.toString();
delete entry._id;
return entry;
})
// we don't care about the results
// but have to wait for stream to finish
.defaultIfEmpty()
.last();
}
function getEntryQueryStream(entriesQuery, query, limit) {
const {parentSearchFilter, filter, format} = query;
return searchElastic(entriesQuery, limit)
.concatMap(entry => {
const ancestors = entry.ancestors || [];
// if no parents => doesn't match
if(!ancestors.length) {
return Rx.Observable.empty();
}
const parentsQuery = getElasticQuery(parentSearchFilter, filter);
parentsQuery.query.filtered.filter.bool.must.push({
terms: {
id: ancestors
}
});
// fetch parent entries
return searchElastic(parentsQuery)
.count()
.concatMap(count => {
// no parents match query
if(!count) {
return Rx.Observable.empty();
}
// fetch all other ancestors that weren't part of the query results
// and are still a string (id)
const restAncestorsToFetch = ancestors.filter(x => _.isString(x));
return fetchAncestors(restAncestorsToFetch, ancestors, format)
.concatMap(() => Rx.Observable.just(entry));
});
});
}
function executeQuery(query, res) {
try {
const stream = getEntryQueryStream(query);
// stream is passed on to another function here where we subscribe to it like:
// stream
// .map(x => whatever(x))
// .subscribe(
// x => res.write(x),
// err => console.error(err),
// () => res.end());
} catch(e) {
logger.error(e);
res.status(500).json(e);
}
}
I don't understand why those few lines of code break everything or how I could fix it.
Your use case is quite complex; you can start off by building up the searchElastic method following the pattern below.
convert elasticClient.scroll to an observable first
set up the initial request data for elasticClient.search()
when the search resolves you should get your scroll id
the expand() operator lets you recursively execute the elasticClientScroll observable
use map to select the data you want to return
use takeWhile to decide when to complete the stream
The correct result is that once you call searchElastic().subscribe(), the stream emits continuously until there's no more data to fetch.
Hope this structure is correct and can get you started.
function searchElastic({ query, sort }, limit) {
  // elasticClient.scroll takes a node-style (err, res) callback, so wrap it with fromNodeCallback
  const elasticClientScroll = Observable.fromNodeCallback(elasticClient.scroll, elasticClient);
  const obj = {
    body: {
      size: 1,
      query,
      _source: { excludes: ['logbookType', 'editable', 'availabilityTag'] },
      sort
    },
    // keep the search context alive for 30 secs between scroll calls
    scroll: '30s'
  };
  return Observable
    .fromPromise(elasticClient.search({ index: 'data', body: obj.body, scroll: obj.scroll }))
    .expand(({ _scroll_id, hits: { hits } }) => {
      // guess there is more logic here .....
      // e.g. update the scroll id or stop expanding
      return elasticClientScroll({ scroll: obj.scroll, scrollId: _scroll_id })
        .map(res => res); // select the part of the response you want to return
    })
    .takeWhile(({ hits: { hits } }) => hits.length > 0);
}
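And a hypothetical subscription to the sketch above, matching the searchElastic().subscribe() sentence (query, sort and limit are placeholder values):
// emits each raw search/scroll response until there are no more hits
searchElastic({ query: { match_all: {} }, sort: ['_doc'] }, 10)
  .subscribe(
    res => console.log(res.hits.hits[0]._source),
    err => console.error(err),
    () => console.log('no more data to fetch'));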