Share variable between class methods - javascript

I'm not a master in JavaScript. I'm starting to use the OOP syntax, and I have a small class like this one:
class Scraper {
  constructor() {}

  question(inputText) {
    rl.setPrompt(inputText);
    rl.prompt();
    return new Promise((resolve, reject) => {
      let answer;
      rl.on('line', (input) => {
        answer = input;
        rl.close();
      });
      rl.on('close', () => {
        resolve(answer);
      });
    });
  }

  startFetch(username) {
    this.username = String(username);
    return axios({
      url: `https://www.instagram.com/${this.username}/?__a=1`
    }).then((response) => {
      this.user_id = response.data.graphql.user.id;
      this.has_next_page = response.data.graphql.user.edge_owner_to_timeline_media.page_info.has_next_page;
      response.data.graphql.user.edge_owner_to_timeline_media.edges.map((item, index) => {
        this.processLink(item.node.display_url, index);
      });
      if (this.has_next_page) {
        this.currCursor = response.data.graphql.user.edge_owner_to_timeline_media.page_info.end_cursor;
        this.fetchNextPage(this.user_id);
      } else {
        console.log('Completed');
      }
    });
  }

  fetchNextPage(id) {
    return axios({
      method: 'GET',
      baseURL: 'https://www.instagram.com/graphql/query/',
      params: {
        query_hash: '42323d64886122307be10013ad2dcc44',
        variables: {
          id: id,
          first: "12",
          after: this.currCursor
        }
      }
    }).then((response) => {
      response.data.data.user.edge_owner_to_timeline_media.edges.map((item, index) => {
        index++;
        this.processLink(item.node.display_url, index);
      });
      if (response.data.data.user.edge_owner_to_timeline_media.page_info.has_next_page) {
        this.currCursor = response.data.data.user.edge_owner_to_timeline_media.page_info.end_cursor;
      }
    });
  }

  processLink(imageURI, n) {
    let filename = path.format({ dir: destinationPath, base: `${n}.jpg` });
    let file = fs.createWriteStream(filename);
    https.get(imageURI, (res) => {
      res.pipe(file);
    });
  }
}
How can I set a variable inside a method, then share it, and overwrite it if needed, inside another method of the same class? The class I'm creating is responsible for scraping a public Instagram profile. After the first request, as was suggested to me here, I'm calling the fetchNextPage method. To make it work as expected I need to set the currCursor variable and keep it updated with the new cursor after each request. Will this.user_id be set and readable from the method, and will this.currCursor be updated with the new value?
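In short, yes: a property assigned to this in one method lives on the instance and is visible to every other method called on that same instance, and each reassignment overwrites the shared value. A minimal, self-contained sketch (the Pager class and cursor property are illustrative, not part of the scraper above):

class Pager {
  constructor() {
    this.cursor = null; // shared instance state, visible to all methods
  }
  firstPage() {
    // pretend a response carried a cursor for the next page
    this.cursor = 'abc123';
  }
  nextPage() {
    // reads the value the previous method stored on the instance
    console.log(`fetching after cursor ${this.cursor}`);
    this.cursor = 'def456'; // and overwrites it for the following call
  }
}

const p = new Pager();
p.firstPage();
p.nextPage();          // logs: fetching after cursor abc123
console.log(p.cursor); // 'def456'

One caveat in the scraper itself: startFetch calls this.fetchNextPage(this.user_id) without returning it, so the promise startFetch returns resolves before the next page request finishes; returning it (return this.fetchNextPage(...)) keeps the whole chain awaitable.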

Fetching in order rather than all at once

I just updated the code I had with yours. I am getting an error for headers not being defined, which is odd.
No need to refactor the initial Twitter trend-pulling code; I am just trying to get it to work. I know the getTrends() function is pulling data and topNews is only grabbing the first 5.
Any idea what I am doing wrong?
let topics = [];
let topNews = [];

function getTrends() {
  var myHeaders = new Headers();
  myHeaders.append(
    "Authorization",
    "Bearer ************"
  );
  myHeaders.append(
    "Cookie",
    'personalization_id="v1_QSZs3kHuqI6knlNtIbIchQ=="; guest_id=v1%3A159630901122767291'
  );
  var requestOptions = {
    method: "GET",
    headers: myHeaders,
    redirect: "follow",
  };
  const url =
    "https://cors-anywhere-gp.herokuapp.com/https://api.twitter.com/1.1/trends/place.json?id=23424977";
  fetch(url, requestOptions)
    .then((response) => response.json())
    .then((responseJson) => topFive(responseJson))
    .catch((error) => console.log("error", error));
}

function topFive(responseJson) {
  for (let i = 0; i < 5; i++) {
    topics.push(responseJson[0].trends[i].name);
    getNews(responseJson[0].trends[i].name.replace("#", ""), i);
  }
}

function getTopicURL(topic) {
  return `https://api.cognitive.microsoft.com/bing/v7.0/news/search?q=${topic}&count=5`;
}

function getHeaders() {
  var headers = new Headers();
  headers.append('Ocp-Apim-Subscription-Key', '******');
  return headers;
}

function getOptions() {
  return {
    headers: getHeaders(),
    method: 'GET',
    redirect: 'follow'
  };
}

function fetchAsJSON(url, options) {
  return fetch(url, options).then(response => response.json());
}

function toThunk(fn, ...args) {
  return () => fn(...args);
}

function delay(ms, fn) {
  return new Promise((resolve, reject) => {
    setTimeout(function () {
      fn().then(resolve).catch(reject);
    }, ms);
  });
}

function getNews(topic, index) {
  return delay(
    index * 1000,
    toThunk(
      fetchAsJSON,
      getTopicURL(topic),
      getOptions()
    )
  );
}

Promise.
  all(topics.map(getNews)).
  then(topicsArray => {
    topicsArray.forEach((topic, index) => {
      topNews[index] = topic.value;
    });
  }).
  catch(exception => {
    console.log('error:', exception);
  });

getTrends();
getNews();
console.log(topNews);
You could use Promises and setTimeout. Below is some example code that you can try. I've added several comments as explanation.
I refactored your code from inline to separate utility functions to make it a bit more modular.
// --- FETCH UTILITY FUNCTIONS ---

function getTopicURL(topic) {
  return `https://api.cognitive.microsoft.com/bing/v7.0/news/search?q=${topic}&count=5`;
}

function getHeaders() {
  var headers = new Headers();
  headers.append('Ocp-Apim-Subscription-Key', '******');
  return headers;
}

function getOptions() {
  return {
    headers: getHeaders(),
    method: 'GET',
    redirect: 'follow'
  };
}

function fetchAsJSON(url, options) {
  return fetch(url, options).then(response => response.json());
}

// --- "LAZY" UTILITIES ---

function toThunk(fn, ...args) {
  // creates a `thunk`: wraps the call so it can be run later
  return () => fn(...args);
}

function delay(ms, fn) {
  // returns a promise that wraps a call to setTimeout. note that
  // the `fn` parameter has to be a thunk!
  return new Promise((resolve, reject) => {
    setTimeout(function () {
      fn().then(resolve).catch(reject);
    }, ms);
  });
}

// --- PROG ---

function getNews(topic, index) {
  // grabs 5 news items for 1 topic, defers fetching by 1 second per item
  // (first item = 0 ms, second item = 1000 ms, third item = 2000 ms, etc.)
  return delay(
    index * 1000,
    toThunk(
      fetchAsJSON,
      getTopicURL(topic),
      getOptions()
    )
  );
}

let topics = ['topic-a', 'topic-b', 'topic-c', 'topic-d', 'topic-e'];
let topNews = [];

Promise.
  all(topics.map(getNews)).
  then(topicsArray => {
    // when all topics return, iterate over them and store all news per topic
    topicsArray.forEach((topic, index) => {
      topNews[index] = topic.value;
    });
    checkNewsDone(); // <-- not sure if needed or not since I don't know what it does
  }).
  catch(exception => {
    console.log('error:', exception);
  });
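A side note on wiring, since the question's version logs an empty topNews: Promise.all(topics.map(getNews)) runs at load time, while topics is still an empty array, and console.log(topNews) runs before any fetch has resolved. Here is a minimal sketch of one way to order it, reusing the functions above (the .value field is assumed to be the article list in the Bing response, matching topic.value in the answer code):

function topFive(responseJson) {
  // collect the first five trend names, then fire the staggered batch
  for (let i = 0; i < 5; i++) {
    topics.push(responseJson[0].trends[i].name.replace("#", ""));
  }
  return Promise.all(topics.map(getNews))
    .then(topicsArray => {
      topicsArray.forEach((news, index) => {
        topNews[index] = news.value;
      });
      console.log(topNews); // only log once everything has arrived
    })
    .catch(exception => console.log('error:', exception));
}

getTrends(); // getTrends -> topFive -> getNews, in that order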

Is there a way in Jest to mock all the DynamoDB low-level services used by my controllers?

I'm writing Jest tests to fully unit-test a DynamoDB Express app. Is there a good example of how to mock DynamoDB from my controller dependencies so that it returns the same response, and of how to verify the type of the input data at the DynamoDB low-level API through the following functions: get, query, batchGet, and batchWrite, as well as which data needs to be compared? The mock structure should behave as follows: controller -> db services -> json -> wrapper, and thus be able to start asserting each case.
I tried aws-sdk-mock but it didn't work. Below is my code snippet:
function getRecordIdsFromDynamo(message) {
  return new Promise((resolve, reject) => {
    let id = message.id;
    let params = {
      Key: { id: id },
      TableName: LOOKUP,
      ConsistentRead: true,
      ProjectionExpression: 'activeRecord'
    };
    dynamoClient.get(params, function(err, data) {
      try {
        let activeRecords = [];
        if (data.Item.activeRecord)
          activeRecords = data.Item.activeRecord.values;
        resolve(activeRecords);
      } catch (e) {
        reject(e);
      }
    });
  });
}

function getRecordsFromDynamo(keys, id) {
  return new Promise((resolve, reject) => {
    let results = [];
    let params = { RequestItems: {} };
    if (keys.length == 0)
      return resolve(results);
    // unprocessed logic...
    dynamoClient.batchGet(params, function(err, data) {
      // err logic...
      const responses = data.Responses[RECORD],
        newUnprocessedKeys = data.UnprocessedKeys[RECORD];
      results = results.concat(responses);
      resolve(results);
    });
  });
}

exports.getRecords = function(message) {
  return new Promise((resolve, reject) => {
    getRecordIdsFromDynamo(message)
      .then((records) => {
        let keys = records.map((id) => {
          return { RecordId: id };
        });
        getRecordsFromDynamo(keys, message.id)
          .then((records) => {})
          .catch((err) => {
            reject(err);
          });
      });
  });
};

// test code
exports.getActiveRecordIds = async (message, docClient) => {
  let data = await getActiveRecordsIdsfromDB(message, docClient);
  console.log("data.Items =========>", data.Items);
  return {
    "Items": {
      activeRecordIds: {
        values: data.Items
      }
    }
  };
};

/**
 * @param {String} message
 * @param {AWS} docClient
 * @returns {Promise<object>} promise resolving with activeRecordIds
 */
function getActiveRecordsIdsfromDB(message, docClient) {
  const activeRecords = db.getRecords(message);
  console.log("In getActiveRecordsIdsFromDB " + activeRecords);
  const params = {
    Key: { id: message.id },
    TableName: LOOKUP,
    ConsistentRead: true,
    ProjectionExpression: 'activeRecordIds'
  };
  return docClient.get(params).promise();
}

beforeEach(() => {
  // Clear all instances and calls to constructor and all methods:
  Message.mockClear();
  event = record.event;
  recordId = record.id;
  ts = record.ts;
  context = '1357';
  message = new Message({
    id: recordId,
    ts: ts,
    event: event
  }, context);
});

it('should get all active records from dynamo', () => {
  AWS_mock.mock('DynamoDB.DocumentClient', 'get', function(params, callback) {
    callback(null, { Items: ['1234', '4321'] });
  });
  let docClient = new aws.DynamoDB.DocumentClient();
  return getActiveRecordIds(message, docClient)
    .then(recordIds => {
      expect(recordIds).toEqual({ Items: { activeRecordIds: { values: ['1234', '4321'] } } });
    });
});
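For reference, since getActiveRecordIds already receives docClient as a parameter, one way to sidestep aws-sdk-mock entirely is to inject a hand-rolled fake DocumentClient. A minimal Jest sketch under that assumption — makeFakeDocClient is hypothetical, message and LOOKUP come from the surrounding setup, and the db.getRecords call inside getActiveRecordsIdsfromDB would need its own stub:

// hypothetical helper: fakes the DocumentClient's .get(params).promise() style
function makeFakeDocClient(fixture) {
  return {
    get: jest.fn((params) => ({
      promise: () => Promise.resolve(fixture) // resolves like the real client
    }))
  };
}

it('should get all active records from dynamo (hand-rolled mock)', async () => {
  const docClient = makeFakeDocClient({ Items: ['1234', '4321'] });
  const recordIds = await getActiveRecordIds(message, docClient);

  // verify the input handed to the low-level API...
  expect(docClient.get).toHaveBeenCalledWith(
    expect.objectContaining({
      TableName: LOOKUP,
      ConsistentRead: true,
      ProjectionExpression: 'activeRecordIds'
    })
  );
  // ...and the shape returned through the wrapper
  expect(recordIds).toEqual({ Items: { activeRecordIds: { values: ['1234', '4321'] } } });
});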

sinon spy doesn't register call in a generator loop?

I want to check that a piece of code is being called, so I'm using a sinon spy to assert this. However, the spy seems to be failing, despite console.logs showing that the code has been called correctly.
I'm wondering if my function being a generator is causing the spy to misreport what it's doing.
My code (I've taken out some chunks for brevity):
isBlacklisted(release, jobUUID) {
  names.forEach((name) => {
    this._spawnPythonProcessGenerator(
      this.IS_BLACKLISTED_SCRIPT,
      name
    ).next().value
      .then((data) => {
        console.log(data);
      })
      .catch((err) => {
        this._errorEvent(release, name, err, jobUUID);
      });
  }, this);
}

_errorEvent(release, name, err, jobUUID) {
  console.log('got here');
}

*_spawnPythonProcessGenerator(scriptSrc, name) {
  const pythonProcess = this._childProcess.spawn(
    'python3',
    [...arguments]
  );
  yield new Promise((resolve, reject) => {
    pythonProcess.stderr.on('data', (err) => {
      reject(err.toString());
    });
    pythonProcess.stdout.on('data', (data) => {
      resolve(data.toString());
    });
  });
}
and my tests:
const Blacklist = require('../../src/Blacklist2');
const childProcess = require('child_process');
const uuid = require('uuid/v4');
const sinon = require('sinon');

describe('Blacklist', () => {
  let blacklist;

  beforeEach(() => {
    blacklist = new Blacklist(childProcess);
    blacklist.IS_BLACKLISTED_SCRIPT = './test/helpers/good.py';
  });

  describe('isBlacklisted', () => {
    it('should call the _errorEvent for every name in a release when the blacklist application is not available', async () => {
      let release = {
        id: 1001,
        asset_controller: {
          id: 54321,
        },
        display_name: 'Blah',
        names: [
          {
            id: 2001,
            name: 'Blah',
          },
        ],
      };
      blacklist.IS_BLACKLISTED_SCRIPT = './test/helpers/' + uuid() + '.py';
      const spy = sinon.spy(blacklist, '_errorEvent');
      blacklist.isBlacklisted(release, uuid());
      console.log(spy);
      sinon.assert.calledTwice(spy);
      spy.restore();
    });
  });
});
my spy reports:
notCalled: true
I'll expand my comment into an actual answer, hopefully that helps.
Your problem lies with asynchrony, not with the generator. You need isBlacklisted to return a promise you can wait on. Otherwise your assertion happens before the spy is called.
Something like this:
isBlacklisted(release, jobUUID) {
  let promises = names.map((name) => {
    return this._spawnPythonProcessGenerator(
      this.IS_BLACKLISTED_SCRIPT,
      name
    ).next().value
      .then((data) => {
        console.log(data);
      })
      .catch((err) => {
        this._errorEvent(release, name, err, jobUUID);
      });
  }, this);
  return Promise.all(promises);
}
Then, in your test:
return blacklist.isBlacklisted(release, uuid())
  .then(() => {
    sinon.assert.calledTwice(spy);
  });
Also... this isn't related to your problem, but your _spawnPythonProcessGenerator method doesn't need to be a generator. You're only ever using its first value by calling next like that, and you call the whole thing over again for each array item.
It will work the same if you take out the *, change yield to return, and skip the .next().value when you call it. You'll also probably want to rename it, because it's no longer a generator.
_spawnPythonProcess(scriptSrc, name) {
  const pythonProcess = this._childProcess.spawn(
    'python3',
    [...arguments]
  );
  return new Promise((resolve, reject) => {
    pythonProcess.stderr.on('data', (err) => {
      reject(err.toString());
    });
    pythonProcess.stdout.on('data', (data) => {
      resolve(data.toString());
    });
  });
}
When you call it:
let promises = names.map((name) => {
  return this._spawnPythonProcess(
    this.IS_BLACKLISTED_SCRIPT,
    name
  )
    .then((data) => {
      console.log(data);
    })
    .catch((err) => {
      this._errorEvent(release, name, err, jobUUID);
    });
}, this);
return Promise.all(promises);

Updating many (100k+) documents in the most efficient way in MongoDB

I have a function that runs periodically and updates the item.price of some documents in my Prices collection. The Prices collection has 100k+ items. The function looks like this:
//Just a helper function for multiple GET requests with request.
let _request = (urls, cb) => {
  let results = {}, i = urls.length, c = 0;
  let handler = (err, response, body) => {
    let url = response.request.uri.href;
    results[url] = { err, response, body };
    if (++c === urls.length) {
      cb(results);
    }
  };
  while (i--) {
    request(urls[i], handler);
  }
};

// function to update the prices in our Prices collection.
const update = (cb) => {
  Price.remove({}, (err, remove) => {
    if (err) {
      return logger.error(`Error removing items...`);
    }
    logger.info(`Removed all items... Beginning to update.`);
    _request(urls, (responses) => {
      let url, response, id;
      for (url in responses) {
        id = url.split('/')[5].split('?')[0];
        response = responses[url];
        if (response.err) {
          logger.error(`Error in request to ${url}: ${response.err}`);
          return;
        }
        if (response.body) {
          logger.info(`Request to ${url} successful.`);
          let jsonResult = {};
          try {
            jsonResult = JSON.parse(response.body);
          } catch (e) {
            logger.error(`Could not parse.`);
          }
          logger.info(`Response body for ${id} is ${Object.keys(jsonResult).length}.`);
          let allItemsArray = Object.keys(jsonResult).map((key, index) => {
            return {
              itemid: id,
              hash_name: key,
              price: jsonResult[key]
            };
          });
          Price.insertMany(allItemsArray).then(docs => {
            logger.info(`Saved docs for ${id}`);
          }, (e) => {
            logger.error(`Error saving docs.`);
          });
        }
      }
      if (cb && typeof cb == 'function') {
        cb();
      }
    });
  });
};
As you can see, to avoid iterating through 100k+ documents and updating each and every one of them separately, I delete them all at the beginning, then call the API that gives me these items with prices, and use insertMany to insert all of them into my Prices collection.
This updating process will happen every 30 minutes.
But I just now realised: what if some user wants to check the prices while my Prices collection is empty because it's in the middle of updating itself?
The Question
So do I have to iterate through all of them in order not to delete them? (Remember, there are MANY documents to be updated every 30 minutes.) Or is there another solution?
Here's a picture of how my Prices collection looks (there are 100k docs like these, I just want to update the price property):
Update:
I have re-written my update function a bit and now it looks like this:
const update = (cb = null) => {
  Price.remove({}, (err, remove) => {
    if (err) {
      return logger.error(`Error removing items...`);
    }
    logger.info(`Removed all items... Beginning to update.`);
    _request(urls, (responses) => {
      let url, response, gameid;
      for (url in responses) {
        gameid = url.split('/')[5].split('?')[0];
        response = responses[url];
        if (response.err) {
          logger.error(`Error in request to ${url}: ${response.err}`);
          return;
        }
        if (response.body) {
          logger.info(`Request to ${url} successful.`);
          let jsonResult = {};
          try {
            jsonResult = JSON.parse(response.body);
          } catch (e) {
            logger.error(`Could not parse.`);
          }
          logger.info(`Response body for ${gameid} is ${Object.keys(jsonResult).length}.`);
          let allItemsArray = Object.keys(jsonResult).map((key, index) => {
            return {
              game_id: gameid,
              market_hash_name: key,
              price: jsonResult[key]
            };
          });
          let bulk = Price.collection.initializeUnorderedBulkOp();
          allItemsArray.forEach(item => {
            bulk.find({ market_hash_name: item.market_hash_name })
              .upsert().updateOne(item);
          });
          bulk.execute((err, bulkers) => {
            if (err) {
              return logger.error(`Error bulking: ${err}`);
            }
            logger.info(`Updated Items for ${gameid}`);
          });
        }
      }
      if (cb && typeof cb == 'function') {
        cb();
      }
    });
  });
};
Notice the bulk variable now (thanks #Rahul), but now the collection takes ages to update. My processor is maxed out and it literally takes 3+ minutes to update 60k+ documents. I honestly feel like the previous method, even though it deletes all of them and then reinserts them, is 10x faster.
Anyone?
From my experience (updating millions of mongo docs on an hourly basis), here's a realistic approach to very large bulk updates:
do all your API calls separately and write the results into a file as newline-delimited JSON (mongoimport consumes JSON/CSV/TSV, not raw BSON)
invoke mongoimport and import that file into a new, empty collection prices_new. JavaScript, let alone high-level OO wrappers, is just too slow for that
rename prices_new -> prices with dropTarget=true (this is atomic, hence no downtime)
Schematically, it would look like this in JS
let fname = '/tmp/data.json';
let apiUrls = [...];

async function doRequest(url) {
  // perform a request and return an array of records
}

let responses = await Promise.all(apiUrls.map(doRequest));

// if the data is too big to fit in memory, use streams instead of this:
let data = flatMap(responses, recs => recs.map(r => JSON.stringify(r))).join('\n');
await fs.writeFile(fname, data);

await child_process.exec(`mongoimport --collection prices_new --drop ${fname}`);
await db.prices_new.renameCollection('prices', true);
There's no need to clear the database and do a fresh insert. You can use the bulkWrite() method for this, or the updateMany() method to do the updates.
You can refactor the existing code to:
const update = (cb) => {
  _request(urls, responses => {
    let bulkUpdateOps = [], gameid;
    Object.keys(responses).forEach(url => {
      let response = responses[url];
      gameid = url.split('/')[5].split('?')[0];
      if (response.err) {
        logger.error(`Error in request to ${url}: ${response.err}`);
        return;
      }
      if (response.body) {
        logger.info(`Request to ${url} successful.`);
        let jsonResult = {};
        try {
          jsonResult = JSON.parse(response.body);
        } catch (e) {
          logger.error(`Could not parse.`);
        }
        Object.keys(jsonResult).forEach(key => {
          bulkUpdateOps.push({
            "updateOne": {
              "filter": { market_hash_name: key },
              "update": { "$set": {
                game_id: gameid,
                price: jsonResult[key]
              } },
              "upsert": true
            }
          });
        });
      }
      if (bulkUpdateOps.length >= 1000) {
        Price.bulkWrite(bulkUpdateOps).then(result => {
          logger.info(`Updated Items`);
        }).catch(e => logger.error(`Error bulking: ${e}`));
        bulkUpdateOps = [];
      }
    });
    if (bulkUpdateOps.length > 0) {
      Price.bulkWrite(bulkUpdateOps).then(result => {
        logger.info(`Updated Items`);
      }).catch(e => logger.error(`Error bulking: ${e}`));
    }
    if (cb && typeof cb == 'function') {
      cb();
    }
  });
};
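As an aside on the updateMany() option mentioned above: it fits the simpler case where many documents receive the same change, rather than a distinct price per document as here. A hypothetical example, not a drop-in replacement for the per-item upserts:

// flag every price for one game as stale in a single round trip;
// one filter and one update document are applied to all matches
Price.updateMany(
  { game_id: gameid },
  { $set: { stale: true } }
).then(result => {
  // modifiedCount on recent mongoose/driver versions (nModified on older ones)
  logger.info(`Marked ${result.modifiedCount} docs stale`);
}).catch(e => logger.error(`updateMany failed: ${e}`));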
I have not tested anything, but you can try this; it might be helpful. I am using the bluebird library for concurrency.
let _request = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (err, response, body) => {
      if (err) {
        return reject(err);
      }
      resolve(body);
    });
  });
};

const formatResponse = async (response) => {
  // do stuff
  return {
    query: {}, // e.g. { market_hash_name: ... }
    body: {}
  };
};

const bulkUpsert = (allItemsArray) => {
  let bulk = Price.collection.initializeUnorderedBulkOp();
  return new Promise((resolve, reject) => {
    allItemsArray.forEach(item => {
      bulk.find(item.query).upsert().updateOne(item.body);
    });
    bulk.execute((err, bulkers) => {
      if (err) {
        return reject(err);
      }
      return resolve(bulkers);
    });
  });
};

const getAndUpdateData = async (urls) => {
  const allItemsArray = await Promise.all(urls.map(async (url) => {
    const requestData = await _request(url); // the requests within a chunk run in parallel
    const formattedData = await formatResponse(requestData); // returns { query: {}, body: {} }
    return formattedData;
  }));
  return bulkUpsert(allItemsArray);
};

function update() {
  // split urls into chunks as per your need, 100/1000
  var i, j, chunkUrls = [],
    chunk = 100;
  for (i = 0, j = urls.length; i < j; i += chunk) {
    chunkUrls.push(urls.slice(i, i + chunk));
  }
  Bluebird.map(chunkUrls, function (chunk) {
    return getAndUpdateData(chunk);
  }, {
    concurrency: 1 // depends on concurrent requests; 1 = 100 requests fetched and inserted in db at a time
  }).then(function () {
    console.log("done");
  }).catch(function () {
    console.log("error");
  });
}

Hoisting an anonymous function expression that is an array item

This code needs to declare a function (preferably anonymous) inside the array:
({argObj}) => { console.log(`start`); }
and define it later, outside the request = (function () {...}()); IIFE.
request = (function () {
  const pathAfter = {
    start: ['homePage', 'GET', ({argObj}) => { console.log(`start`); }]
  };
  return {
    go: (argObj) => {
      if (!pathAfter[argObj.pathAfter]) return;
      const path = pathAfter[argObj.pathAfter][0];
      const method = pathAfter[argObj.pathAfter][1];
      const url = data.domain + path + data.ext;
      HTTP.call(method, url, (error, response) => {
        if (error) {
          console.log('error ' + error);
        } else {
          pathAfter[path][2]({response: response}); // <---- calls relevant method
          request.go({pathAfter: path});
        }
      });
      return true; // if rms successful
    }
  };
}());
// function definition goes here...
I am not sure how to do this. Thanks
I am not entirely clear on what you are trying to achieve and how strict the requirements are, but one option might be to give the request object the ability to add/extend the handlers in pathAfter:
request = (function () {
  const pathAfter = {
    start: ['homePage', 'GET', ({argObj}) => { console.log('start'); }]
  };
  return {
    go: (argObj) => {
      if (!pathAfter[argObj.pathAfter]) return;
      const path = pathAfter[argObj.pathAfter][0];
      const method = pathAfter[argObj.pathAfter][1];
      const url = data.domain + path + data.ext;
      HTTP.call(method, url, (error, response) => {
        if (error) {
          console.log('error ' + error);
        } else {
          pathAfter[path][2]({response: response}); // <---- calls relevant method
          request.go({pathAfter: path});
        }
      });
      return true; // if rms successful
    },
    registerPathHandler: (handlerName, handler) => {
      pathAfter[handlerName] = handler;
    }
  };
}());

request.registerPathHandler('test', ['testPage', 'GET', ({argObj}) => { console.log('test'); }]);
This will add a named handler, and it could be used to add the start handler as well. If the start handler really needs to be hard-coded inside, the code above could be modified to just replace the array element instead.
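For completeness, a sketch of that variant; setPathHandler is a hypothetical name, and it swaps only the function slot (index 2) of an existing entry, so the hard-coded start entry itself stays inside the IIFE:

// inside the object returned by the IIFE, alongside go:
setPathHandler: (name, fn) => {
  if (pathAfter[name]) pathAfter[name][2] = fn; // replace only the handler function
}

// later, outside the IIFE:
request.setPathHandler('start', ({argObj}) => {
  console.log('start, defined after the fact');
});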
