I've written a scraper using the scraper module and the queue function of async module.
I read the list of URLs to scrap from a json file and write the informations in another JSON file.
This is my script:
var fs = require("fs");
var scrap = require("scrap"),
async = require("async");
var errors = [];
// Queue a list of URLs
var queue = JSON.parse(fs.readFileSync("products.json", "utf8"));
var len = queue.products.length;
var q = async.queue(function (url, done) {
scrap(url, function(err, $) {
var product = {};
product.name = $("#page-body h2").first().text().trim();
product.myarr = [];
product.picture = $(".content img").first().attr("src");
try {
if (product.picture.indexOf("someword") > 1) {
delete product.picture;
}
}
catch (e) {
console.error(e);
}
$(".content [style^=\"color: #\"] [style=\"font-weight: bold\"], .content [style=\"font-weight: bold\"] [style^=\"color: #\"]").each(function() {
product.myarr.push($(this).text().trim().toLowerCase());
});
if (product.myarr.length) {
fs.appendFile("products-parsed.json", JSON.stringify(product) + ",\n", function (err) {
console.log(queue.products.indexOf(url), len, err);
if (err) { errors.push(queue.products.indexOf(url)); }
done();
});
}
});
}, 20);
q.drain = function() {
console.log(errors);
};
q.push(queue.products);
When I run it, after about 3.000 pages, it stops (quit) and it does not give any error, I have to start from the latest worked page using:
q.push(queue.products.slice(lastWorkedPage, queue.products.length - 1));
How can I fix this problem?
Not sure why, by the way seems like the problem was caused by this row:
console.log(queue.products.indexOf(url), len, err);
Commenting it has solved the problem, feel free to give a more accurate answer which explains the solution and I'll set it as accepted.
Can't comment yet so I have to post new answer.
I can confirm console.log - error. NodeJS/Express sometimes just stops when trying to use console.log()!
Code from one test-project:
console.log(req.body.credentials.password, isMatch);
if (isMatch) {
sess.currentUser = user;
console.log(user);
res.send({ status: "ok", loginUser: user });
}
else {
res.send({ status : "error", msg: "Login failed!" });
}
Second logging line (console.log(user)) stops NodeJS without error! And that happens only in some environments - in most dev environments this works just fine!
Related
So, I am currently developing some functionality on one of my projects at work. The project is using JavaScript mainly - Node.JS for backend and React.JS for frontend, and I have to admit I am not experienced with either of them. I believe that the code I am writing could look much better and work more efficient if I utilised promises or async/await functionality (prior to asking the question here I read few articles about them, and I am still not sure how to use them in the project the way it actually makes sense, hence I decided to ask community here). I also had a glance at this article, but again I am not sure whether my implementation actually does anything StackOverflow.
At the end of this post I am going to paste some code from both front and backend and hopefully someone will be able to point me into a right direction. To make things clear - I am not asking for anybody to rewrite the code for me, but to explain what it is I'm doing wrong (or not doing at all).
Use case:
User writes a company name in the search bar on the website. Typed string is then sent to the backend via http-request and the database is checked for the entry (to get the company's logo) - here I am running an algorithm to check for spelling mistakes and propose similar names to the one typed, as a result the database may be queried more than 2 times before the result is sent back, but it's always working fine.
Once the response is received by the frontend few things should happen - to start with another request should be sent to the web in order to receive other results. If correct results are received, that should be the end of the function, otherwise it should send another request, to google this time, to get the results from there.
Backend Code:
.post('/logo', (req, res) => {
res.header("Access-Control-Allow-Origin", "*");
res.header("Access-Control-Allow-Headers", "Origin, X-Requested-With, Content-Type, Accept");
if (req.body.key !== "" && req.body.key.trim().length > 0) {
let results = {};
let proposedNames = [];
var promise1 = new Promise((resolve, reject) => {
let getLogo = "SELECT title, img_dir FROM logo_approved WHERE LOWER(title) LIKE LOWER($1)";
let searchedCompanyName = ["%"+req.body.key+"%"];
db.queryDB(getLogo, searchedCompanyName, (data) => {
if (data.rows.length > 0){
results.databaseResults = data.rows;
}
resolve(data.rows);
});
});
// Returns the list of all companies' names from the database
var promise2 = new Promise((resolve, reject) => {
let returnAllNames = "SELECT title, img_dir as img FROM logo_approved";
db.queryDB(returnAllNames, [], (data) =>{
// Compare searched company's name with all names from the database
data.rows.forEach(function(element) {
// If name from the database is similar to the one searched for
// It's saved in propsedNames array and will be used later on for database query
if (jw.distance(req.body.key, element.title) > 0.7){
element.probability = parseFloat(jw.distance(req.body.key, element.title).toFixed(2));
proposedNames.push(element);
}
})
resolve(proposedNames);
});
proposedNames.sort(function(a,b){return a.distance-b.distance});
results.proposedNames = proposedNames;
});
var promiseAll = Promise.all([promise1, promise2]);
promiseAll.then(() => {
res.send(results);
});
}
else {
res.status(400);
res.send("Can't search for an empty name");
}
})
Frontend code:
engraveLogoInputHandler() {
let results = {};
let loadedFromWeb = false, loadedFromClearbit = false, loadedFromDatabase = false;
this.setState({
engraveLogo: this.engravingLogo.value.length
});
// charsElthis.engravingInput.value
if (inputLogoTimer) {
clearTimeout(inputLogoTimer);
// inputLogoTimer = null;
}
if (this.engravingLogo.value !== ''){
// Wait to see if there is any new input coming soon, only render once finished to prevent lag
inputLogoTimer = setTimeout(() => {
request.post({url: NODEENDPOINT+'/logo', form: {key: this.engravingLogo.value}}, (err, res, body) => {
if (err){
console.log(err);
}
else {
if (res.body && res.statusCode !== 400){
results.database = JSON.parse(res.body);
loadedFromDatabase = true;
}
}
});
request(link+(this.engravingLogo.value), (err, res, body) => {
if (err) {
console.log(err);
}
else {
let jsonBody = JSON.parse(body);
if (jsonBody && !jsonBody.error){
let sources = [];
let data = JSON.parse(body);
for (let item of data) {
sources.push({
domain: item.domain,
image: item.logo+'?size=512&grayscale=true',
title: item.name
});
}
loadedFromClearbit = true;
results.clearbit = sources;
}
}
});
if (!loadedFromClearbit && !loadedFromDatabase){
request('https://www.googleapis.com/customsearch/v1?prettyPrint=false&fields=items(title,displayLink)&key='+GOOGLE_CSE_API_KEY+'&cx='+GOOGLE_CSE_ID+'&q='+encodeURIComponent(this.engravingLogo.value), { json: true }, (err, res, body) => {
if (err) {
console.error(err);
}
else {
if (body && body.items) {
let sources = [];
for (let s of body.items) {
sources.push({
domain: s.displayLink,
image: 'https://logo.clearbit.com/'+s.displayLink+'?size=512&greyscale=true',
title: s.title
});
}
loadedFromWeb = true;
results.googleSearches = sources;
} else {
console.error(body);
}
}
});
}
console.log("Results: ", results);
if (loadedFromClearbit || loadedFromWeb){
console.log("Propose the logo to be saved in a local database");
}
}, 500);}
}
So, in regarding to the backend code, is my implementation of promises actually correct there, and is it usefull? Could I use something similar for the front end and put the first two requests in Promise, and run the third request only if those two fail? (and failing means that they return empty results).
I thought I could use logic like this (see below) to catch if the promise failed, but that didn't work and I got an error saying I didn't catch the rejection:
var promise1 = new Promise((resolve, reject) => {
// ... some logic there
else {
reject();
}
});
var promise2 = promise1.catch(() => {
new Promise((resolve, reject) => {
// some logic for 2nd promise
});
});
Any answer is appreciated. As mentioned, I'm not very familiar with JavaScript, and this is the first asynchronous project I am working on, so I want to make sure I utilise and adapt the correct behaviour and methods.
Thanks
For the life of me I cannot work this one out. Have look around and tried many many different ways of trying to get this to go. Currently have the following code.
var config = require("./config.js");
var cradle = require('cradle')
var MikroNode = require('mikronode');
var WebServer = require('./bin/www');
var Routers = "Hasnt changed";
var conndb = new(cradle.Connection)(config.couchdb.host);
var db = conndb.database(config.couchdb.db);
db.exists(function(err, exists){
if (err) { console.log('error', err);}
else if (exists) { console.log('Seems the Force is with you - Database Exists');}
else { db.create(); }
});
db.temporaryView({
map: function (doc){
if (doc.type=='ConfigRouter') emit(doc.name, doc);
}
}, function (err, res){
Routers = JSON.stringify(res);
}
);
console.log(Routers);
As it stands it will respond with:
E:\Dev\MM>npm start
> MM#0.0.1 start E:\Dev\MM
> node ./Start.js
Hasnt changed
Seems the Force is with you - Database Exists
I am assuming it is an asynchronous call to the CouchDB and is not filling the result in time before it displays the result. How do I get around this issue?
You are right, the call is asynchronous so when console.log(Routers); is processed, Routers is "Hasnt changed".
One way of doing it would be to use promises thanks to the Q npm module:
var Q = require('q');
var deferred = Q.defer();
db.temporaryView({
map: function (doc) {
if (doc.type=='ConfigRouter') emit(doc.name, doc);
}
}, function (err, res) {
deferred.resolve(JSON.stringify(res));
});
deferred.promise
.then(function (data) {
Routers = data;
console.log(Routers);
// do some stuff...
})
.done();
Maybe it's possible to do something better without using Q.defer and adapting directly the callback:
https://github.com/kriskowal/q#adapting-node
I created a Node.js script that creates a large array of randomly generated test data and I want to write it to a Redis DB. I am using the redis client library and the async library. Initially, I tried executing a redisClient.hset(...) command within the for loop that generates my test data, but after some Googling, I learned the Redis method is asynchronous while the for loop is synchronous. After seeing some questions on StackOverflow, I can't get it to work the way I want.
I can write to Redis without a problem with a small array or larger, such as one with 100,000 items. However, it does not work well when I have an array of 5,000,000 items. I end up not having enough memory because the redis commands seem to be queueing up, but aren't executed until after async.each(...) is complete and the node process does not exit. How do I get the Redis client to actually execute the commands, as I call redisClient.hset(...)?
Here a fragment of the code I am working with.
var redis = require('redis');
var async = require('async');
var redisClient = redis.createClient(6379, '192.168.1.150');
var testData = generateTestData();
async.each(testData, function(item, callback) {
var someData = JSON.stringify(item.data);
redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
console.log("Item was persisted. Result: " +reply);
});
callback();
}, function(err) {
if (err) {
console.error(err);
} else {
console.log.info("Items have been persisted to Redis.");
}
});
You could call eachLimit to ensure you are not executing too many redisClient.hset calls at the same time.
To avoid overflowing the call stack you could do setTimeout(callback, 0); instead of calling the callback directly.
edit:
Forget what I said about setTimeout. All you need to do is call the callback at the right place. Like so:
redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
console.log("Item was persisted. Result: " +reply);
callback();
});
You may still want to use eachLimit and try out which limit works best.
By the way - async.each is supposed to be used only on code that schedules the invocation of the callback in the javascript event queue (e.g. timer, network, etc) . Never use it on code that calls the callback immediately as was the case in your original code.
edit:
You can implement your own eachLimit function that instead of an array takes a generator as it's first argument. Then you write a generator function to create the test data. For that to work, node needs to be run with "node --harmony code.js".
function eachLimit(generator, limit, iterator, callback) {
var isError = false, j;
function startNextSetOfActions() {
var elems = [];
for(var i = 0; i < limit; i++) {
j = generator.next();
if(j.done) break;
elems.push(j.value);
}
var activeActions = elems.length;
if(activeActions === 0) {
callback(null);
}
elems.forEach(function(elem) {
iterator(elem, function(err) {
if(isError) return;
else if(err) {
callback(err);
isError = true;
return;
}
activeActions--;
if(activeActions === 0) startNextSetOfActions();
});
});
}
startNextSetOfActions();
}
function* testData() {
while(...) {
yield new Data(...);
}
}
eachLimit(testData(), 10, function(item, callback) {
var someData = JSON.stringify(item.data);
redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
if(err) callback(err);
else {
console.log("Item was persisted. Result: " +reply);
callback();
}
});
}, function(err) {
if (err) {
console.error(err);
} else {
console.log.info("Items have been persisted to Redis.");
}
});
Okay, so my code is pulling data from a yelp business using their official API. My problem is that I can't seem to get the data to return out of the function. The problem isn't in ejs, it's that the data doesn't return when I tell it to! I just get undefined with some attempts, and with others (including the one I'm going to show here), I get an empty array. I'm pasting only the code that's important, let me know if you need more!
function yelp(){
var b = [];
var i = 0;
(struck the initialization of client)
client.business("(struck)", function(error, data) {
if (error != undefined){
res.send("an error occured. exiting");
process.process.reallyExit();
}
b[i++] = data.name;
b[i++] = data.display_phone;
b[i++] = data.rating;
console.log(b); //NEW!!
});
console.log(b);
return b;
}
app.get('/yelp', function(req,res){
var arr = yelp();
console.log(arr);
res.render('yelp.ejs', {title: 'Yelp!', arr: arr});
});
}
I added one more line of code, that I THINK may have narrowed down the problem to being related to my poor internet connection. I added ANOTHER console.log(b), this time inside of the business API call. the console.log(arr) is shows second, the console.log(b); just before the reutrn shows first, and LAST is the console.log(b) INSIDE the API call. It also took a good 30 seconds for that log to appear, and it appeared AFTER the page loaded. So, how do I go about making the page wait for the data? Or is this unrelated to my problem?
Without knowing their API, I do recognize the callback style. The result of the call to your client would then be in the data parameter in the callback, and thats where you want to render your view.
The following is not tested.
function yelp(cb) {
var b = [];
var i = 0;
// (struck the initialization of client)
client.business("(struck)", function(error, data) {
if (error) {
return cb(error);
}
b[i++] = data.name;
b[i++] = data.display_phone;
b[i++] = data.rating;
});
console.log(b);
cb(null, b)
}
app.get('/yelp', function(req, res) {
yelp(function(err, arr) {
if (err) {
res.send("an error occured. exiting");
process.process.reallyExit();
return;
}
console.log(arr);
res.render('yelp.ejs', {
title: 'Yelp!',
arr: arr
});
});
});
The thing to notice here, is the callback passing. This is normally how you do
async work in Node.js. It can be made a bit prettier using promises.
nodejs is async, meaning the app won't wait for the yelp() function to return. you can pass the yelp function a callback like so:
function yelp(callback){
var b = [];
var i = 0;
(struck the initialization of client)
client.business("(struck)", function(error, data) {
if (error != undefined){
res.send("an error occured. exiting");
process.process.reallyExit();
}
b[i++] = data.name;
b[i++] = data.display_phone;
b[i++] = data.rating;
});
console.log(b);
callback(b);
}
yelp(funtion(arr) {
res.render('yelp.ejs', {title: 'Yelp!', arr: arr});
})
You are expecting sync things to happen, but it's async. Your client.business method takes in a callback as it's second argument which isn't returning by the time res.render gets called.
Try this:
function yelp(callback) {
var b = [];
var i = 0;
client.business("(struck)", function(error, data) {
if (error != undefined){
res.send("an error occured. exiting");
process.process.reallyExit();
}
b[i++] = data.name;
b[i++] = data.display_phone;
b[i++] = data.rating;
// No returns in async. Just call the callback.
callback('yelp.ejs', { {title: 'Yelp!', arr: b}})
});
}
app.get('/yelp', function(req,res){
yelp(res.render);
});
I am using Javascript, webdriverio (v2.1.2) to perform some data extraction from an internal site. So the idea is
Authenticate
Open the required URL, when authenticated
In the new page, search for an anchor tag having specific keyword
Once found, click on the anchor tag
Below is what I have tried and it works (last two points). I had to use Q and async to achieve it. I was hoping to use only Q to achieve it. Can someone help me, on how to achieve it using Q only ??
var EmployeeAllocationDetails = (function () {
'use stricy';
/*jslint nomen: true */
var Q = require('Q'),
async = require('async'),
_ead_name = 'Employee Allocation Details',
goToEadFromHome;
goToEadFromHome = function (browserClient) {
browserClient.pause(500);
var deferred = Q.defer();
browserClient.elements('table.rmg td.workListTD div.tab2 div.contentDiv>a', function (err, results) {
if (err) {
deferred.reject(new Error('Unable to get EAD page. ' + JSON.stringify(err)));
} else {
async.each(results.value, function (oneResult, callback) {
console.log('Processing: ' + JSON.stringify(oneResult));
browserClient.elementIdText(oneResult.ELEMENT, function (err, result) {
if (err) {
if (err.message.indexOf('referenced element is no longer attached to the DOM') > -1 ){
callback();
} else {
callback('Error while processing :' + JSON.stringify(oneResult) + '. ' + err);
}
} else if(!result){
console.log('result undefined. Cannot process: ' + JSON.stringify(oneResult));
callback();
} else if(result.value.trim() === _ead_name){
deferred.resolve(oneResult);
callback();
}
});
}, function (err) {
// if any of the processing produced an error, err would equal that error
if( err ) {
// One of the iterations produced an error.
// All processing will now stop.
console.log('A processing failed to process. ' + err);
} else {
console.log('All results have been processed successfully');
}
}); //end of async.each
}
});
return deferred.promise;
};
return {
launchEad : goToEadFromHome
}
})();
module.exports = EmployeeAllocationDetails;
Related Github Issue link https://github.com/webdriverio/webdriverio/issues/123
I think you should use async. I think your code is great. It runs everything in parallel and it handles error well.
If
If you want to remove async, there are several options:
use Q flow control
copy paste async's implementation
implement it yourself
If you try to use Q's flow control it will look something like this (pseudo-code):
var getTextActions = [];
function createAction(element){
return function(){
return element.getText();
}
}
each(elements, function(element){ getTextActions.push( createAction(element) ) });
Q.all(getTextActions).then(function(results) {
... iterate all results and resolve promise with first matching element..
} );
note this implementation has worse performance. It will first get the text from all elements, and only then try to resolve your promise. You implementation is better as it all runs in parallel.
I don't recommend implementing it yourself, but if you still want to, it will look something like this (pseudo-code):
var elementsCount = elements.length;
elements.each(function(element){
element.getText(function(err, result){
elementsCount --;
if ( !!err ) { logger.error(err); /** async handles this much better **/ }
if ( isThisTheElement(result) ) { deferred.resolve(result); }
if ( elementsCount == 0 ){ // in case we ran through all elements and didn't find any
deferred.resolve(null); // since deferred is resolved only once, calling this again if we found the item will have no effect
}
})
})
if something is unclear, or if I didn't hit the spot, let me know and I will improve the answer.