Scraping Cheerio - Getting only element[k] - javascript

Problem : With this code I get a list of every iteration of a given element (h1 here)
I would like to get only the number k (k can be a variable assumption) of this element.
Ultimately, I would like to get say, the element1[i], element2[j], where element1 can be a class, id, same for element2
And then concatenate element1,2,...n in a JSON file
I think this is very basic question, but I am quite new to jQuery...
'use strict';
var url = "http://www.website.html",
rech = "h1";
// Import the dependencies
var cheerio = require("cheerio"),
req = require("tinyreq");
// Define the scrape function
function scrape(url, data, cb) {
// 1. Create the request
req(url, (err, body) => {
if (err) { return cb(err); }
// 2. Parse the HTML
let $ = cheerio.load(body)
, pageData = {}
;
// 3. Extract the data
Object.keys(data).forEach(k => {
pageData[k] = $(data[k]).text();
});
// Send the data in the callback
cb(null, pageData);
});
}
// Extract some data from my website
scrape(url, {
rech
}, (err, data) => {
console.log(err || data);
});

Related

Using Request.js and Cheerio.js in Node/Express return empty array

I'm building a simple scraper using Request.js and Cheerio.js within Express. Right now I'm only looking for the title of the site. Instead scraping a website one by one, I put the list in an array. I parse through them and then use Cheerio.js to find the Title of the website. When I console log the titles, they come out fine, but I want to ultimately show them on a html page. Please note, I'm very new to programming so if you could provide detailed feedback, that would be incredibly helpful (below is the code I've been working on). Thanks in advance!
function parseSites(urls) {
var parsedSites = [];
urls.forEach(function(site) {
request(site, function(err, res, body) {
if(err) {
console.log(err);
} else {
var $ = cheerio.load(body);
parsedSites.push($('title').text());
}
}
});
});
return parsedSites;
}
Please refer to the below code for a working implementation
var request = require('request-promise')
var cheerio = require("cheerio")
function parseSites(urls, callback) {
var parsedSites = [];
var promiseList = urls.map(getPage)
Promise.all(promiseList).then(function (data) {
callback(data.map(parse))
})
return parsedSites;
}
function getPage(url) {
return request.get(url)
}
function parse(body) {
console.log("parsing body")
var $ = cheerio.load(body);
return $('title').text()
}
parseSites(['https://www.google.com','https://www.facebook.com'],function(data) {
console.log(data)
})
First you need to understand the difference between asynchronous and synchronous code. Lets see an example:
function testFor() {
for(let i=0;i<5;++i){
console.log(i);
}
}
-
console.log('start:');
testFor();
console.log('end:');
// Here you get the expected output because this code is synchronous.
//output:
start:
0
1
2
3
4
end:
-
console.log('start:');
setTimeout(testFor,1000);
console.log('end:');
// Here you don't get your expected output because setTimeout is asynchronous .
//output:
start:
end:
0
1
2
3
4
First the console.log('start:'); is called.
Then setTimeout(testFor,1000); (but it is async and the call
will execute in 1 second).
Immediately after the console.log('end:');
is called.
Finally 1 second after, the testFor() is executed and it
prints 0 1 2 3 4
The next point is that there is an error in your code!
function parseSites(urls) {
var parsedSites = [];
urls.forEach(function(site) {
request(site, function(err, res, body) {
if(err) {
console.log(err);
} else {
var $ = cheerio.load(body);
parsedSites.push($('title').text());
}
//} ! THIS bracket should be removed
});
});
return parsedSites;
}
So your problem is that the 'request' in the forEach loop is an async function that will call the callback 'function(err, res, body)' once there is a response from the web page.
My solutions for this:
'use strict'
const cheerio = require('cheerio');
const request = require('request');
const async = require('async');
const urls = ['http://stackoverflow.com/','http://hackaday.com/','https://www.raspberrypi.org/','https://cheerio.js.org/'];
//SOLUTION 1: do what you need to do when all calls are done using recursion
let i=0;
let parsedSites = [];
parseSites(urls[i],parsedSites);
function finalCall(sites) {
console.log(sites);
}
function parseSites(site,parsedSites) {
++i;
request(site, function(err, res, body) {
if(err) {
console.log(err);
} else {
let $ = cheerio.load(body);
let title = $('title').text();
console.log(title);
parsedSites.push(title);
}
if(i<urls.length){
parseSites(urls[i],parsedSites);// recursive call;
}
else{
finalCall(parsedSites);// when all sites are done.
}
});
//return parsedSites;// cant return! we are in async calls!
}
//SOLUTION 2: do what you need to do when all calls are done using 'async'
parseSites(urls);
function finalCall(sites) {
console.log(sites);
}
function parseSites(urls) {
let parsedSites = [];
async.each(urls,function parseSite(site, callback) {
request(site, function (err, res, body) {
if (err) {
callback(err);
} else {
let $ = cheerio.load(body);
parsedSites.push($('title').text());
callback();
}
})
},function (err) {
if(err) console.log(err);
else finalCall(parsedSites);
});
}
Async github page
Async example

Express and Mongodb insert same data multiple times

I am quite new to Express and Mongodb. The project that I am working on requires me to:
Take an object that contains multiple url
Download the content of the url and save it to a cloud storage
Generate links for each of the file saved
Save these links into Mongodb as individual documents
The incoming object looks something like this:
{
"id" : 12345678,
"attachments" : [
{
"original_url" : "https://example.com/1.png",
},
{
"original_url" : "https://example.com/2.png",
},
{
"original_url" : "https://example.com/3.png",
}
]
}
the end goal is to have 3 separate document like this saved on mongodb:
{
"id" : 87654321,
"some_other_data": "etc",
"new_url" : "https://mycloudstorage.com/name_1.png"
}
I have a simple loop like this:
for(var i = 0; i < original_data.attachments.length; i++){
var new_url = "https://example.com/" + i + ".png";
download(original_url, new_url, function(new_url){
console.log(new_url)
new_data.new_url = new_url;
save_new_url_to_mongodb(new_data);
});
}
and the saving function looks like this:
function save_new_url_to_mongodb (data, cb) {
getCollection(collection, function (err, collection) {
if (err) {
return cb(err);
}
collection.insert(data, {w: 1, ordered: false}, function (err, result) {
if (err) {
return cb(err);
}
var item = fromMongo(result.ops);
cb(null, item);
});
});
}
var download = function(original_url, new_url, callback){
request.head(original_url, function(err, res, body){
if(res === undefined){
console.log(err);
} else {
var localUrlStream = request(original_url);
var file = bucket.file(new_url);
var remoteWriteStream = file.createWriteStream();
var stream = localUrlStream.pipe(remoteWriteStream);
stream.on('error', function (err) {
next(err);
});
stream.on('finish', function(){
callback(new_url);
});
}
});
};
The downloading part is fine, I get 3 different image files in my cloud storage. The console.log also gives me 3 different new urls.
The problem is that the newly saved mongodb document all have the same new_url. And sometimes if there are more original_url in the original data, some of the new documents would fail to save.
Thanks a lot
It's a scoping issue in your assignment of new_url in the for loop. See here: JavaScript closure inside loops – simple practical example
A solution is to use Array.Prototype.forEach which inherently solves the scope issue since each iteration creates a closure for the callback
original_data.attachments.forEach(function(i) {
var new_url = "https://example.com/" + i + ".png";
download(original_url, new_url, function(new_url){
console.log(new_url)
new_data.new_url = new_url;
save_new_url_to_mongodb(new_data);
});
})

Select using recursivity to iterate node js

i'm iterating trought a mysql database with 20000 rows , the problem is that in each roww i need to call a url and get its content and update the database, the problem is... how do i wait the whole process to continue the iteration?, and how can i make it faster like , do two at time , do tree at time?. thanks
var query = connection.query('SELECT * from product where product.product_description = "0" ', function(err, rows, fields) {
kontador =1;
if (!err)
{
var url = rows[0].url;
url = url.replace('../..','');
//console.log(url);
id = rows[0].id;
url = 'http://example.com'+url;
doCall(url,id,kontador,function(response){
console.log(response,kontador);
if(response && kontador <= rows.length){
var url = rows[kontador].url;
url = url.replace('../..','');
id = rows[kontador].id;
url = 'http://www.example2.com'+url;
//console.log(id);
doCall(url,id, kontador, doCall);
kontador +=1;
}
});
}
else
console.log('Error while performing Query.');
});
function doCall(urlToCall,id,kontador, callback)
{
request({'url':urlToCall}, function(error, response, html){
//console.log('inside');
//console.log(error);
if(!error){
var $ = cheerio.load(html);
$('#content').filter(function(){
var data = $(this);
data = data.find('p');
// console.log('-');
// console.log(data.html());
var queryy = connection.query(' UPDATE product SET product_description = "'+data.html()+'" WHERE id = '+id, function(err, rows, fields) {
if (!err)
{
console.log('updated! ');
return callback(true);
}else{
console.log('error sql!');
}
});
//process.exit();
});
}
});
}
In order to orchestrate the async behavior of your application (what can be done in parallel, should there be throttling, ..) you should use an existing library like :
async - https://www.npmjs.com/package/async - if you prefer node.js callback style
bluebird - http://bluebirdjs.com/docs/getting-started.html - if you prefer promises
highland - https://www.npmjs.com/package/highland - somewhat hybrid + stream like
There are many other libraries that can help you build complex async call graphs.

node.js Wait for task to be finished

So I am writing this node.js program to import XML files into arrays of JSON objects. I got 2 files to import, teachers.xml and students.xml.
Teachers and student contains several thousands of info about teachers and students. I got that part pretty well covered with my code.
This is my javascript file for parsing the files :
var fs = require('fs');
var xmldom = require('xmldom');
var async = require('async');
// Parse `catalog` xml file and call `callback(catalog domxml root)`
function parse_catalog(catalog, callback) {
// Read xml file content
fs.readFile(catalog, function (err, data) {
if (err) {
throw 'Error reading XML catalog';
} else {
// Parse xml content and return dom root
var domRoot = new xmldom.DOMParser().parseFromString(data.toString());
// Call callback passing dom root
callback(domRoot)
}
});
}
I got two methods like this to convert the xml to json and its working perfectly (one for teachers and one for students)
// Convert teacher XML into json format in a array
function convert_teachers(domCatalog) {
var teachers = domCatalog.getElementsByTagName('teacher');
var teachers_arr= [];
for (var i = 0; i < teachers .length; i++) {
var teacher= teachers[i];
...
//Reading the xml
teachers_arr.push({
...
//Create the json
});
}
console.log("Teachers are now in JSON format ");
}
So in the end all I have to do is this :
parse_catalog('teachers.xml', convert_teachers);
When I do this :
parse_catalog('teachers.xml', convert_teachers);
parse_catalog('students.xml', convert_students);
One or the other will finish first depending on the number of elements to import, witch is normal I think.
What I want is to wait for both to be imported and then do some javascript manipulations and this is where I am stuck.
I tried to do this with async, but it doesnt not wait until the two files are finished to import.
async.parallel([
function(callback) {
parse_catalog('teachers.xml', convert_teachers);
callback();
},
function(callback) {
parse_catalog('students.xml', convert_students);
callback();
}
], function(err) {
if (err) return next(err);
console.log("Finished");
//Enventually some Javascript manipulations on the two arrays
});
Actually it outputs :
Finished
Teachers are now in JSON format
Students are now in JSON format
or depending of the file size
Finished
Students are now in JSON format
Teachers are now in JSON format
What I would like is more like :
Teachers are now in JSON format (or students)
Students are now in JSON format (or teachers)
Finished
I plan to load 2 more files, and the order they will be loaded doesn't matter to me.
Any leads ? Thanks!
You're executing callback() in your async.parallel() functions too soon because by that time fs.readFile() hasn't even started yet. Try something like this instead:
function parse_catalog(catalog, callback) {
// Read xml file content
fs.readFile(catalog, function(err, data) {
if (err)
return callback(err);
// Parse xml content and return dom root
var domRoot = new xmldom.DOMParser().parseFromString(data.toString());
// Call callback passing dom root
callback(null, domRoot);
});
}
// Convert teacher XML into json format in a array
function convert_teachers(domCatalog) {
var teachers = domCatalog.getElementsByTagName('teacher');
var teachers_arr = [];
for (var i = 0; i < teachers .length; i++) {
var teacher = teachers[i];
...
//Reading the xml
teachers_arr.push({
...
//Create the json
});
}
console.log('Teachers are now in JSON format');
return teachers_arr;
}
// and similarly for `convert_students`
async.parallel({
teachers: function(callback) {
parse_catalog('teachers.xml', function(err, domCatalog) {
if (err)
return callback(err);
var teachers = convert_teachers(domCatalog);
callback(null, teachers);
});
},
students: function(callback) {
parse_catalog('students.xml', function(err, domCatalog) {
if (err)
return callback(err);
var students = convert_students(domCatalog);
callback(null, students);
});
}
}, function(err, results) {
if (err) return next(err);
console.log('Finished');
// here you have `results.teachers` and `results.students`
console.dir(results);
});

returning a variable form an async function

I have a module with a function which generates the value for a vaariable for a variable "stitcheBook". I can see and use this value using a callback.
However, I want to have this value available to me as a module property. How can i achieve this?
Note: I wish the output of the _BookStitcher.stitchAllStories function to go into the _BookStitcher.stitchedBook property.
module.exports = _BookStitcher = (function() {
var db = require('../modules/db');
var stitchedBook = {};
var stitchAllStories = function(callback) {
db.dbConnection.smembers("storyIdSet", function (err, reply) {
if (err) throw err;
else {
var storyList = reply;
console.log(storyList);
// start a separate multi command queue
multi = db.dbConnection.multi();
for (var i=0; i<storyList.length; i++) {
multi.hgetall('story/' + String(storyList[i]) + '/properties');
};
// drains multi queue and runs atomically
multi.exec(function (err, replies) {
stitchedBook = replies;
// console.log(stitchedBook);
callback(stitchedBook);
});
};
});
};
return {
stitchedBook : stitchedBook,
stitchAllStories: stitchAllStories
}
})();
EDIT: to add: I know that I can actually set the value from outside by doing something like this;
_BookStitcher.stitchAllStories(function (reply) {
console.log("Book has been stitched!\n\n")
console.log("the Book is;\n");
console.log(reply);
_BookStitcher.stitchedBook = reply;
console.log("-------------------------------------------------------------------------\n\n\n");
console.log(_BookStitcher.stitchedBook);
});
I was wondering if there was a way of doing it from inside the _BookStitcher module itself.
You could take advantage of how object references work in JavaScript, and assign it to a property:
module.exports = _BookStitcher = (function() {
var db = require('../modules/db');
// CHANGE HERE
var stitched = { book: null };
var stitchAllStories = function(callback) {
db.dbConnection.smembers("storyIdSet", function (err, reply) {
if (err) throw err;
else {
var storyList = reply;
console.log(storyList);
// start a separate multi command queue
multi = db.dbConnection.multi();
for (var i=0; i<storyList.length; i++) {
multi.hgetall('story/' + String(storyList[i]) + '/properties');
};
// drains multi queue and runs atomically
multi.exec(function (err, replies) {
// CHANGE HERE
stitched.book = replies;
// console.log(stitchedBook);
callback(replies);
});
};
});
};
return {
stitched : stitched,
stitchAllStories: stitchAllStories
};
}());
So instead of having it inside _BookStitcher.stitchedBook, you'd have it at _BookStitcher.stitched.book.
But that looks awful, and I'd never use it! You can't know when the value will be available, it's only safe to use it from the callback, when you're sure it's been set.

Categories

Resources