I am currently building a web scraper in NodeJS and I am facing a certain problem. After running my code, I receive this error:
undefined is not a valid uri or options object.
I am not sure how to bypass this error, I've looked at these examples: Example One, Example Two
Here is all my code:
// Dependencies: request (HTTP client), cheerio (server-side jQuery), url-parse.
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
// Entry point of the crawl.
var START_URL = "http://example.com";
// Set of already-visited URLs (object used as a set) plus a visit counter.
var pagesVisited = {};
var numPagesVisited = 0;
// Queue of URLs still to be crawled.
var pagesToVisit = [];
var url = new URL(START_URL);
// Origin used to absolutize relative links, e.g. "http://example.com".
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
// NOTE(review): setInterval already repeats every 5s on its own; combined
// with the extra setInterval inside crawl() this stacks timers that are
// never cleared.
setInterval(crawl,5000);
// Pop the next queued URL and visit it if it has not been seen yet.
// NOTE(review): pop() on an empty queue yields undefined, which then flows
// into visitPage/request — the source of the reported
// "undefined is not a valid uri or options object" error.
function crawl() {
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
// NOTE(review): this registers ANOTHER repeating interval each time rather
// than rescheduling once; setTimeout would be the usual choice here.
setInterval(crawl,5000);
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
// Fetch a page, mark it visited, harvest its relative links, then continue
// the crawl via `callback`.
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
// NOTE(review): when the request fails, `response` is undefined and the
// next line throws — `error` should be checked before touching response.
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
console.log(response.statusCode);
callback();
return;
}else{
console.log(error);
}
// Parse the document body
var $ = cheerio.load(body);
collectInternalLinks($);
// In this short program, our callback is just calling crawl()
callback();
});
}
// Collect every same-site (relative) link on the page and queue it for crawling.
function collectInternalLinks($) {
  var anchors = $("a[href^='/']");
  console.log("Found " + anchors.length + " relative links on page");
  anchors.each(function () {
    var href = $(this).attr('href');
    pagesToVisit.push(baseUrl + href);
  });
}
Once your pagesToVisit empties, the url will be undefined since calling pop on an empty array returns this value.
I would add a check in visitPage that url is not undefined, e.g.
function visitPage(url, callback) {
if (!url) {
// We're done
return;
}
Or in crawl, check that pagesToVisit has elements, e.g.
// Pop the next queued URL and either finish, reschedule, or visit it.
function crawl() {
  var nextPage = pagesToVisit.pop();
  if (!nextPage) {
    // We're done!
    console.log('Crawl complete!');
    return;
  }
  if (nextPage in pagesVisited) {
    // Already seen this page — schedule another crawl pass
    setInterval(crawl, 5000);
    return;
  }
  // New page we haven't visited
  visitPage(nextPage, crawl);
}
Taking hints from Terry Lennox's answer, I modified the crawl() function slightly:
// Revised crawl(): only visit when the popped URL is truthy.
function crawl() {
var nextPage = pagesToVisit.pop();
// NOTE(review): the truthiness check sits in the SECOND branch, so an
// undefined nextPage is still evaluated with `in` first (coerced to the
// string key "undefined"); checking !nextPage first would be safer.
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
setInterval(crawl, 5000);
} else if(nextPage) {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
All I am doing is checking whether the popped element exists before calling visitPage().
I get the following output:
Visiting page http://example.com
Status code: 200
response.statusCode: 200
null
Found 0 relative links on page
^C
Related
In the code below, I try to return arr with details in it, but I think it's empty, because of the request. What can I do to make this work?
// Scrape timeanddate.com for a country's weather quick-look titles.
// NOTE(review): request() is asynchronous — the function returns `arr`
// before the HTTP callback has run, so callers always receive an empty
// array. This is the bug discussed below; a callback or Promise is needed.
module.exports = function getWeather(country) {
var arr = [];
var pageToVisit = "https://www.timeanddate.com/weather/" + country;
console.log("Visiting page " + pageToVisit);
request(pageToVisit, function(error, response, body) {
if (error) {
console.log("Error: " + error);
}
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if (response.statusCode === 200) {
// Parse the document body
var $ = cheerio.load(body);
console.log("Page title: " + $('title').text());
$('div.bk-focus__qlook').each(function(index) {
var title = $(this).find('div.h2').text().trim();
//var link = $(this).find('div.h1').attr('href');
console.log('title: ' + title);
//console.log(link);
arr.push(title)
});
}
});
// Runs immediately, before the request callback has populated arr.
return arr;
}
The arr is always empty, and I can't add the title to the arr. How can I wait for it?
I would write a function :
function pagetovisit( parameters if necessary) {
// logic whatever you want to do
}
// Sketch of awaiting the scrape result with async/await.
// NOTE(review): this only works if pagetovisit() returns a Promise.
async function asyncCall() {
/// stock the return value in a var
const result = await pagetovisit();
console.log(result);
// expected output: "resolved"
}
It does not look like that website exposes a public-facing weather API. It looks like a pre-rendered HTML page served from the server.
The API information does not even include weather: https://www.timeanddate.com/services/api/
If you were to return a list of scraped text, you would need to make your function asynchronous and return the Promise like so:
// Parse an HTML string into a Document (browser-only: relies on DOMParser).
// Fix: the parameter is `htmlText`, but the body referenced an undefined
// `html`, which would throw a ReferenceError on every call.
const parseHTML = (htmlText) =>
  new DOMParser().parseFromString(htmlText, 'text/html');
// Fetch the timeanddate.com weather page for a country/city and resolve
// with the trimmed text of each quick-look temperature block.
// Fix: the original arrow function was missing `=>`, a syntax error.
const getWeather = async (country, city) => {
  var pageToVisit = `https://www.timeanddate.com/weather/${country}/${city}`
  console.log(`Visiting page: ${pageToVisit}`);
  return fetch(pageToVisit, {
    method: 'GET',
    headers: {
      'Content-Type': 'text/html',
    }
  })
    .then(response => response.text())
    .then(parseHTML)
    .then(doc =>
      // One entry per quick-look block; only the div.h2 temperature text is kept.
      [...doc.querySelectorAll('div.bk-focus__qlook')].map(qlook =>
        qlook.querySelector('div.h2').textContent.trim()));
};
module.exports = getWeather;
// https://www.timeanddate.com/weather/usa/chicago
// NOTE(review): top-level await requires an ES module; alongside the
// CommonJS module.exports above, this line would be a syntax error.
const weather = await getWeather('usa', 'chicago'); // Should not be empty
I would ditch this website, because:
Scraping is inefficient
There are better (and free) weather APIs out there
Thank you all — I used a callback function and it works.
// Module dependencies for the callback-based weather scraper.
// Fix: `request` is called below but was never required in this snippet.
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var express = require('express');
// Default country used by the demo call at the bottom of the file.
var country = "canada";
module.exports= function getWeather(country,callback){
var arr=[];
arr.push(country)
var pageToVisit = "https://www.timeanddate.com/weather/"+country;
console.log("Visiting page " + pageToVisit);
request(pageToVisit, function(error, response, body) {
if(error) {
callback(error);
}
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode === 200) {
// Parse the document body
var $ = cheerio.load(body);
console.log("Page title: " + $('title').text());
$('div.bk-focus__qlook').each(function( index ) {
var temp = $(this).find('div.h2').text().trim();
var wind = $(this).find('p').text().trim();
var index=wind.search("Wind")
wind=wind.substring(index);
arr.push(temp)
arr.push(wind)
});
}
callback(null, arr);
});
}
// Kick off a weather lookup for the configured country and print the outcome.
getWeather(country, function (err, result) {
  if (err) {
    console.log(err);
    return;
  }
  console.log(result);
});
I am trying to write a script that, when fed a site's URL, scrapes through all the links and checks whether every page uses the Helvetica font, so I came up with the below script (partly copied from online).
// Dependencies and crawl state for the font-checking crawler.
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var START_URL = "http://example.com/";
// Font name to look for on each crawled page.
var SEARCH_FONT = "helvetica";
// Safety cap on the number of pages fetched.
var MAX_PAGES_TO_VISIT = 100000;
// Visited-URL set, visit counter, and pending queue.
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
// Origin used to absolutize relative links.
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
crawl();
// Visit queued pages until the page budget is exhausted.
function crawl() {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }
  var next = pagesToVisit.pop();
  if (next in pagesVisited) {
    // Already visited — skip it and keep crawling
    crawl();
    return;
  }
  // New page: fetch it, then continue crawling via the callback
  visitPage(next, crawl);
}
// Fetch a page, search it for the target font, and either report a hit or
// queue its internal links and continue the crawl.
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
// NOTE(review): `response` is undefined when `error` is set; the error
// should be checked before response.statusCode is read.
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
// NOTE(review): searchForHelvetica has no return statement, so this is
// always undefined and the else branch below always runs.
var helveticaFound = searchForHelvetica($, SEARCH_FONT);
if(helveticaFound) {
console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
} else {
collectInternalLinks($);
// In this short program, our callback is just calling crawl()
callback();
}
});
}
// Intended to report whether `word` is used as a font on the page.
// NOTE(review): cheerio only parses markup — it does not apply CSS — so
// .css('fontFamily') is undefined here (the question's problem); the
// function also returns nothing, so callers always see undefined.
function searchForHelvetica($, word) {
var bodyText = $('*').each( function(i , e) { console.log($(e).css('fontFamily')) } );
}
// Queue every relative link found on the current page.
function collectInternalLinks($) {
  var relativeLinks = $("a[href^='/']");
  console.log("Found " + relativeLinks.length + " relative links on page");
  relativeLinks.each(function (i, el) {
    pagesToVisit.push(baseUrl + $(el).attr('href'));
  });
}
The problem i am facing is in the below function:
function searchForHelvetica($, word) {
var bodyText = $('*').each( function(i , e) { console.log($(e).css('fontFamily')) } );
}
The line console.log($(e).css('fontFamily')) will always return undefined. I believe cheerio has no way to access the css of an element. How can i work around this ? How do i get the css of each element on the page , run through each element and then check if the helvetica font is used anywhere on the page and return true or false based on this test ?
You have to make use of getComputedStyle
See https://developer.mozilla.org/en-US/docs/Web/API/Window/getComputedStyle
something like:
window.getComputedStyle(node, null).getPropertyValue('font-family');
It will return a string (like "Arial, "Helvetica Neue", Helvetica, sans-serif") in which you can search for the font
Given your exemple, I think you can do:
// Suggested fix using the browser's computed styles.
// NOTE(review): window.getComputedStyle only exists in a browser (or
// jsdom); it is not available in a plain Node + cheerio environment, as
// the note below this snippet explains.
function searchForHelvetica($, word) {
var bodyText = $('*').each( function(i , e) { console.log(window.getComputedStyle($(e)[0], null).getPropertyValue('font-family')) } );
}
Note:
As it is clearly stated in cheerio README:
Cheerio parses markup and provides an API for traversing/manipulating the resulting data structure. It does not interpret the result as a web browser does. Specifically, it does not produce a visual rendering, apply CSS, load external resources, or execute JavaScript. If your use case requires any of this functionality, you should consider projects like PhantomJS or JSDom.
cheerio does not render/apply CSS.
So you should use jsdom (https://github.com/jsdom/jsdom) as it does support getComputedStyle.
I have a phantom js script that checks every redirection and shows it in the console by the page.onNavigationRequested callback method.
but when i want to catch all the URLs that returned from the page.onNavigationRequested callback method and pushed them to an array and finally show all the URLs at the end of the script, it only shows the first redirection URL.
can you please check the script and advice.
// PhantomJS script that tries to record every redirect of pageUrl.
var page = require('webpage').create();
var sys = require('system');
var fs = require('fs');
var response = {};
var arrayOfResponses = [];
var pageUrl = 'http://example.com/r1.php';
phantom.onError = function (msg, trace) {
phantom.exit(1);
};
function forceExit(){
phantom.exit(0);
}
// Record each navigation (redirect) target as it is requested.
page.onNavigationRequested = function(url, type, willNavigate, main) {
arrayOfResponses.push(url) ;
}
// NOTE(review): runs immediately, before any navigation has happened.
response.content = arrayOfResponses;
page.open(pageUrl, function(status) {
if ( status !== 'success' ) {
phantom.exit( 1 );
} else {
// NOTE(review): exiting as soon as the first page opens gives redirects
// no time to fire — this is why only the first URL is ever captured.
phantom.exit( 0 );
}
}, 100);
setTimeout(forceExit,2000);
// Also runs immediately at script start, before any URLs are collected.
console.log(JSON.stringify(response));
and thank you in advance.
There are two issues with your script:
You make PhantomJS exit too early, after the first url is opened. It doesn't have time to follow redirects.
You write the script from top to bottom as if the program flow is linear/synchronous, whereas in javascript it's not — onNavigationRequested can be called many times.
So with that in mind let's rewrite the script to collect all the redirects and exit if no new redirect is made for 2 seconds.
// Rewritten PhantomJS script: collect every redirect and exit only after
// the redirect chain has been quiet for two seconds.
var page = require('webpage').create();
var response = {};
var arrayOfResponses = [];
var pageUrl = 'http://admin.weeqo.com/redirect/r1.php';
// Handle to the pending shutdown timer; reset on every navigation.
var exitTimeout;
// This will be called if no redirects are requested in 2 seconds
function forceExit(){
// Just for fun we'll note the final URL
var curURL = page.evaluate(function(){
return document.location.href
});
console.log("Final URL is " + curURL);
// Prepare and output the report:
response.content = arrayOfResponses;
console.log("List of all requested URLs: " + JSON.stringify(response));
// Now we can exit safely
phantom.exit(0);
}
// This is called before each redirect
page.onNavigationRequested = function(url, type, willNavigate, main) {
// Clear timeout so that script is not shut down
// because we have a new redirect
if(exitTimeout) {
clearTimeout(exitTimeout);
}
arrayOfResponses.push(url);
console.log("Navigation requested: " + url);
// Create timeout that will shut down the script
// in two seconds unless cancelled
exitTimeout = setTimeout(forceExit, 2000);
}
// open the first page
page.open(pageUrl, function(status) {
// We only care for errors because
// who knows how many time will pass before
// we hit the last redirect
if ( status !== 'success' ) {
phantom.exit( 1 );
}
});
Can you store an array using browser.storage.local.set or achieve the same result with a different method?
Details:
My extension currently will redirect a website specified through the options.html form. Currently when you specify a new website the old one will be replaced. Is there a way I can append to an array of websites that will be redirected instead of replacing the website?
options.js: (will process information from form in options.html)
// Persist the URL typed into the options form.
// NOTE(review): storing a single string means each save replaces the
// previous site — the limitation discussed in the question.
function saveOptions(e) {
e.preventDefault();
browser.storage.local.set({
url: document.querySelector("#url").value
});
}
// Populate the form with the stored URL (default "reddit.com").
function restoreOptions() {
function setCurrentChoice(result) {
document.querySelector("#url").value = result.url || "reddit.com";
}
function onError(error) {
console.log(`Error: ${error}`);
}
var getting = browser.storage.local.get("url");
getting.then(setCurrentChoice, onError);
}
document.addEventListener("DOMContentLoaded", restoreOptions);
document.querySelector("form").addEventListener("submit", saveOptions);
redirect.js:
// Log storage-API failures.
function onError(error) {
console.log(`Error: ${error}`);
}
// Redirect the current tab if its hostname matches the single stored URL.
function onGot(item) {
var url = "reddit.com";
if (item.url) {
url = item.url;
}
var host = window.location.hostname;
if ((host == url) || (host == ("www." + url))) {
window.location = chrome.runtime.getURL("redirect/redirect.html");
}
}
var getting = browser.storage.local.get("url");
getting.then(onGot, onError);
One thought I had was to add a storage location per URL; however, the index would also have to be stored to prevent it getting reset each time options.js is loaded. (Something similar to the below code)
var i = 0;
browser.storage.local.set({
url[i]: document.querySelector("#url").value
});
i++;
A more logical solution would be for the url storage location to be an array.
If there is a way for url to be an array then redirect.html could contain the following:
if ( (url.includes (host) ) || (url.includes ("www." + host) ) ){
window.location = chrome.runtime.getURL("redirect.html");
}
Fresh eyes has solved my problem.
In options.js:
// Store the comma-separated form value as an array of URLs.
// NOTE(review): excerpt — the function's closing brace is omitted here.
function saveOptions(e) {
e.preventDefault();
var array = (document.querySelector("#url").value).split(",");
browser.storage.local.set({
url: array
});
In redirect.js:
// Redirect the current tab if its hostname appears in the stored URL array.
function onGot(item) {
  var url = item.url ? item.url : "";
  var host = window.location.hostname;
  var matches = url.includes(host) || url.includes("www." + host);
  if (matches) {
    window.location = chrome.runtime.getURL("redirect/redirect.html");
  }
}
I'm just finishing off this basic webscraper project for a tshirt website.
It enters through one hardcoded url, the home page. It will search for any product pages, and add them to an url. If it finds another link (remainder), it will scrape that again and find any more product pages. It adds the product pages to urlSet and will then scrape those again, grab the tshirt data (price, img, title) and then convert, then write them to a CSV file.
For some reason, this is not working on the second run through of the scrape with 'remainder'.
If I remove the second scrape of url, everything works out fine and the file gets written correctly. But if I want to get the other product pages, it seems to be failing somewhere.
Here is my code, i apologise for posting so much of it but I don't know how it will be understood properly without the right context, hopefully it's been commented okay:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
// Single non-product URL held back for a second scraping pass.
var remainder;
// Scraped shirt records destined for the CSV file.
var tshirtArray = [];
// Promise wrapper around request(); resolves with the body on HTTP 200.
// NOTE(review): on a non-200 status with no transport error neither
// resolve nor reject is called, so the promise never settles and the
// chain silently stalls.
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error)return reject(error);
if(!error && response.statusCode == 200){
return resolve(html);
}
});
});
}
// Go into webpage via url, load html and grab links shirt in url
// NOTE(review): links are built as `url + a`, so when this runs on the
// remainder page the href is appended to that page's full URL — producing
// the malformed "shirts.phpshirt.php?id=…" links shown in the output
// below (the bug the author eventually found).
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
links.push(url + a);
});
// return array of links
return links;
});
}
// Fetch every link in parallel; resolve with the page bodies and their URLs.
function nextStep(arrayOfLinks) {
  console.log(arrayOfLinks);
  var requests = arrayOfLinks.map(function (link) {
    return requestPromise(link);
  });
  //return both the html of pages and their urls
  return Promise.all(requests).then(function (arrayOfHtml) {
    return { arrayOfHtml: arrayOfHtml, arrayOfUrls: arrayOfLinks };
  });
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
// NOTE(review): only the FIRST non-product URL is kept — once remainder
// is set this branch never runs again, so later non-product pages are
// silently dropped (see the accepted answer below).
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
// Request the HTML of every collected product page in parallel.
function lastScraperPt1() {
  var promiseArray = [];
  urlSet.forEach(function (pageUrl) {
    promiseArray.push(requestPromise(pageUrl));
  });
  // Promise.all already resolves with the array of HTML bodies.
  return Promise.all(promiseArray);
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
tshirtObject.ImageURL = imgURL;
// NOTE(review): `url` here is the module-level entry URL, not the product
// page this record came from — every record gets the same URL.
tshirtObject.URL = url;
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
convertJson2Csv();
}
//convert tshirt objects and save as CSV file
function convertJson2Csv(){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
// NOTE(review): fs.writeFile has no `overwrite` option — the flag is
// ignored (writeFile overwrites by default anyway).
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) throw err;
});
}
// Crawl pipeline: entry page, then one extra pass over the remainder URL.
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
// NOTE(review): lastStep resolves with a single remainder URL, which is
// fed back into scrape here for the second pass.
.then(scrape) //scrape again but with remainder url
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
I'm console logging the arrayOfLinks in nextStep so I can see that they are being grabbed properly, I just cannot work out why they aren't being passed through to 'lastStep' properly.
Currently scraping http://shirts4mike.com/
[ 'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirt.php?id=108',
'http://shirts4mike.com/shirt.php?id=107',
'http://shirts4mike.com/shirt.php?id=106',
'http://shirts4mike.com/shirt.php?id=105' ]
The remainder is http://shirts4mike.com/shirts.php
http://shirts4mike.com/shirt.php?id=108
http://shirts4mike.com/shirt.php?id=107
http://shirts4mike.com/shirt.php?id=106
http://shirts4mike.com/shirt.php?id=105
Currently scraping http://shirts4mike.com/shirts.php
[ 'http://shirts4mike.com/shirts.phpshirts.php',
'http://shirts4mike.com/shirts.phpshirt.php?id=101',
'http://shirts4mike.com/shirts.phpshirt.php?id=102',
'http://shirts4mike.com/shirts.phpshirt.php?id=103',
'http://shirts4mike.com/shirts.phpshirt.php?id=104',
'http://shirts4mike.com/shirts.phpshirt.php?id=105',
'http://shirts4mike.com/shirts.phpshirt.php?id=106',
'http://shirts4mike.com/shirts.phpshirt.php?id=107',
'http://shirts4mike.com/shirts.phpshirt.php?id=108' ]
BUT if I choose to only call the first scrape and don't call the second, like this:
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
... Then everything works. I just don't get to all the urls.
What is happening here and how can I fix it? Thank you guys
The issue is that tshirtArray is not defined in convertJson2Csv(). In lastScraperPt2, pass tshirtArray to convertJson2Csv():
convertJson2Csv(tshirtArray)
at convertJson2Csv
function convertJson2Csv(tshirtArray) {
// do stuff
}
One problem seems to be in your lastStep. It looks like you mean for remainder to be another array of urls. Correct me if I'm wrong there. However, what's happening is that the first time the if($('[type=submit]').length !== 0) condition fails, you'll automatically go down to the next block, because remainder starts undefined. Whatever the current url is, you assign that one to remainder. For the rest of the iterations of your for-loop, you will never again hit the condition where remainder == undefined. So you will only ever end up with one url assigned to remainder, while any more that you were hoping to get will simply be passed over.
You might want to define remainder as remainder = [];. And then instead of saying else if (remainder == undefined), you would just say
} else {
remainder.push(obj.arrayOfUrls[i]);
}
However, then you're passing an array of urls to scrape when scrape is only expecting a single url. If this is what you want, and I am right in assuming that you mean for remainder to be an array of urls, you could define a new function as follows:
// Fetch every remainder URL in parallel and resolve with a flat array of results.
// Fixes: the loop iterated the undeclared `remainder` instead of the
// `remainders` parameter, and the final .then did not return the flattened
// array, so the promise resolved with undefined.
function scrapeRemainders(remainders) {
  var promises = [];
  remainders.forEach(function (url) {
    promises.push(requestPromise(url));
  });
  return Promise.all(promises).then(function (results) {
    return _.flattenDeep(results);
  });
}
Then instead of the second scrape in your promise chain, you would replace it with scrapeRemainders. Also, for the _ in the previous function, you would need to npm install lodash and then var _ = require('lodash'). On a side note, lodash has nothing to do with promises, but it is a great tool for data manipulation. You should look into it when you have the chance.
Also, in lastScraperPt1, you can change
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
to
return Promise.all(promiseArray);
It does the same thing.
Hope this helps. If this does not answer your question, comment at me and I can change my answer accordingly.
All fixed, it was grabbing the wrong urls in scrape(). Though I only knew this after I logged the statusCodes to the console :
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var urlHome = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = [];
var tshirtArray = [];
// Promise wrapper around request(); resolves with the body on HTTP 200,
// logs other status codes and resolves with an empty string so the
// pipeline keeps moving (the original version never settled on non-200).
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error) {
errorHandler(error);
return reject(error);
}
if(!error && response.statusCode == 200){
return resolve(html);
}
if(response.statusCode !== 200){
console.log("response code is " + response.statusCode);
}
return resolve("");
});
});
}
// Fetch a page and return absolute links for every shirt-related anchor on
// it, always prefixing the fixed site root (the fix for the original
// URL-concatenation bug).
function scrape(url) {
  console.log("Currently scraping " + url)
  return requestPromise(url).then(function (html) {
    var base = 'http://shirts4mike.com/';
    var $ = cheerio.load(html);
    var links = [];
    $('a[href*=shirt]').each(function () {
      links.push(base + $(this).attr('href'));
    });
    return links;
  });
}
// Download all pages, then hand back both the HTML bodies and the source URLs.
function nextStep(arrayOfLinks) {
  var promiseArray = [];
  console.log(arrayOfLinks);
  arrayOfLinks.forEach(function (link) {
    promiseArray.push(requestPromise(link));
  });
  return Promise.all(promiseArray).then(function (arrayOfHtml) {
    return { arrayOfHtml: arrayOfHtml, arrayOfUrls: arrayOfLinks };
  });
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.push(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
// NOTE(review): `remainder` is declared with var inside the loop (hoisted
// to the function scope), so as before only the first non-product URL is
// ever kept per call.
var remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
// Request the HTML of every collected product-page URL.
function lastScraperPt1() {
  var requests = Array.from(urlSet, function (pageUrl) {
    return requestPromise(pageUrl);
  });
  return Promise.all(requests).then(function (arrayOfHtml) {
    return arrayOfHtml;
  });
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
// Image paths are relative, so prefix the site root.
tshirtObject.ImageURL = urlHome + imgURL;
// Each record now carries its own product-page URL (urlSet is parallel to html).
tshirtObject.URL = urlSet[i];
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
// Returned so the promise chain can pass the data on to convertJson2Csv.
return tshirtArray;
}
//convert tshirt objects and save as CSV file
function convertJson2Csv(tshirtArray){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
// NOTE(review): fs.writeFile has no `overwrite` option; it overwrites by default.
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) errorHandler(err);
});
}
// Full pipeline: entry page, remainder pass, then scrape products and write CSV.
scrape(urlHome) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape)
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.then(convertJson2Csv)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
//If the site is down, an error message describing the issue should appear in the console.
//This is to be tested by disabling wifi on your device.
//When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error
// Fix: the console message referenced an undefined global `url` (this
// script only declares `urlHome`), which would throw a ReferenceError
// inside the error handler; the doubled "not" was also removed.
var errorHandler = function (error) {
  console.log(error.message);
  console.log('The scraper could not scrape data from ' + urlHome + ' there is either a problem with your internet connection or the site may be down');
  /**
   * create new date for log file
   */
  var loggerDate = new Date();
  /**
   * create message as a variable
   */
  var errLog = '[' + loggerDate + '] ' + error.message + '\n';
  /**
   * when the error occurs, log that to the error logger file
   */
  fs.appendFile('scraper-error.log', errLog, function (err) {
    if (err) throw err;
    console.log('There was an error. The error was logged to scraper-error.log');
  });
};