Empty array when trying to scrape YouTube main page with Node.js - javascript

So I'm playing around with the request and cheerio npm packages, and I can't figure out why I keep getting empty arrays. I used the same code when I scraped Reddit and it worked like a charm, but when I use it on YouTube or any other page it doesn't work.
var request = require('request'),
    cheerio = require('cheerio'),
    fs = require('fs'),
    urls = [];

request('https://www.youtube.com/', function(err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        $('a.yt-simple-endpoint style-scope ytd-grid-video-renderer', 'primary').each(function() {
            var url = $(this);
            urls.push(url);
        });
    }
});
And this is my Reddit code (works fine):
var request = require('request'),
    cheerio = require('cheerio'),
    fs = require('fs'),
    urls = [];

request('http://www.reddit.com/', function(err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        $('a.title', '#siteTable').each(function() {
            var url = $(this).attr('href');
            if (url.indexOf('imgur.com') != -1) {
                urls.push(url);
            }
        });
    }
});
Output Example:

[ 'http://i.imgur.com/WVrmZ9j.gifv',
  'http://i.imgur.com/T0BchYC.gifv',
  'http://imgur.com/u59lzux' ]

The HTML that cheerio loads for YouTube is different from what you see in the browser. Dump $.html() (for example with res.send($.html()); in an Express route) to check the HTML structure you actually received and target it accordingly.
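For instance, a minimal way to see what request actually fetched (writing the raw body to a file is my own suggestion, not part of the original answer):

var request = require('request'),
    fs = require('fs');

request('https://www.youtube.com/', function(err, resp, body) {
    if (!err && resp.statusCode == 200) {
        // Write the raw HTML to disk so you can inspect which selectors
        // actually exist in the server-rendered markup.
        fs.writeFileSync('youtube.html', body);
    }
});

Opening that file will likely show that the server-rendered markup carries few of the class names visible in the browser's element inspector.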

If you need to get information from YouTube, you can't use Cheerio alone. Instead, you need browser automation, e.g. Puppeteer, because YouTube loads content on the page dynamically via JavaScript. The code below shows how you can do this:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const mainPageUrl = "https://www.youtube.com";

async function getUrls() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  const page = await browser.newPage();
  await page.setDefaultNavigationTimeout(60000);

  await page.goto(mainPageUrl);
  await page.waitForSelector("#contents");

  // Collect the href of every video thumbnail link and prepend the domain.
  const urls = await page.$$eval("a#thumbnail", (els) => {
    return els
      .map((el) => (el.getAttribute("href") ? "https://www.youtube.com" + el.getAttribute("href") : undefined))
      .filter((el) => el);
  });

  await browser.close();

  return urls;
}

getUrls().then(console.log);
Output

[
  "https://www.youtube.com/watch?v=0rliFQ0qyAM",
  "https://www.youtube.com/watch?v=36YnV9STBqc",
  "https://www.youtube.com/watch?v=w1_hYe3hhjE",
  "https://www.youtube.com/watch?v=Uv6iK2kS3qQ",
  "https://www.youtube.com/watch?v=hUGWLAFqYUc",
  "https://www.youtube.com/watch?v=17TYygDfr28",
  "https://www.youtube.com/watch?v=2isYuQZMbdU",
  ...and other results
]
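A note on the launch options (my reading, not spelled out in the answer): the stealth plugin and headless: false both reduce the chance that YouTube serves a bot-check page instead of its normal feed. If the #contents selector never appears, bot detection is the first thing to suspect.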
You can read more about scraping YouTube from my blog posts:
Web scraping YouTube search video results with Nodejs
Web scraping YouTube secondary search results with Nodejs
Web scraping YouTube video page with Nodejs

Related

How do I download a file in a foreach loop using Node.js via Request or HTTPS?

I have an array of strings that are links to a URL that I would like to download content from. The downloadLinks array looks like:
['stackoverFlow.com/File0', 'stackoverFlow.com/File1', 'stackoverFlow.com/File2'].
Here is the code that I am using to try and download these files. Example code for both using Request and HTTPS.get
var fs = require('fs');
var sleep = require('sleep');
var https = require('https');
var request = require('request');

// Using Request:
var uniqueDownloadLinks = [...new Set(downloadLinks)];
for (downloadLink in uniqueDownloadLinks) {
    const download = (url, callback) => {
        request.head(url, (err, response, body) => {
            var dest = 'files/' + response.headers['content-disposition'].toLowerCase().split('filename=')[1].split(';')[0].replace(/"/g, '');
            var file = fs.createWriteStream(dest);
            request(url)
                .pipe(file)
                .on('close', callback);
        });
    };
    download(uniqueDownloadLinks[downloadLink], () => {
        console.log('fileDownloaded');
    });
    sleep.sleep(5);
}
// Using HTTPS.get
var uniqueDownloadLinks = [...new Set(downloadLinks)];
for (downloadLink in uniqueDownloadLinks) {
    var download = function(url) {
        var request = https.get(url, function(response) {
            var dest = 'files/' + response.headers['content-disposition'].toLowerCase().split('filename=')[1].split(';')[0].replace(/"/g, '');
            var file = fs.createWriteStream(dest);
            response.pipe(file);
            file.on('finish', function() {
                file.close();
            });
        });
    };
    download(downloadLink);
    sleep.sleep(5);
}
I have tested this code outside of the foreach loops in a test.js file, including only the code to download a single file, and it works: the files are written and everything is happy. When I try to download using the foreach loops, nothing is downloaded. I suspect this has to do with downloading inside a foreach loop, but it is difficult to debug because I never enter the request callback (for example, trying to use console.log(dest) to output the destination + filename from the content-disposition header produces nothing).
What should I do to be able to iterate over this array of links, and download each file?
Thanks to the comment made by Estus Flask, I looked into the problem with using sleep and determined that he was correct in his assessment that "sleep is awful".
The fix involves making the download function async and using a promise to handle the 5-second sleep.
async function sleep(milliseconds) {
    return new Promise(resolve => setTimeout(resolve, milliseconds));
}

// Inside an async function:
await sleep(5000);
const $ = await download(uniqueDownloadLinks[downloadLink], () => {
    console.log('fileDownloaded');
});
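Putting it together, the fixed loop might look something like the sketch below. This is my own assembly, not the original answer's code: it assumes downloadLinks is defined elsewhere and wraps the Request-based download in a promise so it can be awaited.

const fs = require('fs');
const request = require('request');

function sleep(milliseconds) {
    return new Promise(resolve => setTimeout(resolve, milliseconds));
}

// Promise wrapper around the callback-based download so the loop can await it.
function download(url) {
    return new Promise((resolve, reject) => {
        request.head(url, (err, response) => {
            if (err) return reject(err);
            // Derive the destination file name from the content-disposition
            // header, exactly as the question's code does.
            var dest = 'files/' + response.headers['content-disposition']
                .toLowerCase().split('filename=')[1].split(';')[0].replace(/"/g, '');
            request(url)
                .pipe(fs.createWriteStream(dest))
                .on('close', resolve)
                .on('error', reject);
        });
    });
}

async function downloadAll(downloadLinks) {
    const uniqueDownloadLinks = [...new Set(downloadLinks)];
    for (const link of uniqueDownloadLinks) {
        await download(link);
        console.log('fileDownloaded');
        await sleep(5000); // non-blocking 5-second pause between downloads
    }
}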

How to save the downloadable file link in zombie.js

I am scraping a website using Node.js and zombie.js. I am facing an issue where a page has an anchor that holds the link to download a PDF file.
If I click it using the browser.clickLink() function, the result that I get in the console is beyond my understanding. Is there a way to save this PDF file and get its link, like in PHP? I want to save it for further processing. Here is my test JS code:
var http = require('http');
var browser = require('zombie');
var assert = require('assert');

const hostname = '127.0.0.1';
const port = 3000;

const server = http.createServer((req, res) => {
    res.statusCode = 200;
    //res.setHeader('Content-Type', 'text/plain');
    //res.end('Hello World\n');
});

server.listen(port, hostname, () => {
    console.log(`Server running at http://${hostname}:${port}/`);
});

var url = 'http://localhost/Node/HM_LandRegistry/downloadPdf.html';

browser.visit(url, function(error, browser) {
    //browser.dump();
    //console.log('browser.text (".link")', browser.text(".link"));
    browser.clickLink("a.link");
    browser.wait().then(function() {
        console.log(browser.text());
        browser.dump();
    });
});
Here is something I found on Google Groups. It solved my problem.
function getLinks(browser) {
    var links = browser.querySelectorAll('.link');
    return Array.prototype.map.call(links, function(e) {
        return e.getAttribute('href'); // returns an array; use .toString() to get a string only
    });
}
Save the links this function returns for further processing.
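From there, a minimal sketch of actually saving one of those PDFs (my own addition; it assumes the extracted href is an absolute https URL):

const fs = require('fs');
const https = require('https');

// Stream the PDF at pdfUrl straight to dest on disk.
function savePdf(pdfUrl, dest) {
    https.get(pdfUrl, (response) => {
        response.pipe(fs.createWriteStream(dest))
            .on('finish', () => console.log('saved', dest));
    });
}

// Usage with the links extracted by getLinks():
// getLinks(browser).forEach((href, i) => savePdf(href, 'file' + i + '.pdf'));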

x-ray: Read html from a file rather than a URL

Code
const Xray = require('x-ray');
const xray = Xray();

// How do I read a file, rather than a URL?
const url = 'https://www.tel-o-fun.ga/';

xray(url, '.marker')((err, value) => {
    console.log(value);
});
My goal
I am using x-ray to scrape some data from a website. For testing and development purposes, I would like to parse data from a local file rather than a remote resource.
How do I load a local file into x-ray, instead of pointing it to a remote URL?
This example from the x-ray repo solved my problem. Simply pass an HTML string instead of a URL:
const path = require('path');
const Xray = require('x-ray');
const read = require('fs').readFileSync;

const html = read(path.resolve(__dirname, 'index.html'));

const xray = Xray();
xray(html, '.marker')((err, value) => {
    console.log(value);
});
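The same trick works with any HTML string, which makes quick tests easy. A tiny example of my own, not from the x-ray repo:

const Xray = require('x-ray');
const xray = Xray();

// Pass markup directly instead of reading it from disk.
xray('<div class="marker">52.52, 13.40</div>', '.marker')((err, value) => {
    console.log(value); // '52.52, 13.40'
});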

Issue when doing web scraper

I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function(req, res) {
    url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
    request(url, function(error, response, body) {
        if (!error) {
            var $ = cheerio.load(body);
            var arr = [];
            var title = $('.mp-user-rating tr').each(function() {
                var tableData = $('.marketplace-name > .mp-rating-popup');
                arr.push({ 'title': tableData.text() });
            });
        }
        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
The data is in the third column, and I am not able to get the .mp-user-rating tr data as expected.
An attached image shows the structure of the table.
Any help would be appreciated.
So, I went to the page and ran this in the console.
var arr = [];
var title = jQuery('.mp-user-rating tr').each(function(i, element) {
    var tableData = jQuery(element).find('.mp-rating-popup');
    arr.push({ 'title': tableData.text() });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the HTML using your code. I think the issue is that the HTML is loaded asynchronously by the website; as a result, pulling the HTML will only retrieve the static markup. You will need to use PhantomJS or Chrome's headless browser to load the website and let the asynchronous content load, then you can grab the HTML.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs
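For example, here is a sketch of the same scrape with Puppeteer instead of PhantomJS. The selectors are taken from the question and may no longer match the live site:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html', {
        waitUntil: 'networkidle2', // let the asynchronously loaded table settle
    });

    // Same extraction as the console snippet above, run in the page context.
    const titles = await page.$$eval('.mp-user-rating tr .mp-rating-popup', (els) =>
        els.map((el) => el.textContent.trim())
    );

    console.log(titles);
    await browser.close();
})();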

Generate PDF using EVOPdf, WebAPI and AngularJS

I'm having an issue rendering a PDF using EVOPdf from a WebAPI controller to an AngularJS app.
This is my code so far:
Angular call:
var url = 'api/form/build/' + id;

$http.get(url, null, { responseType: 'arraybuffer' })
    .success(function (data) {
        var file = new Blob([data], { type: 'application/pdf' });
        if (window.navigator && window.navigator.msSaveOrOpenBlob) {
            window.navigator.msSaveOrOpenBlob(file);
        }
        else {
            var objectUrl = URL.createObjectURL(file);
            window.open(objectUrl);
        }
    });
APIController method:
var url = "http://localhost/index.html#/form/build/" + id;
#region PDF Document Setup
HtmlToPdfConverter htmlToPdfConverter = new HtmlToPdfConverter();
htmlToPdfConverter.LicenseKey = "4W9+bn19bn5ue2B+bn1/YH98YHd3d3c=";
//htmlToPdfConverter.HtmlViewerWidth = 1024; //default
htmlToPdfConverter.PdfDocumentOptions.PdfPageSize = PdfPageSize.A4;
htmlToPdfConverter.PdfDocumentOptions.PdfPageOrientation = PdfPageOrientation.Portrait;
htmlToPdfConverter.ConversionDelay = 3;
htmlToPdfConverter.MediaType = "print";
htmlToPdfConverter.PdfDocumentOptions.LeftMargin = 10;
htmlToPdfConverter.PdfDocumentOptions.RightMargin = 10;
htmlToPdfConverter.PdfDocumentOptions.TopMargin = 10;
htmlToPdfConverter.PdfDocumentOptions.BottomMargin = 10;
htmlToPdfConverter.PdfDocumentOptions.TopSpacing = 10;
htmlToPdfConverter.PdfDocumentOptions.BottomSpacing = 10;
htmlToPdfConverter.PdfDocumentOptions.ColorSpace = ColorSpace.RGB;
// Set HTML content destination in PDF page
htmlToPdfConverter.PdfDocumentOptions.Width = 640;
htmlToPdfConverter.PdfDocumentOptions.FitWidth = true;
htmlToPdfConverter.PdfDocumentOptions.StretchToFit = true;
#endregion
byte[] outPdfBuffer = htmlToPdfConverter.ConvertUrl(url);
string outPdfFile = #"c:\temp\forms\" + id + ".pdf";
System.IO.File.WriteAllBytes(outPdfFile, outPdfBuffer);
HttpResponseMessage result = null;
result = Request.CreateResponse(HttpStatusCode.OK);
result.Content = new ByteArrayContent(outPdfBuffer.ToArray());
result.Content.Headers.ContentDisposition = new ContentDispositionHeaderValue("attachment");
result.Content.Headers.ContentDisposition.FileName = "filename.pdf";
result.Content.Headers.ContentType = new MediaTypeHeaderValue("application/pdf");
return result;
When I check the PDF that I write out using WriteAllBytes, it renders perfectly, but when it is returned via the Angular call and opened in Adobe Reader, I get an "Invalid Color Space" error message that pops up quite a few times, and the document is not opened. When I change the color space to GrayScale, the PDF opens but it's blank.
I have a feeling that it's the ByteArrayContent conversion that's causing the issue, seeing as that's the only thing that happens between the actual creation of the PDF and sending it back to the Angular call, but I've hit a brick wall and can't figure out what the problem is.
I'd really appreciate any help you guys can offer because I'm so close to sorting this out and I just need the document to "convert" properly when returned from the call.
Thanks in advance for any help.
Regards,
Johann.
The problem seems to lie on the client side; the characters are not properly parsed in the response. For anyone struggling with this, I found my solution here: SO Question
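In this particular snippet there is also an argument-position bug worth checking: AngularJS's $http.get takes (url, config), not (url, data, config), so passing the config as the third argument means responseType: 'arraybuffer' is silently ignored and the binary body gets mangled as text. A corrected call might look like this (my reading of the $http API, not code from the linked answer):

$http.get('api/form/build/' + id, { responseType: 'arraybuffer' })
    .then(function (response) {
        // response.data is now a real ArrayBuffer, safe to wrap in a Blob.
        var file = new Blob([response.data], { type: 'application/pdf' });
        var objectUrl = URL.createObjectURL(file);
        window.open(objectUrl);
    });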
Have you tried Headless Chrome? Here is a nice article about this topic. I was using https://github.com/puppeteer/puppeteer for this purpose and it was an easily integrated solution.
Install the puppeteer npm package from the command line (the code below requires the full puppeteer package, which bundles Chromium, rather than puppeteer-core):
npm i puppeteer
# or "yarn add puppeteer"
Save file as example.js
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');
    await page.screenshot({ path: 'example.png' });
    await browser.close();
})();
Execute script on the command line
node example.js
Puppeteer sets an initial page size to 800×600px, which defines the screenshot size. The page size can be customized with Page.setViewport().
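For instance (a two-line addition of my own, placed before the screenshot call):

// Render at a larger viewport before taking the screenshot.
await page.setViewport({ width: 1280, height: 800 });
await page.screenshot({ path: 'example.png' });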
Example - create a PDF.
Save file as hn.js
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://news.ycombinator.com', {
        waitUntil: 'networkidle2',
    });
    await page.pdf({ path: 'hn.pdf', format: 'a4' });
    await browser.close();
})();
Execute script on the command line
node hn.js
See Page.pdf() for more information about creating pdfs.
Example - evaluate script in the context of the page
Save file as get-dimensions.js
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Get the "viewport" of the page, as reported by the page.
    const dimensions = await page.evaluate(() => {
        return {
            width: document.documentElement.clientWidth,
            height: document.documentElement.clientHeight,
            deviceScaleFactor: window.devicePixelRatio,
        };
    });

    console.log('Dimensions:', dimensions);
    await browser.close();
})();
Execute script on the command line
node get-dimensions.js
See Page.evaluate() for more information on evaluate and related methods like evaluateOnNewDocument and exposeFunction.
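As a small illustration of exposeFunction (my own sketch, not from the linked docs): it registers a Node-side function that page code can call as a window property.

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Make a Node-side helper callable from the page as window.readableBytes.
    await page.exposeFunction('readableBytes', (bytes) =>
        (bytes / 1024).toFixed(1) + ' KiB'
    );

    await page.goto('https://example.com');

    // Exposed functions return promises inside the page, hence the awaits.
    const size = await page.evaluate(async () =>
        await window.readableBytes(document.documentElement.outerHTML.length)
    );
    console.log('Approximate page size:', size);

    await browser.close();
})();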
