get rendered text of html page on express server [duplicate] - javascript

I am trying to scrape a website, but I don't get some of the elements because they are created dynamically.
I use cheerio in Node.js, and my code is below.
var request = require('request');
var cheerio = require('cheerio');

var url = "http://www.bdtong.co.kr/index.php?c_category=C02";

request(url, function (err, res, html) {
  var $ = cheerio.load(html);
  $('.listMain > li').each(function () {
    console.log($(this).find('a').attr('href'));
  });
});
This code returns an empty result, because when the page is first loaded the <ul id="store_list" class="listMain"> element is empty; its content has not been appended yet.
How can I get these elements using Node.js? How can I scrape pages with dynamic content?

Here you go:
var phantom = require('phantom');

phantom.create(function (ph) {
  ph.createPage(function (page) {
    var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
    page.open(url, function() {
      page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
        page.evaluate(function() {
          $('.listMain > li').each(function () {
            console.log($(this).find('a').attr('href'));
          });
        }, function(){
          ph.exit();
        });
      });
    });
  });
});

Check out GoogleChrome/puppeteer, the headless Chrome Node API.
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains):
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.npmjs.com/');

  const textContent = await page.evaluate(() => {
    return document.querySelector('.npm-expansions').textContent;
  });

  console.log(textContent); /* No Problem Mate */

  browser.close();
})();
evaluate allows you to inspect dynamic elements, since the function you pass to it runs inside the page itself.
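If the selector or other data lives in your Node code rather than in the page, it can be passed as an argument to evaluate. A minimal sketch, reusing the .npm-expansions selector from above and adding a guard for a missing element:

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.npmjs.com/');

  // The selector is defined in Node and passed into the page context.
  const selector = '.npm-expansions';
  const text = await page.evaluate(sel => {
    const el = document.querySelector(sel);
    return el ? el.textContent : null; // avoid throwing if the element is absent
  }, selector);

  console.log(text);
  await browser.close();
})();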

Use the npm module x-ray with its pluggable web driver x-ray-phantom.
There are examples on the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');

var x = Xray()
  .driver(phantom());

x('http://google.com', 'title')(function (err, str) {
  if (err) return console.error(err);
  console.log(str); // "Google"
});

Answering this as a canonical: an alternative to Puppeteer for scraping dynamic sites, also well-supported as of 2023, is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1

let browser;
(async () => {
  browser = await playwright.chromium.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com");
  const text = await page.locator('h1:text("Example")').textContent();
  console.log(text); // => Example Domain
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

The easiest and most reliable solution is to use Puppeteer, as described in https://pusher.com/tutorials/web-scraper-node, which covers both static and dynamic scraping.
The only change I made was to raise the timeout from 300000 to 3000000 in Browser.js, TimeoutSettings.js and Launcher.js.
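If you would rather not edit Puppeteer's own files, roughly the same effect can usually be had through its public timeout options. A sketch (the 3000000 ms value simply mirrors the suggestion above):

const puppeteer = require('puppeteer');

(async () => {
  // Launch timeout set via the public API instead of patching Launcher.js
  const browser = await puppeteer.launch({ timeout: 3000000 });
  const page = await browser.newPage();

  // Raise the defaults used by navigation and waiting helpers
  page.setDefaultNavigationTimeout(3000000);
  page.setDefaultTimeout(3000000);

  await page.goto('https://example.com', { timeout: 3000000 });
  await browser.close();
})();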

Related

function in page.evaluate() is not working/being executed - Puppeteer

My code below tries to collect a bunch of hyperlinks that fall under the class name ".jss2". However, I don't think the function inside my page.evaluate() is working: when I run the code, the link_list const doesn't get displayed.
I ran the document.querySelectorAll call in the Chrome console and it worked perfectly fine, so I'm really having a hard time with this.
async function testing() {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.setViewport({width: 1200, height: 800});
  await page.goto(url);

  const link_list = await this.page.evaluate(() => {
    let elements = Array.from(document.querySelectorAll(".jss2"));
    let links = elements.map(element => {
      return element.href;
    });
    return (links);
  });

  console.log(link_list);
}
const link_list = await page.$$eval('.classname', links => links.map(link => link.href));
Found the answer here: PUPPETEER - unable to extract elements on certain websites using page.evaluate(() => document.querySelectorAll())
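For context, a minimal self-contained version of that $$eval approach might look like the sketch below. The ".jss2" selector comes from the question; that those elements expose an href is an assumption.

const puppeteer = require('puppeteer');

async function collectLinks(url) {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.setViewport({ width: 1200, height: 800 });
  await page.goto(url);

  // $$eval runs the callback in the page over every element matching the selector
  const linkList = await page.$$eval('.jss2', links => links.map(link => link.href));

  await browser.close();
  return linkList;
}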

Node.js pagination, recursive promise problem

I am scraping multiple pages with cheerio and axios in Node.js.
I am having a hard time with promises; can someone help me return the JSON once I hit the last page? Thanks!
const getWebsiteContent = async (url) => {
  await axios.get(url).then(res => {
    const $ = cheerio.load(res.data);
    pageNum = getTotalpages($); // Get the pagination
    console.log(url);
    // Some scraping here
  });
  indexPage++; // Increment to the next page
  const nextPageLink = baseUrl + '&page=' + indexPage; // get next page
  if (indexPage > pageNum) {
    var editedText = text.slice(0, text.length - 1);
    editedText += ']}';
    editedText = JSON.parse(editedText); // I want to return this and use elsewhere
    return editedText;
  }
  setTimeout(async () => {
    getWebsiteContent(nextPageLink); // Call itself
  }, 1000);
}
var myJSON = await getWebsiteContent(baseUrl); // something like this
I would write getPages as an async generator -
async function* getPages (href, initPage = 0) {
  const res = await axios.get(setPage(href, initPage))
  const $ = cheerio.load(res.data)
  const pages = getTotalpages($)
  yield { page: initPage, dom: $ }
  for (let p = initPage + 1; p < pages; p++) { // start after the page already yielded
    await sleep(1000)
    const r = await axios.get(setPage(href, p))
    yield { page: p, dom: cheerio.load(r.data) }
  }
}
This depends on a helper, setPage, that manipulates the href's page number using the URL class, which is much safer than cobbling strings together by hand -
function setPage (href, page) {
  const u = new URL(href)
  u.searchParams.set("page", page)
  return u.toString()
}
And another helper, sleep, which keeps setTimeout from being mixed directly into async-based code. This allows us to easily pause between pages -
async function sleep (ms) {
  return new Promise(r => setTimeout(r, ms))
}
Finally we write scrape, which is a simple wrapper around getPages. This allows us to reuse the getPages function to scrape various elements as needed. A benefit of this approach is that the caller decides what happens with each page. Below we push to a result array, but as another example we could write each page to disk using the fs module. Obviously this is for you to decide -
async function scrape (href) {
  const result = []
  for await (const {page, dom} of getPages(href)) {
    console.log("scraped page", page) // some status message
    result.push(getSomeData(dom))     // get something from each page
  }
  return result
}
scrape(myUrl).then(console.log, console.error)
You shouldn't be mixing then with your async/await code.
Pagination should look something like this:
let response = await axios.get(url)
let $ = cheerio.load(response.data)
// do some scraping

while (url = $('[rel=next]').attr('href')) {
  response = await axios.get(url)
  $ = cheerio.load(response.data)
  // do more scraping
}
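A self-contained version of that loop which also collects data from each page and returns it at the end might look like the sketch below. Here extractItems is a hypothetical placeholder for whatever the question scrapes from a page, and a [rel=next] link is assumed to exist on every page except the last.

const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeAll(startUrl, extractItems) {
  const results = [];
  let url = startUrl;

  while (url) {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    results.push(...extractItems($));    // gather this page's data
    url = $('[rel=next]').attr('href');  // undefined on the last page, ending the loop
  }

  return results; // a plain array, ready to use or JSON.stringify
}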

Why am I getting Promise { Pending }?

I'm working on a web scraper in JavaScript using Puppeteer, and whenever I try to log the text content of an element it prints "Promise { Pending }". I've looked at other answers and none of them worked.
const element = await page.$("#ctl00_ContentPlaceHolder1_NameLinkButton");
const text = await page.evaluate(element => element.textContent, element);
console.log(text);
Your answer is correct, but I think you forgot to add await before page.evaluate().
There are three ways to do that.
First way: just like what you did, but I don't prefer it, because you don't need to call page.evaluate() to get .textContent.
const puppeteer = require('puppeteer');

puppeteer.launch().then(async browser => {
  const elementId = 'container';
  const page = await browser.newPage();
  await page.goto('https://metwally.me');
  const element = await page.$(`#${elementId}`);
  if (element) {
    const text = await page.evaluate(element => element.textContent, element);
    console.log(text);
  } else {
    // handle the case where the id doesn't exist
    console.log('Not Found');
  }
});
Second way: call page.evaluate() and use the JavaScript DOM to get textContent, e.g. document.getElementById(elementId).textContent.
const puppeteer = require('puppeteer');

puppeteer.launch().then(async browser => {
  const elementId = 'container';
  const page = await browser.newPage();
  await page.goto('https://metwally.me');
  const text = await page.evaluate(
    elementId => {
      const element = document.getElementById(elementId);
      return element ? element.textContent : null;
    }, elementId);
  if (text !== null) {
    console.log(text);
  } else {
    // handle the case where the id doesn't exist
    console.log('Not Found');
  }
});
Third way: select the element with a Puppeteer selector, get its textContent property handle using await element.getProperty('textContent'), then read the value from textContent._remoteObject.value.
const puppeteer = require('puppeteer');

puppeteer.launch().then(async browser => {
  const elementId = 'container';
  const page = await browser.newPage();
  await page.goto('https://metwally.me');
  const element = await page.$(`#${elementId}`);
  if (element) {
    const textContent = await element.getProperty('textContent');
    const text = textContent._remoteObject.value;
    console.log(text);
  } else {
    // handle the case where the id doesn't exist
    console.log('Not Found');
  }
});
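As a side note, _remoteObject is an internal field; the handle's public jsonValue() method returns the same value and should work here as well:

// Public alternative to reading textContent._remoteObject.value
const textContentHandle = await element.getProperty('textContent');
const text = await textContentHandle.jsonValue();
console.log(text);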
NOTE: All these examples ran successfully on my machine.
OS: Ubuntu 20.04
Node.js: v10.19.0
Puppeteer: v1.19.0
References
Puppeteer page.$
Document.getElementById()
Node.textContent

How to get input elements with Puppeteer when the page loads all elements inside a frameset tag

I am trying to get all the input elements on this website:
http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM
Here is my code:
const puppeteer = require("puppeteer");

function run() {
  return new Promise(async (resolve, reject) => {
    try {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.goto(
        "http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM"
      );

      let urls = await page.evaluate(() => {
        let results = [];
        let items = document.querySelectorAll("input").length;
        return items;
      });

      browser.close();
      return resolve(urls);
    } catch (e) {
      return reject(e);
    }
  });
}

run().then(console.log).catch(console.error);
Right now my output is 0, but when I run document.querySelectorAll("input").length in the console it gives me 8.
It seems like everything is loaded inside the frameset tag, which might be the issue. Does anyone have any idea how to solve this?
You have to get the frame element; from there you can get the frame itself, so you can call evaluate inside that frame:
const elementHandle = await page.$('frame[name=SWContent]');
const frame = await elementHandle.contentFrame();

let urls = await frame.evaluate(() => {
  let results = [];
  let items = document.querySelectorAll("input").length;
  return items;
});
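If the frame name is known, page.frames() offers another way to reach it without querying for the element first. A sketch, reusing the SWContent name from the answer above:

// Alternative: look the frame up by name in the page's frame tree
const swFrame = page.frames().find(f => f.name() === 'SWContent');

if (swFrame) {
  const count = await swFrame.evaluate(() => document.querySelectorAll('input').length);
  console.log(count);
}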

Traversing HTML with Cheerio. Attempting to grab img src nested inside <a> tag [duplicate]

