Open a link in a new tab, scrape, go to previous page - javascript

I'm using puppeteer for the following:
I switched await link.click(".ExCategory-results > .ExResult-row:nth-child(${i}) > .ExResult-cell > .ExHeading > a",); for await new.page('...') but it says that it can't find the a.
This is the page that I'm scraping but notice the Load More button at the bottom of the page.
https://www.bodybuilding.com/exercises/finder
To prevent resetting the Load more button I want to open each new in a new tab, scrape, close tab and go to the next name.
How can I open each link in a new tab, close, and go to the previous tab?
My code:
var buttonExists = true;
let allData = [];
while (buttonExists == true) {
// const loadMore = true;
const rowsCounts = await page.$$eval(
'.ExCategory-results > .ExResult-row',
(rows) => rows.length
);
console.log(`row counts = ${rowsCounts}`);
for (let i = 2; i < rowsCounts + 1; i++) {
const exerciseName = await page.$eval(
`.ExCategory-results > .ExResult-row:nth-child(${i}) > .ExResult-cell > .ExHeading > a`,
(el) => el.innerText
);
console.log(`Exercise = ${exerciseName}`);
await link.click(`.ExCategory-results > .ExResult-row:nth-child(${i}) > .ExResult-cell > .ExHeading > a`,);
await page.waitForSelector('#js-ex-content');
... fancy code here
await page.goBack();
let obj = {
exercise: exerciseName,
};
allData.push(obj);
}
// clicking load more button and waiting 1sec
try {
await page.click(LoadMoreButton);
}
catch (err) {
buttonExists = false;
}
await page.waitForTimeout(1000);
}

This selector: .ExCategory-results > .ExResult-row:nth-child(${i}) > .ExResult-cell > .ExHeading > a is unnecessarily long and it gives you not completely correct results.
To get to these elements:
this selector should be enough: .ExResult-row .ExHeading > a.
Then you asked:
I want to open each new in a new tab, scrape, close tab and go to the next name.
and
How can I open each link in a new tab, close, and go to the previous tab?
In Puppeteer you can create a new page like so: await browser.newPage();, so you can do it many times and store the pages into an array:
let pages = [];
pages.push(await browser.newPage());
then you get the links:
const links = await pages[0].$$eval(
'.ExResult-row .ExHeading > a',
links => links.map(l => l.getAttribute('href'))
);
and finally to create a new page for each link, scrape what you need, and close the page:
for (let link of links) {
pages.push(await browser.newPage());
await pages[pages.length - 1].goto(`${baseUrl}/${link}`);
// your scraping
await pages[pages.length - 1].close();
}
If you need to look up more, refer to the API documentation Puppeteer provides.

Related

Check if html tag contains text node

I have a popup modal in Shopify, I'm using text node instead of innerHtml for security concerns. However, everytime I open the popup modal, the text node keeps getting appended to my h1 tag. Is there any way to check if the node already has been appended? (I don't want to use a boolean value to check if text node has been appended)
html:
<h1 id="ProductHeading" class="product__title product__title--template"></h1>
<h2 id="ProductHeadingModal" class="product__title product__title--template product__title--modal"></h2>
javascript:
var title = document.createTextNode(product.title);
// Product heading is an element with h1 tag
var productHeading = document.getElementById("ProductHeading");
if(// how to check if element has no node?) {
productHeading.appendChild(title);
}
the entire javascript block:
window.onload = () => {
if (window.__shgProductInits.length) {
window.__shgProductInits.forEach((ele) => {
let proId = document.getElementById(ele.uuid);
proId.setAttribute('url', ele.productHandle);
proId.style.cursor='pointer';
proId.addEventListener('click', (e) => {
let productHandle = e.target.parentElement.parentElement.parentElement.getAttribute('url');
fetch('/products/'+productHandle+'.js')
.then((res) =>{return res.json()})
.then((product) => {
console.log(product)
var product = product;
document.getElementsByClassName("product-modal")[0].style.display = "block";
var title = document.createTextNode(product.title);
var productHeading = document.getElementById("ProductHeading");
var productHeadingModal = document.getElementById("ProductHeadingModal");
if(!(productHeading.hasChildNodes())) {
productHeading.appendChild(title);
productHeadingModal.appendChild(title);
var price = document.createTextNode("$" + parseInt(product.price).toFixed(2));
document.getElementById("product-price").appendChild(price);
}
document.getElementById("product-image").src = product.images[0];
});
});
});
}
ProductHeading itself is not a node (I think). And checking innerHtml for length doesn't work as it is always 0
Update:
I've added the conditional check, it still returns false everytime I open the modal.
My code:
My browser console:
My website displays:
Inspect element in browser:
A couple of ways:
if (element.firstChild) {
// It has at least one
}
or the hasChildNodes() function:
if (element.hasChildNodes()) {
// It has at least one
}
or the length property of childNodes:
if (element.childNodes.length > 0) { // Or just `if (element.childNodes.length)`
// It has at least one
}
So you can just write this
var title = document.createTextNode(product.title);
// Product heading is an element with h1 tag
var productHeading = document.getElementById("ProductHeading");
if(!(productHeading.hasChildNodes())) {
productHeading.appendChild(title);
}
Referring this answer
if (productHeading.hasChildNodes()) {
}

How do I sequentially go down a list of links each time a user clicks a button?

I start with a button that activates the function onclick:
<button onclick="openStuff();">Click here!</button>
Then, I have an array of links like so:
var links = [
"msn.com",
"google.com",
"youtube.com",
"bbc.com",
"facebook.com",
"cnn.com",
"fox.com",
"techcrunch.com"];
Finally, I define the function that randomly applies a link to the button from the above array:
openStuff = function () {
// get a random number between 0 and the number of links
var randIdx = Math.random() * links.length;
// round it, so it can be used as array index
randIdx = parseInt(randIdx, 10);
// construct the link to be opened
var link = 'https://' + links[randIdx];
// open it in a new window / tab (depends on browser setting)
window.open(link);
};
The problem is I need it to not be random. Rather, I need it to go down the list sequentially as the user clicks the button. If the user clicks the button once it should go to the first link. But, if this user clicks the button a second time it should go to the second link in the list.
For example: User clicks button: Go to first link, User clicks button again: go to second link, and so on.
var linkIndex = 0;
openStuff = function () {
//when no more links are available to click then return
if(links.length <= linkIndex) return;
// construct the link to be opened
var link = 'https://' + links[linkIndex];
// open it in a new window / tab (depends on browser setting)
window.open(link);
linkIndex++;
};
You need a global index that start in 0. Then, inside the function you increment it after open the link.
var link = 'https://' + links[index];
index++;
I would do it like this:
var special_button_state = 0;
const links = [
"msn.com",
"google.com",
"youtube.com",
"bbc.com",
"facebook.com",
"cnn.com",
"fox.com",
"techcrunch.com"
];
const openStuff = function () {
var link = 'https://' + links[special_button_state];
if(special_button_state < links.length-1) {
special_button_state++;
}
window.open(link);
console.log(link);
};
<button onclick="openStuff();">Click here!</button>
You could have another variable set to the first index of the array, something like:
var linkTarget = 0
and then update linkTarget in the function to increment appropriately:
openStuff = function () {
// construct the link to be opened
var link = 'https://' + links[linkTarget];
// open it in a new window / tab (depends on browser setting)
window.open(link);
// update the index
if (linkTarget++ == links.length)
linkTarget = 0
};
You could also implement a dynamic label so that you know where it's going:
<body onload="setLink()">
<button id="turntable-button" onclick="openStuff()" />
</body>
Label setter on page load (put the same getElementById() line in openStuff() after updating the index):
setLink = function() {
document.getElementById("turntable-button").innerHTML = links[linkTarget];
}

Can't find a working selector on drop down menu while using Puppeteer? [duplicate]

This question already has answers here:
Issue with CSS locator select-react
(2 answers)
Closed 3 years ago.
I'm creating an automated script with puppeteer and I'm running across a problem of trying to find a selector that could be understood. I have tried many different options but gotten no luck.
Note:Don't worry its a dummy account so nothing important is on it.
I tried using
const myacc = '.li.member-nav-item.d-sm-ib.va-sm-m > button';
and bunch of others but still getting selector error
Code:
const puppeteer = require('puppeteer');
var emailVal = 'kellybrando23434#gmail.com';
var passwordVal = 'd34gfA#4dfW';
const AcceptCookies = '#cookie-settings-layout > div > div > div > div.ncss-row.mt5-sm.mb7-sm > div:nth-child(2) > button';
const loginBtn = 'li.member-nav-item.d-sm-ib.va-sm-m > button';
const email = 'input[type="email"]';
const password = 'input[type="password"]';
const logsubmit = '.loginSubmit.nike-unite-component > input[type="button"]';
const myacc = '.li.member-nav-item.d-sm-ib.va-sm-m > button'; //this line contains error
(async () => {
const browser = await puppeteer.launch({headless: false, slowMo: 150});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 })
await page.goto('https://www.nike.com/launch/'); const AcceptCookies = '#cookie-settings-layout > div > div > div > div.ncss-row.mt5-sm.mb7-sm > div:nth-child(2) > button'; await page.click(loginBtn);
console.log("Login Button Clicked...");
await page.waitFor(5000);
console.log("email: " + emailVal);
await page.type(email, emailVal);
console.log("entered email");
await page.type(password, passwordVal);
console.log("waiting 0.5s");
await page.waitFor(500);
console.log("waiting done");
await page.click(logsubmit);
console.log("submitted"); await page.waitFor(10000); await page.click(myacc); await page.waitFor(10000);
await browser.close(); })();
I'm trying to get the correct selector - "const myacc=..."- to click account profile as shown in the picture (highlighted section) but instead I'm getting a selector error ("Error:No node found for selector:...."). How would you find it in this situation as their is no id?
Before Picture
After Picture
There are almost all of the elements contains a unique attribute for testing data-qa. I would recommend using it for testing, so replace all your selectors with it.
Here is the example for the 'my account' selector:
const myAcc = '[data-qa="user-name"]';
Also, you may not see that the selector was clicked due to screen size, so you will need to maximize screen size.

Automatic file downloads limited to 10 files on Chrome browser

I have a webpage where we generate PDFs based upon the user selection of on-page items. This causes a postback (it's an ASP.NET WebForms page) which creates the PDFs server-side. An <a class="documentDownload"> tag is then added to the page for each item.
When the page reloads in the browser the following jQuery script is executed to automatically download the files (if the user had chosen a auto-download option):
var divHost = document.createElement("div");
divHost.id = "elmntDnldLinks";
divHost.style.display = "none";
document.body.appendChild(divHost);
setTimeout(function() {
$(".documentDownload").each(function(idx, val) {
var lnkDownload = $(val),
save = document.createElement("a");
save.href = lnkDownload.attr("href");
save.download = lnkDownload.attr("download");
save.target = "_blank";
divHost.appendChild(save);
save.click();
});
}, 1000);
This script has a delay of 1 second, then for each .documentDownload element it creates a new <a> element with the same href attribute of the original element, appends it to a newly-added hidden element, then programmatically clicks it.
[This strategy of creating new links and clicking those instead of clicking the original DOM elements gets around a browser security measure.]
This works perfectly well in Firefox but Chrome never downloads more than 10 files. Why? I can see, for example, 15 links on the page and in the hidden element, but only 10 files are downloaded.
If you pause for a second between each 10 downloads, all of them will work in Chrome.
I used async timeout function for this workaround:
function pause(msec) {
return new Promise(
(resolve, reject) => {
setTimeout(resolve, msec || 1000);
}
);
}
async function downloadAll(elements) {
var count = 0;
for (var e in elements) {
download(elements[e]); // your custom download code here, click or whatever
if (++count >= 10) {
await pause(1000);
count = 0;
}
}
}
Simple timeouts multiplied by element counter could probably work too, but I did not test it.
You can do a pause in your downloads like that
function sleep(milliseconds) {
let timeStart = new Date().getTime();
while (true) {
let elapsedTime = new Date().getTime() - timeStart;
if (elapsedTime > milliseconds) {
break;
}
}
}
$("#aDescarga")[0].click();
$("#aDescarga").attr("href","");
if (i > 9) {
sleep(2000);
i = 0;
} else {
i = i + 1;
}
Setting a 200ms delay between each download solves the problem.
Tested on Google Chrome 100.0.4896.127 (Windows, macOS)
Demo in CodePen - 'Download multiple files' should be allowed.
const downloadFileWithAnchor = () => {
const anchor = document.createElement("a");
anchor.href = "data:text/plain;charset=utf-8,";
anchor.download = 'blank.txt';
document.body.appendChild(anchor);
anchor.click();
document.body.removeChild(anchor);
};
const repeatCount = 20;
for (let i = 0; i < repeatCount; i += 1) {
setTimeout(
() => {
downloadFileWithAnchor();
},
i * 200 // Delay download every 200ms
);
}

firefox javascript window.open _self

My problem is:
When I use:
window.open("example.com","_self");
or
self.open("example.com");
or
window.location.href="example.com";
Firefox removes all menus, buttons, window's window minimization buttons, everything. Also context menu stop working, but site opens fine except this chaos, which ruins everything.
So how to fix this?
EDIT:
I'm using FF22, fresh install.
Looks like its not a simple case so I drop here entire code, it's slightly edited addon for creating new tabs from context menu:
let _ = require("l10n").get;
let winUtils = require("window-utils");
let { isBrowser } = require("api-utils/window/utils");
var delegate = {
onTrack: function (window) {
if (isBrowser(window) ){
let menu = window.document.getElementById("tabContextMenu");
let newtab = window.document.createElementNS("http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul","menuitem");
newtab.setAttribute("id", "contexttab-newtab");
newtab.setAttribute("label", _("newtab_string"));
newtab.setAttribute("accesskey", _("newtabaccesskey_string"));
newtab.setAttribute("oncommand", "window.location.href='http://www.example.com'");
menu.insertBefore(newtab, menu.firstChild);
} // End isBrowser
} // End ontrack
} // End delegate function
let tracker = new winUtils.WindowTracker(delegate);
// code to remove the menuitem when extension is disabled for satisfy requirement on AMO for pass a full review
// On uninstall the menuitem is not removed, see: https://bugzilla.mozilla.org/show_bug.cgi?id=627432
exports.onUnload = function(reason) {
var unloader = {
onTrack: function (window) {
if (isBrowser(window) ){
let menu = window.document.getElementById("tabContextMenu");
let newtab = window.document.getElementById("contexttab-newtab");
menu.removeChild(newtab);
}
}
}; // End unloader function
let remover = new winUtils.WindowTracker(unloader);
}
This is the only line I edited:
newtab.setAttribute("oncommand", "window.location.href='http://www.example.com'");
gBrowser.loadURI('http://www.example.com');
works properly.
gBrowser.loadURI loads a page into the selected tab I think.
If you want to open a new window you have to do it like this:
var url = Cc['#mozilla.org/supports-string;1'].createInstance(Ci.nsISupportsString);
url.data = 'http://www.bing.com/';
Services.ww.openWindow(null, 'chrome://browser/content/browser.xul', '_blank', 'chrome,all', url);

Categories

Resources