My application gets a list of IDs from the db. I iterate over these with a cursor & for every ID, I plug it into a URL with Selenium to get specific items on a page. This is doing a search on a keyword & getting the most relevant item to that search. There are around 1000 results from the db. At random iterations, 1 of the driver actions will throw up an StaleElementReferenceError with the full message of:
stale element reference: element is not attached to the page document\n (Session info: chrome=77.0.3865.75)
Looking at the official docs I can see that the 2 common causes for this are:
The element has been deleted entirely.
The element is no longer attached to the DOM.
With the former being the most frequent cause.
index.js
const { MongoClient, ObjectID } = require('mongodb')
const fs = require('fs')
const path = require('path')
const { Builder, Capabilities, until, By } = require('selenium-webdriver')
const chrome = require('selenium-webdriver/chrome')
require('dotenv').config()
async function init() {
try {
const chromeOpts = new chrome.Options()
const ids = fs.readFileSync(path.resolve(__dirname, '..', 'data', 'primary_ids.json'), 'utf8')
const client = await MongoClient.connect(process.env.DB_URL || 'mongodb://localhost:27017/test', {
useNewUrlParser: true
})
const db = client.db(process.env.DB_NAME || 'test')
const productCursor = db.collection('product').find(
{
accountId: ObjectID(process.env.ACCOUNT_ID),
primaryId: {
$in: JSON.parse(ids)
}
},
{
_id: 1,
primaryId: 1
}
)
const resultsSelector = 'body #wrapper div.src-routes-search-style__container--2g429 div.src-routes-search-style__products--3rsz9'
const mostRelevantSelector = `${resultsSelector}
> div:nth-child(2)
> div.src-routes-search-product-item-raw-style__product--3vH_O:nth-child(1)`
const titleContainerSelector = `${mostRelevantSelector}
> div.src-routes-search-product-item-raw-style__mainPart--1HEWx
> div.src-routes-search-product-item-raw-style__containerText--3NefD
> div.src-routes-search-product-item-raw-style__description--3swql
> div.src-routes-search-product-item-raw-style__titleContainer--tazkH`
const productImageSelector = `${mostRelevantSelector}
> div.src-routes-search-product-item-raw-style__mainPart--1HEWx
> div.src-routes-search-product-item-raw-style__containerImages--1PfdF
> a.src-routes-search-product-item-raw-style__productImage--1Y42Y
> img`
const linkSelector = `${titleContainerSelector} > a`
const primaryIdSelector = `${titleContainerSelector} > p`
chromeOpts.setChromeBinaryPath('/usr/local/bin')
const driver = await new Builder()
.withCapabilities(Capabilities.chrome())
.forBrowser('chrome')
.build()
let newProds = {}
let product
let i = 0
while (await productCursor.hasNext()) {
i += 1
product = await productCursor.next()
let searchablePrimaryId = product.primaryId
let link
let primaryId
let pId
let href
let img
let imgSrc
if (product.primaryId.includes('#')) {
searchablePrimaryId = product.primaryId.substr(0, product.primaryId.indexOf('#'))
}
if (searchablePrimaryId.includes('-')) {
searchablePrimaryId = searchablePrimaryId.substr(0, searchablePrimaryId.indexOf('-'))
}
await driver.get(`https://icecat.biz/en/search?keyword=${encodeURIComponent(searchablePrimaryId.toLowerCase())}`)
link = await driver.wait(until.elementLocated(By.css(linkSelector)), 10000) // wait 10 seconds
img = await driver.wait(until.elementLocated(By.css(productImageSelector)), 10000)
imgSrc = await img.getAttribute('src')
primaryId = await driver.wait(until.elementLocated(By.css(primaryIdSelector)), 10000)
pId = await primaryId.getText()
href = await link.getAttribute('href')
const iceCatId = href.substr(href.lastIndexOf('-') + 1, href.length)
const _iceCatId = iceCatId.substr(0, iceCatId.indexOf('.html'))
const idFound = (searchablePrimaryId.toUpperCase() === pId.toUpperCase()) && !imgSrc.includes('logo-fullicecat')
newProds[product._id.toString()] = {
primaryId: product.primaryId,
iceCatId: idFound ? _iceCatId : 'N/A'
}
}
const foundProducts = Object.values(newProds).filter(prod => prod.iceCatId !== 'N/A')
console.log(`\nFound ${foundProducts.length}/${JSON.parse(ids).length}`)
fs.writeFileSync(path.resolve(__dirname, '..', 'data', 'new_products.json'), JSON.stringify(newProds, null, 4), 'utf8')
driver.quit()
} catch(err) {
throw err
}
}
init()
.then(res => {
console.log(res)
})
.catch(err => {
console.error(err)
})
To debug, I put a try...catch around each of the driver actions to see which specific action it is that is failing but that didn't work as it was never a consistent action that was failing. For example, sometimes if would have been one of the elementLocated lines or others it would have just been the getAttribute action.
If it is the latter in that scenario, that is why I am confused as to why this error is being thrown as surely selenium has found the element on the page (i.e. link) but is unable to do getAttribute('src') on the element? That's why I'm confused as to the error I'm getting. I imagine I must be doing something wrong with how I am setting up selenium to handle iterations. The iterations never get higher than 110
In your case the second cause is The element is no longer attached to the DOM. If a WebElement is located and the DOM is refreshed afterwards this element become stale even if the DOM hasn't change, the same locator will return new WebElement.
Normally, driver.get() will block until the page is fully loaded, however this site is running JavaScript to load the search results. You can test it by running document.readyState in the developer tools console, you will see "complete" results while the search results are still loading.
The page has a spinner before the results are located, hopefully it will be enough to wait for it to appear and became stale before scraping the page
await driver.get(`https://icecat.biz/en/search?keyword=${encodeURIComponent(searchablePrimaryId.toLowerCase())}`)
let spinner = driver.wait(until.elementIsVisible(By.className('src-routes-search-style__loader---acti')))
driver.wait(until.stalenessOf(spinner))
link = await driver.wait(until.elementLocated(By.css(linkSelector)), 10000)
You don't have wait for Ajax request to finish. The website retrieves and refreshes dom once you go to end and also keeps calling index every few seconds so DOM probably keeps updating. You can probably hold AJAX requests, get your results, process and enable AJAX again.
Could you try removing "await" from img Src = await img.getAttribute('src'). Since wait for img is already handled in its previous line.
Related
I'm fairly new to Puppeteer and I'm trying to practice keep tracking of a selected item from Amazon. However, I'm facing a problem when I try to retrieve some results from the page.
The way I intended this automation to work is by following these steps:
New tab.
Go to the home page of Amazon.
Enter the given product name in the search element.
Press the enter key.
Return the product title and price.
Check this example below:
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on('request', (req) => { // don't load any fonts or images on my requests. To Boost the performance
if (req.resourceType() == 'font' /* || req.resourceType() == 'image' || req.resourceType() == 'stylesheet'*/) {
req.abort();
}
else {
req.continue(); {
}
}
});
const baseDomain = 'https://www.amazon.com';
await page.goto(`${baseDomain}/`, { waitUntil: "networkidle0" });
await page.click("#twotabsearchtextbox" ,{delay: 50})
await page.type("#twotabsearchtextbox", "Bose QuietComfort 35 II",{delay: 50});
await page.keyboard.press("Enter");
await page.waitForNavigation({
waitUntil: 'networkidle2',
});
let productTitle = await page.$$(".a-size-medium, .a-color-base, .a-text-normal")[43]; //varible that holds the title of the product
console.log(productTitle );
debugger;
})();
when I execute this code, I get in the console.log a value of undefined for the variable productTitle. I had a lot of trouble with scraping information from a page I navigate to. I used to do page.evaluate() and it only worked when I'm scraping from the page that I have told the browser to go to.
The first problem is on this line:
let productTitle = await page.$$(".a-size-medium, .a-color-base, .a-text-normal")[43];
// is equivalent to:
let productTitle = await (somePromise[43]);
// As you guessed it, a Promise does not have a property `43`,
// so I think you meant to do this instead:
let productTitle = (await page.$$(".a-size-medium, .a-color-base, .a-text-normal"))[43];
Once this is fixed, you don't get the title text, but a handle to the DOM element. So you can do:
let titleElem = (await page.$$(".a-size-medium, .a-color-base, .a-text-normal"))[43];
let productTitle = await titleElem.evaluate(node => node.innerText);
console.log(productTitle); // "Microphone"
However, I'm not sure that simply selecting the 43rd element will always get you the one you want, but if it isn't, that would be a topic for another question.
I am using puppeteer to scrape a website that is being live updated, to report the latest item elsewhere.
Currently the way I was thinking accomplishing this is to run a setInterval call on my async scrape and to compare if the last item has changed, checking every 30 seconds. I assume there has to be a better way of doing this then that.
Here is my current code:
const puppeteer = require('puppeteer');
playtracker = async () => {
console.log('loading');
const browser = await puppeteer.launch({});
const page = await browser.newPage();
await page.goto('URL-Being-Scraped');
await page.waitForSelector('.playlist-tracklist-view');
let html = await page.$$eval('.playlist-tracklist-view > .playlist-track', tracks => {
tracks = tracks.filter(track => track.querySelector('.playlist-trackname').textContent);
tracks = tracks.map(el => el.querySelector('.playlist-trackname').textContent);
return tracks;
});
console.log('logging', html[html.length-1]);
};
setInterval(playtracker, 30000)
There is an api called "MutationObserver". You can check that out on MDN. Here's the link https://developer.mozilla.org/en-US/docs/Web/API/MutationObserver
What it is doing is basically do whatever you want to do when the spesific element has changed. Lets say you have a list you want to listen. What you would do is
const listElement = document.querySelector( [list element] );
const callbackFunc = funcion foo () {
//do something
}
const yourMutationObserver = new MutationObserver(callbackFunc)
yourMutationObserver.observe(listElement)
You can disconnect your mutation observer with yourMutationObserver.disconnect() method whenever you want.
This could help too if you confused about how to implement it https://stackoverflow.com/a/48145840/14138428
I am trying to scrape this page.
https://www.psacard.com/Pop/GetItemTable?headingID=172510&categoryID=20019&isPSADNA=false&pf=0&_=1583525404214
I want to be able to find the grade count for PSA 9 and 10. If we look at the HTML of the page, you will notice that PSA does a very bad job (IMO) at displaying the data. Every TR is a player. And the first TD is a card number. Let's just say I want to get Card Number 1 which in this case is Kevin Garnett.
There are a total of four cards, so those are the only four cards I want to display.
Here is the code I have.
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.psacard.com/Pop/GetItemTable?headingID=172510&categoryID=20019&isPSADNA=false&pf=0&_=1583525404214");
const tr = await page.evaluate(() => {
const tds = Array.from(document.querySelectorAll('table tr'))
return tds.map(td => td.innerHTML)
});
const getName = tr.map(name => {
//const thename = Array.from(name.querySelectorAll('td.card-num'))
console.log("\n\n"+name+"\n\n");
})
await browser.close();
})();
I will get each TR printed, but I can't seem to dive into those TRs. You can see I have a line commented out, I tried to do this but get an error. As of right now, I am not getting it by the player dynamically... The easiest way I would think is to create a function that would think about getting the specific card would be doing something where the select the TR -> TD.card-num == 1 for Kevin.
Any help with this would be amazing.
Thanks
Short answer: You can just copy and paste that into Excel and it pastes perfectly.
Long answer: If I'm understanding this correctly, you'll need to map over all of the td elements and then, within each td, map each tr. I use cheerio as a helper. To complete it with puppeteer just do: html = await page.content() and then pass html into the cleaner I've written below:
const cheerio = require("cheerio")
const fs = require("fs");
const test = (html) => {
// const data = fs.readFileSync("./test.html");
// const html = data.toString();
const $ = cheerio.load(html);
const array = $("tr").map((index, element)=> {
const card_num = $(element).find(".card-num").text().trim()
const player = $(element).find("strong").text()
const mini_array = $(element).find("td").map((ind, elem)=> {
const hello = $(elem).find("span").text().trim()
return hello
})
return {
card_num,
player,
column_nine: mini_array[13],
column_ten: mini_array[14],
total:mini_array[15]
}
})
console.log(array[2])
}
test()
The code above will output the following:
{
card_num: '1',
player: 'Kevin Garnett',
column_nine: '1-0',
column_ten: '0--',
total: '100'
}
I have a firebase function that deletes old messages after 24 hours as in my old question here. I now have just the messageIds stored in an array under the user such that the path is: /User/objectId/myMessages and then an array of all the messageIds under myMessages. All of the messages get deleted after 24 hours, but the iDs under the user's profile stay there. Is there a way to continue the function so that it also deletes the messageIds from the array under the user's account?
I'm new to Firebase functions and javascript so I'm not sure how to do this. All help is appreciated!
Building upon #frank-van-puffelen's accepted answer on the old question, this will now delete the message IDs from their sender's user data as part of the same atomic delete operation without firing off a Cloud Function for every message deleted.
Method 1: Restructure for concurrency
Before being able to use this method, you must restructure how you store entries in /User/someUserId/myMessages to follow best practices for concurrent arrays to the following:
{
"/User/someUserId/myMessages": {
"-Lfq460_5tm6x7dchhOn": true,
"-Lfq483gGzmpB_Jt6Wg5": true,
...
}
}
This allows you to modify the previous function to:
// Cut off time. Child nodes older than this will be deleted.
const CUT_OFF_TIME = 24 * 60 * 60 * 1000; // 2 Hours in milliseconds.
exports.deleteOldMessages = functions.database.ref('/Message/{chatRoomId}').onWrite(async (change) => {
const rootRef = admin.database().ref(); // needed top level reference for multi-path update
const now = Date.now();
const cutoff = (now - CUT_OFF_TIME) / 1000; // convert to seconds
const oldItemsQuery = ref.orderByChild('seconds').endAt(cutoff);
const snapshot = await oldItemsQuery.once('value');
// create a map with all children that need to be removed
const updates = {};
snapshot.forEach(messageSnapshot => {
let senderId = messageSnapshot.child('senderId').val();
updates['Message/' + messageSnapshot.key] = null; // to delete message
updates['User/' + senderId + '/myMessages/' + messageSnapshot.key] = null; // to delete entry in user data
});
// execute all updates in one go and return the result to end the function
return rootRef.update(updates);
});
Method 2: Use an array
Warning: This method falls prey to concurrency issues. If a user was to post a new message during the delete operation, it's ID could be removed while evaluating the deletion. Use method 1 where possible to avoid this.
This method assumes your /User/someUserId/myMessages object looks like this (a plain array):
{
"/User/someUserId/myMessages": {
"0": "-Lfq460_5tm6x7dchhOn",
"1": "-Lfq483gGzmpB_Jt6Wg5",
...
}
}
The leanest, most cost-effective, anti-collision function I can come up for this data structure is the following:
// Cut off time. Child nodes older than this will be deleted.
const CUT_OFF_TIME = 24 * 60 * 60 * 1000; // 2 Hours in milliseconds.
exports.deleteOldMessages = functions.database.ref('/Message/{chatRoomId}').onWrite(async (change) => {
const rootRef = admin.database().ref(); // needed top level reference for multi-path update
const now = Date.now();
const cutoff = (now - CUT_OFF_TIME) / 1000; // convert to seconds
const oldItemsQuery = ref.orderByChild('seconds').endAt(cutoff);
const snapshot = await oldItemsQuery.once('value');
// create a map with all children that need to be removed
const updates = {};
const messagesByUser = {};
snapshot.forEach(messageSnapshot => {
updates['Message/' + messageSnapshot.key] = null; // to delete message
// cache message IDs by user for next step
let senderId = messageSnapshot.child('senderId').val();
if (!messagesByUser[senderId]) { messagesByUser[senderId] = []; }
messagesByUser[senderId].push(messageSnapshot.key);
});
// Get each user's list of message IDs and remove those that were deleted.
let pendingOperations = [];
for (let [senderId, messageIdsToRemove] of Object.entries(messagesByUser)) {
pendingOperations.push(admin.database.ref('User/' + senderId + '/myMessages').once('value')
.then((messageArraySnapshot) => {
let messageIds = messageArraySnapshot.val();
messageIds.filter((id) => !messageIdsToRemove.includes(id));
updates['User/' + senderId + '/myMessages'] = messageIds; // to update array with non-deleted values
}));
}
// wait for each user's new /myMessages value to be added to the pending updates
await Promise.all(pendingOperations);
// execute all updates in one go and return the result to end the function
return ref.update(updates);
});
Update: DO NOT USE THIS ANSWER (I will leave it as it may still be handy for detecting a delete operation for some other need, but do not use for the purpose of cleaning up an array in another document)
Thanks to #samthecodingman for providing an atomic and concurrency safe answer.
If using Firebase Realtime Database you can add an onChange event listener:
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp();
exports.onDeletedMessage = functions.database.ref('Message/{messageId}').onChange(async event => {
// Exit if this item exists... if so it was not deleted!
if (event.data.exists()) {
return;
}
const userId = event.data.userId; //hopefully you have this in the message document
const messageId = event.data.messageId;
//once('value') useful for data that only needs to be loaded once and isn't expected to change frequently or require active listening
const myMessages = await functions.database.ref('/users/' + userId).once('value').snapshot.val().myMessages;
if(!myMessages || !myMessages.length) {
//nothing to do, myMessages array is undefined or empty
return;
}
var index = myMessages.indexOf(messageId);
if (index === -1) {
//nothing to delete, messageId is not in myMessages
return;
}
//removeAt returns the element removed which we do not need
myMessages.removeAt(index);
const vals = {
'myMessages': myMessages;
}
await admin.database.ref('/users/' + userId).update(vals);
});
If using Cloud Firestore can add an event listener on the document being deleted to handle cleanup in your user document:
exports.onDeletedMessage = functions.firestore.document('Message/{messageId}').onDelete(async event => {
const data = event.data();
if (!data) {
return;
}
const userId = data.userId; //hopefully you have this in the message document
const messageId = data.messageId;
//now you can do clean up for the /user/{userId} document like removing the messageId from myMessages property
const userSnapShot = await admin.firestore().collection('users').doc(userId).get().data();
if(!userSnapShot.myMessages || !userSnapShot.myMessages.length) {
//nothing to do, myMessages array is undefined or empty
return;
}
var index = userSnapShot.myMessages.indexOf(messageId);
if (index === -1) {
//nothing to delete, messageId is not in myMessages
return;
}
//removeAt returns the element removed which we do not need
userSnapShot.myMessages.removeAt(index);
const vals = {
'myMessages': userSnapShot.myMessages;
}
//To update some fields of a document without overwriting the entire document, use the update() method
await admin.firestore().collection('users').doc(userId).update(vals);
});
I'm using the default TypeScript service and the models are initialized asynchronously with one model depending on the other. There's a case where the two models cannot detect each other so it shows a semantic error. If I make some edits in the dependent model, which causes the model to be re-validated, the errors disappear.
I have tried to setModel manually, which solves the problems. However, it destroys the undo history.
Is there a way to re-validate the model manually?
That's my solution, which is extracted from monaco-typescript:
async function revalidateModel(model) {
if (!model || model.isDisposed()) return;
const getWorker = await monaco.languages.typescript.getTypeScriptWorker();
const worker = await getWorker(model.uri);
const diagnostics = (await Promise.all([
worker.getSyntacticDiagnostics(model.uri.toString()),
worker.getSemanticDiagnostics(model.uri.toString())
])).reduce((a, it) => a.concat(it));
const markers = diagnostics.map(d => {
const start = model.getPositionAt(d.start);
const end = model.getPositionAt(d.start + d.length);
return {
severity: monaco.MarkerSeverity.Error,
startLineNumber: start.lineNumber,
startColumn: start.column,
endLineNumber: end.lineNumber,
endColumn: end.column,
message: flattenDiagnosticMessageText(d.messageText, "\n")
};
});
const owner = model.getLanguageIdentifier().language;
monaco.editor.setModelMarkers(model, owner, markers);
}
Call the function above when model is created asynchronizedly.
This is what I did to fix it:
setInterval(() => {
const range = new monaco.Range(1,1,1,1);
const addEmptySpace = {forceMoveMarkers: true, range, text: ' '};
for (const m of monaco.editor.getModels()) {
const toInvert = m.applyEdits([addEmptySpace]);
m.applyEdits(toInvert);
}
}, 50*1000)
Every fifty seconds you insert and immediately remove a space. I don't like it, but it works.