I'm using nightmare.js to scrape public records and am just trying to get the scraper to wait for the next page to load. I'm crawling search results, pressing a next button to (obviously) get to the next page. I can't use nightmare.wait(someConstTime) to accurately wait for the next page to load because sometimes someConstTime is shorter than the time it takes for the next page to load (although it's always under 30 seconds). I also can't use nightmare.wait(selector) because the same selectors are always present on all result pages. In that case nightmare basically doesn't wait at all because the selector is already present (on the page I already scraped), so it will proceed to scrape the same page several times unless the new page loads before the next loop.
How can I conditionally wait for the next page to load after I click on the next button?
If I could figure out how - I would compare the "Showing # to # of ## entries" indicator of the current page (currentPageStatus) to the last known value (lastPageStatus) and wait until they're different (hence the next page loaded).
(ignore that the example image only has one search result page)
I'd do that using this code from https://stackoverflow.com/a/36734481/3491991 but that would require passing lastPageStatus into deferredWait (which I can't figure out).
Here's the code I've got so far:
// Load dependencies
//const { csvFormat } = require('d3-dsv');
const Nightmare = require('nightmare');
const fs = require('fs');
var vo = require('vo');
// Entry point of the Peoria County property-tax search site.
const START = 'http://propertytax.peoriacounty.org';
// Two-digit parcel-number prefixes, "01" through "19".
var parcelPrefixes = Array.from({ length: 19 }, function (_, idx) {
  return String(idx + 1).padStart(2, '0');
});
// Drive the generator-based scraper with vo and rethrow any error it reports.
vo(main)(function (err, result) {
  if (err) {
    throw err;
  }
});
/**
 * Scrape the parcel numbers from every page of search results for the first
 * parcel prefix.
 *
 * Flow: open the search form, submit parcelPrefixes[0], then for each result
 * page read all '.sorting_1' cells, and, unless the "next" button is
 * disabled, click it and wait until the "Showing # to # of ## entries"
 * indicator ('.dataTables_info') differs from the value seen before the click.
 *
 * Errors are logged rather than rethrown; the Nightmare instance is always
 * ended, even when a step fails.
 *
 * @returns {string[]} parcel numbers collected so far (partial if a step failed)
 */
function* main() {
  var nightmare = Nightmare();
  var currentPage = 0;
  var parcels = [];

  try {
    // Go to Peoria Tax Records Search and submit the first prefix
    yield nightmare
      .goto(START)
      .wait('input[name="property_key"]')
      .insert('input[name="property_key"]', parcelPrefixes[0])
      // Click search button (.btn.btn-success)
      .click('.btn.btn-success')
      // First page of results is ready once a result cell exists
      .wait('.sorting_1');

    while (true) {
      console.log('The current page should be: ', currentPage); // Display page status

      // Scrape BEFORE testing for the last page so the final (or only)
      // page of results is not skipped.
      const result = yield nightmare.evaluate(() => {
        return [...document.querySelectorAll('.sorting_1')]
          .map(el => el.innerText);
      });
      parcels.push(...result);

      const isLastPage = yield nightmare.visible('.paginate_button.next.disabled');
      if (isLastPage) {
        break;
      }

      // Remember the indicator text of the page that was just scraped.
      const lastPageStatus = yield nightmare.evaluate(() => {
        return document.querySelector('.dataTables_info').innerText;
      });
      console.log(lastPageStatus);

      // wait(selector) is useless here because the same selectors exist on
      // every page. wait(fn, arg) re-evaluates fn inside the page, with arg
      // passed in from Node, until it returns true.
      yield nightmare
        .click('.paginate_button.next')
        .wait(function (previousStatus) {
          var info = document.querySelector('.dataTables_info');
          return info !== null && info.innerText !== previousStatus;
        }, lastPageStatus);

      currentPage++;
    }
  } catch (e) {
    console.error(e);
  } finally {
    // Always shut the browser down, including on failure.
    yield nightmare.end();
  }

  return parcels;
}
I had a similar issue that I managed to fix. Basically I had to navigate to a search page, select the '100 per page' option and then wait for the refresh. Only problem was, it was a crapshoot as to whether a manual wait time allowed the AJAX to fire and repopulate with more than 10 results (the default).
I ended up doing this:
// Switch the search to 100 results per page and only scrape once the reload
// has actually happened.
nightmare
  .goto(url)
  .wait('input.button.primary')
  .click('input.button.primary')
  .wait('#searchresults')
  .select('#resultsPerPage', "100")
  .click('input.button.primary')
  // The default page holds 10 results, so an 11th .searchresult can only
  // appear after the 100-per-page reload has completed.
  .wait('.searchresult:nth-child(11)')
  .evaluate(function() {
    // ... (extraction code goes here)
  })
  .end()
With this, the evaluate won't fire until it detects at least 11 divs with the class of .searchresult. Given that the default is 10, it has to wait for the reload for this to complete.
You could extend this to scrape the total number of available results from the first page to ensure that there are - in my case - more than 10 available. But the foundation of the concept works.
From what I could understand, basically you need the DOM change to be completed before you start extracting from the page being loaded.
In your case, the element for DOM changes is table with CSS selector: '#search-results'
I think MutationObserver is what you need.
I have used Mutation Summary library which provides a nice wrapper on raw functionality of MutationObservers, to achieve something similar
// Create a MutationSummary observer: updateWidgets is registered as the
// callback, and the single query targets elements matching '[data-widget]'.
var observer = new MutationSummary({
callback: updateWidgets,
queries: [{
element: '[data-widget]'
}]
});
:From Tutorial
First register MutationSummary observer when the search results are loaded.
Then, after clicking 'Next' use nightmare.evaluate to wait for mutationSummary callback to return extracted values.
Related
Here is what I have going on. I have a rPI that launches chrome into three tabs that I have set using xdotool to cycle between the three tabs. Everything is working great with that functionality, but I am looking to have it stop cycling and stay on one of the tabs when an event on that website happens. I have the code done to go back to that tab and stay there for x-amount of time. What I need help with is getting the code to recognize the event happening. I have watched the console when the event occurs and there is a log of the function call as well as the object that is passed from the JS code. If there is a way to monitor that console log real-time in the background and catch that function call being printed to the log then I could use that to fire the rest of the logic to lock the screen to that tab.
Or if anyone can come up with a different/easier plan that would be greatly appreciated. When the function call happens there is a list of names that displays on the website. Maybe we could check that list for any name and then lock the screen.
I tried to use selenium to grab the logs. I was able to get it to start chrome and then go to the website and pull up the logs. That worked as it was supposed to from the documentation that I have read. The problem is I need something to run on an already running instance of chrome. Maybe have it in the code that when it goes to the tab where the function would be called it would check the log and execute code, not launch and then close an instance of chrome.
If there is a way to monitor that console log real-time in the
background and catch that function call being printed to the log
There is (though not in the background). Here's how you can do it
/**
 * Replacement for console.log that lets every logged value be inspected
 * before it is forwarded to the original console.log.
 *
 * @param {...*} args - everything that was passed to console.log
 * @returns {*} whatever the original console.log returns
 */
function myConsoleLogFunc(...args) {
  // examine the info being logged (args[0] is the first logged value)

  // Forward through the saved original. `this.log` must not be used here:
  // when invoked as console.log(...), `this` is console and console.log is
  // this very function, so the call would recurse until the stack overflows.
  return myConsoleLogFunc.log.apply(console, args);
}
// Keep a reference to the original implementation before replacing it.
myConsoleLogFunc.log = console.log;
console.log = myConsoleLogFunc;
So I was able to get the answer I was looking for with puppeteer and some navigation and bash scripts. Below is the code that I used to complete the task.
const puppeteer = require('puppeteer-core');
/**
 * Log in to the IamResponding member page in a background Chromium instance
 * and react to messages the page writes to its console:
 *   - the first message containing "pushrespond" runs /home/pi/open.sh;
 *   - a message containing "pushautoclear" runs /home/pi/exit.sh, but only if
 *     open.sh was triggered before, and re-arms the "pushrespond" trigger.
 *
 * The browser is intentionally left open so the console listener keeps running.
 *
 * @returns {Promise<void>} resolves once login is done and the listener is attached
 */
async function start() {
  const { exec } = require('child_process');

  const browser = await puppeteer.launch({ executablePath: '/usr/bin/chromium-browser' }); // launch browser window in bg
  const page = await browser.newPage(); // get new page in browser
  await page.setViewport({ width: 1280, height: 800 }); // set window size
  await page.goto('https://auth.iamresponding.com/login/member'); // open i am responding page
  await page.click('#accept-policy'); // click accept cookies
  await page.type('#Input_Agency', '#########'); // input agency name
  await page.type('#Input_Username', '#########'); // input user name
  await page.type('#Input_Password', '#########'); // input password
  // Start waiting for the navigation before clicking so a fast page load
  // cannot complete before the wait is registered.
  await Promise.all([
    page.waitForNavigation(),
    page.click('button[name="Input.button"]'), // click login button
  ]);

  // Number of "pushrespond" notices seen since the last auto-clear; the
  // open script only fires on the first one.
  let loaded = 0;

  // Run a shell command and echo its output (and any exec error) to the console.
  function runScript(command) {
    exec(command, (error, stdout, stderr) => {
      console.log(stdout);
      console.log(stderr);
      if (error !== null) {
        console.log(`exec error: ${error}`);
      }
    });
  }

  // Inspect one console message for the responding / auto-clear markers.
  function testValue(cLog) {
    if (cLog.includes('pushrespond')) {
      loaded += 1;
      if (loaded === 1) {
        // first notice: open new chrome window and execute login
        runScript('sh /home/pi/open.sh');
      }
    } else if (cLog.includes('pushautoclear') && loaded >= 1) {
      // only close a window that was actually opened, so the main browser
      // is not closed when no one was responding
      runScript('sh /home/pi/exit.sh');
      loaded = 0; // reset so the next "pushrespond" opens a window again
    }
  }

  // get the console logs from the browser and pass them to the test method
  page.on('console', (message) => {
    const text = message.text();
    console.log(text);
    testValue(text);
  });
}
I am loading a page, intercepting its requests, and when a certain element shows up I stop loading and extract the data I need...
Here is the problem that I am faced with.
When simplified the code looks actually something like this:
/**
 * Simplified reproduction of the problem.
 *
 * A 10-second timer stands in for page.waitForSelector: when it fires it
 * stores the extracted value (100) and flags the content as loaded. A request
 * listener is meant to keep handling requests until that happens.
 *
 * The final return runs right away, before the timer fires, so the function
 * resolves to false instead of the content.
 */
async function loadPage() {
  let content;
  let contentLoaded = false;

  // Pretend the element "shows up" after 10 seconds and its content is
  // extracted and stored at that moment.
  setTimeout(() => {
    content = 100;
    contentLoaded = true;
  }, 10000);

  // Intercept requests and handle them until the content is loaded.
  page.on('request', (req) => {
    if (contentLoaded) {
      return;
    }
    // keep loading page
  });

  // This should not run until the data is available (or waitForSelector
  // timed out), but nothing above blocks, so it executes immediately and
  // yields false.
  return contentLoaded ? content : false;
}

var x = loadPage();
x.then(console.log); //should log the data or false if error occured
Thank you all for taking the time to read this and help out, I'm a novice so any feedback or even reading material is welcome if you think there is something I'm not fully understanding
Solved
Simple explanation:
Here is what I was trying to accomplish:
Intercept page requests so that I can decide what not to load, and speedup loading
Once an element shows up on the page, i want to extract some data and return it.
I was trying to return it like this: (note, all the browser and error handling will be left out in these since it would just clutter the explanation)
// Pseudo-code of the failing attempt (not valid JavaScript as written).
// loadPage is declared async, so this variable receives a promise.
var data = loadPage(url);
async function loadPage(URL)
{
var data;
// The waitForSelector call below is neither awaited nor returned.
page.waitForSelector(
var x = //page.evaluate returns data to x...
data = x;
)
// Runs immediately, before waitForSelector has finished, so data is still undefined.
return data;
}
Which doesn't work since return runs immediately but waitForSelector runs later, so we always return undefined...
The correct way of doing it, or rather the way it works for me is to return the whole promise, and then extract the data...
// Pseudo-code of the working approach (not valid JavaScript as written):
// loadPage hands back the promise and the caller reads the value in .then().
var data = loadPage(url);
data.then(//do what needs to be done with the data);
async function loadPage(URL)
{
// Keep the promise produced by waitForSelector instead of discarding it.
var data = page.waitForSelector(
var x = //page.evaluate returns data to x...
data = x;
)
return data; // we return data as a promise
}
I hope it's a solid enough explanation, if someone needs to see the whole deal, I could edit the question and place the whole code there...
I am trying to test a react webapp (created in a separate project), that contains a popup where there's an input containing a google auto-complete for cities:
(I changed text because of language)
I have in "search city" a text input where if data is inserted, google searches for cities and returns results (eg I search Rome, Italy):
When I press "save data" there's a function that checks google results, then closes the popup:
in a file:
/**
 * Expose the Google Places autocomplete service to components.
 *
 * The service is only created when the Maps JavaScript API and its places
 * library are already present on `window`; otherwise `autocompleteService`
 * is undefined and callers must handle that case.
 *
 * @returns {{ autocompleteService: (object|undefined) }}
 */
export const useGoogleApiDesktop = () => {
  let autocompleteService
  if (window.google && window.google.maps && window.google.maps.places) {
    autocompleteService = new window.google.maps.places.AutocompleteService()
  }
  // Callers read `googleApi.autocompleteService`, so the service has to be
  // returned; without this the hook result is undefined.
  return { autocompleteService }
}
in another file (the one called):
// Hook result; the submit handler below reads `autocompleteService` from it.
const googleApi = useGoogleApiDesktop()
// Submit handler: requests city predictions (restricted to country 'it') for
// addressComputed from the autocomplete service. Part of the body is elided.
const onSubmitClick = useCallback(async () => {
[...]
const res: GoogleApiPlacesResponse = await googleApi.autocompleteService.getPlacePredictions({
input: addressComputed,
types: ['(cities)'],
componentRestrictions: { country: 'it' }
})
// NOTE(review): the dependency list is empty although the callback reads
// googleApi and addressComputed — confirm stale values are not a problem.
}, [])
When I use it in plain browser, everything works fine;
but if I try to launch it with cypress to test it, it returns me this error:
I am trying to avoid this error and simply go on and close the popup, since during my tests I do not need to write anything on that line; I only need to write something on the other textareas and close the popup.
Since I couldn't do it, I've tried to stub that call, but I am totally new to using cy.stub() and it does not work:
/**
 * Open the address popup, fill in its fields from the booking config and
 * save it.
 *
 * The "save" handler calls the Google Places autocomplete service, so the
 * Maps JavaScript API is injected into the application window and the test
 * waits until `google.maps.places` exists before touching the form.
 *
 * @param {{street: string, streetNum: string, nameOnTheDoorbell: string, addressAlias: string}} bookingConfig
 */
function selectAddress(bookingConfig) {
  // add the google library to the application under test; cy.window() is the
  // public way to reach its window (cy.state('window') is internal)
  cy.window().then((win) => {
    const script = win.document.createElement('script')
    script.src = `https://maps.googleapis.com/maps/api/js?key=[myApiKey]&libraries=places&language=it`
    script.async = true
    // Append the 'script' element to 'head'
    win.document.head.appendChild(script)
  })

  // opens the popup
  cy.get('.reservationsWhereAdd').click()

  // The API is fetched over the network, so a fixed cy.wait() is unreliable.
  // its() retries until the property path exists or the timeout expires, in
  // which case the test fails.
  cy.window().its('google.maps.places', { timeout: 10000 })

  // type something in some fields
  cy.get('#street').type(bookingConfig.street)
  cy.get('#streetNumber').type(bookingConfig.streetNum)
  cy.get('#nameOnTheDoorbell').type(bookingConfig.nameOnTheDoorbell)
  cy.get('#addressAlias').type(bookingConfig.addressAlias)

  // this closes the popup
  cy.get('.flex-1 > .btn').click()
}
This cy.stub, however, does not work, and I don't get why: it says
googleApi is not defined
Any idea on how to solve this? Thanks!
UPDATE:
After the error, working with the cypress window, I manually closed the popup, reopened it, filled the fields, and clicked on save data. It worked, so I added a cy.wait(1000) just after opening the popup and it works for 95% of the times (9 times on 10). Any Idea on how to "wait for loading the google api, then fill the fields"?
As the update block said, I discovered that the problem was that it took a really long time to load the Google API, because it's not local and needs time to be retrieved.
So at first I just put a cy.wait(2000) before executing my code; but this couldn't be the answer: what happens if I run the code on a slow network? Or if it takes more time for my application to load?
So, i created a command, that first waits for the google API to load; if it fails to load after 5 attempts, the test fails.
Then, after that, my code is being executed. This way my test won't fail really easily.
Here's the code:
in cypress/support/command.js
/**
 * Custom command: wait until the application window exposes `google`.
 *
 * The window is checked, and while `google` is missing the command waits
 * `retryDelay` ms and checks again, up to `maxRetries` waits. When the
 * retries are exhausted an error is thrown so the test fails instead of
 * carrying on without the API.
 *
 * @param {object} [options]
 * @param {number} [options.maxRetries=5] - waits allowed before giving up
 * @param {number} [options.retryDelay=2000] - milliseconds between checks
 */
Cypress.Commands.add('waitForGoogleApi', ({ maxRetries = 5, retryDelay = 2000 } = {}) => {
  let mapWaitCount = 0

  const cyMapLoad = () => {
    mapWaitCount++
    cy.window().then(win => {
      if (typeof win.google !== 'undefined') {
        console.log(`Done at attempt #${mapWaitCount}:`, win)
        return true
      }
      if (mapWaitCount > maxRetries) {
        // Throwing inside a Cypress callback fails the running test.
        throw new Error(`Failed to load google api after ${mapWaitCount} attempts`)
      }
      console.log('Waiting attempt #' + mapWaitCount) // just log
      cy.wait(retryDelay)
      cyMapLoad()
    })
  }

  cyMapLoad()
})
in file you want to use it:
// Run the dependent steps only after the waitForGoogleApi command has finished.
cy.waitForGoogleApi().then(() => {
// here comes the code to execute after loading the google Apis
})
I am trying to dump out a few key measurements to console when my test runs, rather than getting them from the reporter output, but I can't see how to grab the time taken for the last step to execute. Here's a simplified version based on the docs for request.timing() but I don't think that what I'm doing is classed as a request:
// The package name is '@playwright/test'; a '#' prefix cannot be resolved.
const { test, expect } = require('@playwright/test');

// Attempt to print the timing of the step that reloads the application list.
test('ApplicationLoadTime', async ({ page }) => {
  // Wait for applications to load
  await page.waitForSelector('img[alt="Application"]');

  // Not working! - get time for step execution.
  // page.click() resolves without a value, so `fir` is undefined here and
  // fir.timing() throws; timing() belongs to a request object.
  const [fir] = await Promise.all([
    page.click('text=Further information requested'),
    page.waitForSelector('img[alt="Application"]')
  ]);

  console.log(fir.timing());
});
The click on "Further information requested" causes the page to be modified based on an AJAX call in the background and the appearance of the Application img tells me it's finished. Is this possible or do I need to rely on the reports instead?
fir is going to be undefined in your code as page.click() doesn't return anything. You need to wait for the request whose timing you're interested in, use page.waitForEvent('requestfinished') or waitForNavigation:
// The package name is '@playwright/test'; a '#' prefix cannot be resolved.
const { test, expect } = require('@playwright/test');

// Print the timing of the background request triggered by the click.
test('ApplicationLoadTime', async ({ page }) => {
  // Wait for applications to load
  await page.waitForSelector('img[alt="Application"]');

  // The first promise resolves to the finished request, so `fir` is that
  // request. It is registered before the click so the event cannot be missed.
  const [fir] = await Promise.all([
    // Wait for the request
    page.waitForEvent('requestfinished', r => r.url() === '<url of interest>'),
    page.click('text=Further information requested'),
    page.waitForSelector('img[alt="Application"]')
  ]);

  console.log(fir.timing());
});
I'm trying to download the HTML of a website that is almost entirely generated by JavaScript. So, I need to simulate browser access and have been playing around with PhantomJS. Problem is, the site uses hashbang URLs and I can't seem to get PhantomJS to process the hashbang -- it just keeps calling up the homepage.
The site is http://www.regulations.gov. The default takes you to #!home. I've tried using the following code (from here) to try and process different hashbangs.
// PhantomJS script using the phantom.state / phantom.open API.
// NOTE(review): presumably the script runs again after phantom.open() has
// loaded the page, with phantom.state carried over — confirm for the
// PhantomJS version in use.
if (phantom.state.length === 0) {
// phantom.state is still empty: require a hash argument, then store a
// timestamp in phantom.state and open the site root.
if (phantom.args.length === 0) {
console.log('Usage: loadreg_1.js <some hash>');
phantom.exit();
}
var address = 'http://www.regulations.gov/';
console.log(address);
phantom.state = Date.now().toString();
phantom.open(address);
} else {
// phantom.state is set: point the document at the requested hash and print it.
var hash = phantom.args[0];
document.location = hash;
console.log(document.location.hash);
// NOTE(review): elapsed is computed but never read in this script.
var elapsed = Date.now() - new Date().setTime(phantom.state);
if (phantom.loadStatus === 'success') {
// NOTE(review): first_time is only assigned inside this block — confirm
// whether its value can survive between runs of the script.
if (!first_time) {
var first_time = true;
if (!document.addEventListener) {
console.log('Not SUPPORTED!');
}
// Render a screenshot and dump the current markup, then quit. Nothing here
// waits for content the page loads after the hash change.
phantom.render('result.png');
var markup = document.documentElement.innerHTML;
console.log(markup);
phantom.exit();
}
} else {
console.log('FAIL to load the address');
phantom.exit();
}
}
This code produces the correct hashbang (for instance, I can set the hash to '#!contactus') but it doesn't dynamically generate any different HTML--just the default page. It does, however, correctly output that hash when I call document.location.hash.
I've also tried to set the initial address to the hashbang, but then the script just hangs and doesn't do anything. For example, if I set the url to http://www.regulations.gov/#!searchResults;rpp=10;po=0 the script just hangs after printing the address to the terminal and nothing ever happens.
The issue here is that the content of the page loads asynchronously, but you're expecting it to be available as soon as the page is loaded.
In order to scrape a page that loads content asynchronously, you need to wait to scrape until the content you're interested in has been loaded. Depending on the page, there might be different ways of checking, but the easiest is just to check at regular intervals for something you expect to see, until you find it.
The trick here is figuring out what to look for - you need something that won't be present on the page until your desired content has been loaded. In this case, the easiest option I found for top-level pages is to manually input the H1 tags you expect to see on each page, keying them to the hash:
// Expected H1 text for each top-level page, keyed by that page's URL hash.
var titleMap = {
'#!contactUs': 'Contact Us',
'#!aboutUs': 'About Us'
// etc for the other pages
};
Then in your success block, you can set a recurring timeout to look for the title you want in an h1 tag. When it shows up, you know you can render the page:
// Once the page shell has loaded, poll every 300 ms until an <h1> whose text
// matches the title expected for the requested hash appears, then render the
// page and exit.
if (phantom.loadStatus === 'success') {
    var timeoutId = window.setInterval(function () {
        var expectedTitle = titleMap[hash];
        // check for title element you expect to see
        var h1s = document.querySelectorAll('h1');

        // querySelectorAll always returns a (possibly empty) node list, so
        // test its length rather than its truthiness.
        if (h1s.length === 0) {
            console.log('No H1 tags found.');
            return;
        }

        // h1s is a node list, not an array, hence the borrowed array method
        var found = Array.prototype.some.call(h1s, function (h1) {
            return h1.textContent.trim() === expectedTitle;
        });

        if (!found) {
            console.log('Found H1 tags, but not ' + expectedTitle);
            return;
        }

        // we found it!
        console.log('Found H1: ' + expectedTitle);
        phantom.render('result.png');
        console.log("Rendered image.");
        // stop the cycle
        window.clearInterval(timeoutId);
        phantom.exit();
    }, 300);
}
The above code works for me. But it won't work if you need to scrape search results - you'll need to figure out an identifying element or bit of text that you can look for without having to know the title ahead of time.
Edit: Also, it looks like the newest version of PhantomJS now triggers an onResourceReceived event when it gets new data. I haven't looked into this, but you might be able to bind a listener to this event to achieve the same effect.