WebdriverIO scrape content from x pages - javascript

I am working on an API that uses WebdriverIO to automate some tasks for sites that have no API support. I am currently facing an issue with dynamically scraping x number of pages. The scenario: only, say, 20 items are listed on each page, and I can click Next to jump to the next page, scrape it, and repeat until there are no more pages.
The issue: there is no fixed number of pages; one time it could be 2 and another it could be 12, so I need to handle this dynamically.
My problem (I think): judging from the console output, the Selenium session is being killed before the recursive scraping is finished, because execution proceeds to the .execute() I have further down to test things.
My question: how can I recursively scrape those pages without the script continuing on and terminating the session before I am finished?
Here is a slightly redacted version of what I have so far. Thanks!
...webdriver doing stuff that works...
.then(function() {
    console.log('Entering the initial THIS...');
    function recursive() {
        console.log('This is inside the RECURSIVE Function!');
        client
            .execute(function() {
                return document.title;
            })
            .then(function(ret) {
                console.log('The thing I asked it for is: ' + ret.value);
            })
            .isExisting('#lastPage').then(function(lastPage) {
                console.log(lastPage);
                if (!lastPage) {
                    console.log('Its not the last page!');
                    client
                        .click('//a[text()="Next"]')
                        .saveScreenshot('/src/api/debug_images/05.png')
                    recursive();
                }
            })
    }
    recursive();
})
.execute(function() {
    var table = document.getElementById("transactionHistoryTable");
    var result = [];
    for (var i = 1, row; row = table.rows[i]; i++) {
        result.push({
            date: row.cells[0].innerText,
            details: row.cells[1].innerText,
            debt: row.cells[2].innerText,
            credit: row.cells[3].innerText,
            balance: row.cells[4].innerText
        });
    }
    return result;
})
.then(function(ret) {
    return ret.value;
})
.then(function(ledger) {
    console.log('This should not run yet?');
    res.json({
        status: 'success',
        request: { bsb: bsb, account: account, period: period },
        ledger: ledger.length
    });
})
.end();
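For what it's worth, the likely culprit is that recursive() never returns its promise chain, so the outer .then() resolves immediately and the final .execute() and .end() run and kill the session while the recursion is still in flight. Below is a minimal sketch of the promise-returning pattern, assuming the same client, selectors, and WebdriverIO promise chaining as in the snippet above (the per-page scraping step is omitted for brevity):

.then(function() {
    function recursive() {
        // Return the promise chain so callers can wait on it.
        return client
            .isExisting('#lastPage')
            .then(function(lastPage) {
                if (lastPage) {
                    return;                          // last page reached, resolve the chain
                }
                return client
                    .click('//a[text()="Next"]')     // same "Next" link XPath as above
                    .then(function() {
                        return recursive();          // keep waiting until the last page
                    });
            });
    }
    return recursive();                              // returning here makes the outer chain wait
})

Because each branch returns its promise, the subsequent .execute() only runs once the last page has been visited.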

Related

Create loop in Javascript to use with imap-simple nodejs package

Firstly, some background as to what my test script will cover.
Pressing a button on a website will fire off an email to a test mailbox.
This email can take anything between 10 and 30 minutes to arrive in the test mailbox.
So, using the following code from imap-simple:
'get new email info': function(browser) {
    imaps.connect(config).then(function (connection) {
        return connection.openBox('INBOX').then(function () {
            var searchCriteria = ['UNSEEN'];
            var fetchOptions = {
                bodies: ['HEADER', 'TEXT'],
                markSeen: false
            };
            return connection.search(searchCriteria, fetchOptions).then(function (results) {
                var subjects = results.map(function (res) {
                    return res.parts.filter(function (part) {
                        return part.which === 'HEADER';
                    })[0].body.subject[0];
                });
                console.log(subjects);
            });
        });
    });
}
This correctly comes back with a blank subjects array, as the email hasn't been received by the test mailbox yet.
Adding a 30-minute pause to the beginning of the script 'works': after the 30 minutes the subjects array is populated, as the email is (99.99% of the time) sent within a 30-minute window.
However, it is definitely far from ideal, as the email might be received within 15 minutes, meaning the test 'wastes' 15 minutes.
So what I'd ideally like to do is write some form of loop (?) that tests if the subjects array is populated or not.
So if the array is populated, carry on with the rest of the test script (which entails testing that the array contains a certain text).
If the array is not populated, wait for another minute before trying again.
Continue this trying every minute until the array is populated.
I've tried setInterval, for loops, while loops, etc., but I can't seem to get them to work, and I'm out of ideas to be honest.
Any advice, help, references would be greatly appreciated and any more info can be promptly added if required.
One way to do that could be using recursion.
const createPromise = ms => new Promise((resolve, reject) => {
    setTimeout(() => resolve(ms), ms)
});

function findUnseenEmails(connection) {
    return connection.openBox('INBOX').then(function () {
        var searchCriteria = ['UNSEEN'];
        var fetchOptions = {
            bodies: ['HEADER', 'TEXT'],
            markSeen: false
        };
        return connection.search(searchCriteria, fetchOptions).then(function (results) {
            var subjects = results.map(function (res) {
                return res.parts.filter(function (part) {
                    return part.which === 'HEADER';
                })[0].body.subject[0];
            });
            console.log(subjects);
            return subjects.length > 0
                ? subjects
                : createPromise(5000).then(function () { return findUnseenEmails(connection); });
        });
    });
}

imaps.connect(config).then(function (connection) {
    return findUnseenEmails(connection);
}).then((subjects) => console.log('finished', subjects));
Of course there is the possibility and danger of a stack overflow, but in such a scenario feel free to come back to Stack Overflow and find, with the help of our community, a non-recursive solution.
EDIT:
Answering your question regarding closing the connection:
I'd do it like this (in the findUnseenEmails function):
if (subjects.length > 0) {
    connection.end();
    return subjects;
} else {
    return createPromise(5000).then(function () { return findUnseenEmails(connection); });
}
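For completeness, the same polling can be written without recursion using async/await; a rough sketch, assuming the same imap-simple connection API used above:

const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

async function findUnseenEmails(connection) {
    await connection.openBox('INBOX');
    const fetchOptions = {
        bodies: ['HEADER', 'TEXT'],
        markSeen: false
    };
    while (true) {
        const results = await connection.search(['UNSEEN'], fetchOptions);
        const subjects = results.map(function (res) {
            return res.parts.filter(function (part) {
                return part.which === 'HEADER';
            })[0].body.subject[0];
        });
        if (subjects.length > 0) {
            connection.end();
            return subjects;
        }
        await delay(60000);   // wait a minute before polling the mailbox again
    }
}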

Quit AJAX loop or find alt to reading in list of html pages to do a compare

I need to get this small script inserted onto all of my website pages.
The script has to contain the token set for that domain.
I have 21 different domains.
Each domain has an HTML page with all the sites under that domain listed on it.
It isn't as easy as www.site.com/domain/path; not all the sites are under the same 'domain name' but are still considered under that 'domain'.
So my thought was to write a script that, upon the page loading, reads in the site pathname and compares it to each list until a match is found.
It then looks at my array of {domain, token} entries, finds the correct token for that domain, and has a function write the script with the token and insert it into the page.
The issue: AJAX will not quit the loop of searching the list of site URLs after a match has been found, so this is all a massive drain on the server.
I know there have been several Stack Overflow questions related to this very problem, and I have tried all of the solutions posted. My real question is: how can I avoid using AJAX to read in and compare the site pathnames?
Is there a better way to solve this problem?
*The key reason for using the 21 different HTML pages with the sites listed for each is that those lists are dynamically updated with new site releases and deleted sites, which happen frequently.
The script I need to be inserted is an analytics tracking code.
Latest attempt at this (pseudo-code, because I cannot post direct links/information):
let tokens = [
    {
        domain: "Example",
        url: "//example.html",
        token: "ExampleToken"
    },
    etc
];
let index = 0;

jQuery( document ).ready(function() {
    loopyloop(); //start index at 0
});

function loopyloop() {
    jQuery.ajax({
        type: "GET",
        ajaxToken: index,
        url: tokens[index].url,
        success: function(data) {
            findToken(data);
        },
        error: function() {
            console.log('fail');
        }
    });
}

function findToken(data) {
    //data is a very large string of <html> format, use replace to find a match with the domain
    var urlRegex = /(https?:\/\/[^\s]+)/g;
    data.replace(urlRegex, function(url) {
        //compare the found URLs to the pathname/URL of the page
        if (url.toLowerCase().indexOf(window.location.pathname.toLowerCase()) >= 0) {
            create_script(tokens[index].token, tokens[index].domain);
            return false;
        } else if (index + 1 < tokens.length) {
            index += 1;
            console.log(index, tokens[index].domain);
            loopyloop();
        }
    });
}

//this part has been tested and works correctly when called, no need to post this actual code
function create_script(token, domain) {
    console.log('token found: ' + domain, token);
}
I would try converting it to an async function, so that you can do a proper loop with await and break out of it when you're ready.
Something potentially resembling this:
async function loopyLoop() {
    try {
        while (true) {
            let data = await axios.get(``);
            // some logic to extract token and compare it
            if (somethingsomething) {
                continue;
            } else if (somethingelse) {
                break;
            }
        }
    } catch (e) {
        console.log('loop errored');
        console.log(e.toString());
    }
}
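Applied to the tokens array from the question, the loop might look roughly like this (a sketch only; it assumes jQuery 3+ so that jQuery.get can be awaited, and it keeps the question's URL-matching logic):

async function findTokenForPage() {
    const path = window.location.pathname.toLowerCase();
    const urlRegex = /(https?:\/\/[^\s]+)/g;
    for (let i = 0; i < tokens.length; i++) {
        try {
            const data = await jQuery.get(tokens[i].url);    // fetch this domain's site list
            const urls = data.match(urlRegex) || [];
            const matched = urls.some(function (url) {
                return url.toLowerCase().indexOf(path) >= 0;
            });
            if (matched) {
                create_script(tokens[i].token, tokens[i].domain);
                return;                                      // stop looping once a match is found
            }
        } catch (e) {
            console.log('request failed for ' + tokens[i].domain);
        }
    }
    console.log('no matching domain found');
}

jQuery(document).ready(function() {
    findTokenForPage();
});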

Nightwatch.js function not 'closing'

I'm trying to perform a function at the beginning of my test, then the rest of the test should be executed.
This is my custom-command (named internalAdviceLinksHtml):
var solr = require('solr-client')

exports.command = function() {
    this
    var client = solr.createClient('solr.dev.bauerhosting.com', 8080, 'cms', '/www.parkers.co.uk');
    var globalSettingsQuery = client.createQuery()
        .q({TypeName:'Bauer.Parkers.GlobalSettings'})
        .start(0)
        .rows(10);
    client.search(globalSettingsQuery, function(err, obj) {
        if (err) {
            console.log(err);
        } else {
            var myresult = (obj.response.docs[0].s_InternalAdviceLinksHtml);
            console.log(myresult.length);
            if (myresult.length === 0) {
                console.log('content block not configured');
            } else {
                console.log('content block configured');
            }
        }
    });
    return this;
};
Test-file (script):
module.exports = {
    'set up the solr query': function (browser) {
        browser
            .solr_query.global_settings.internalAdviceLinksHtml();
    },
    'links above footer on advice landing page displayed': function (browser) {
        browser
            .url(browser.launch_url + browser.globals.carAdvice)
            .assert.elementPresent('section.seo-internal-links')
    },
    'closing the browser': function (browser) {
        browser
            .browserEnd();
    },
};
The function works correctly (i.e. if myresult length is 0 then "content block is not configured" is displayed, etc), but the following test ("links above footer on advice landing page displayed") is never invoked.
It seems like the execution stops after the custom-command. I'm sure this will be something quite obvious to someone, but I just can't seem to see what it is.
Any help would be greatly appreciated.
Regarding your internalAdviceLinksHtml custom-command, everything looks good from my point of view (I presume that lonely this was a typo).
Your hunch is correct, it seems that the Nightwatch test-runner fails to go to the next test, which is probably due to some promise not being resolved upstream (client.search function from internalAdviceLinksHtml).
I would recommend doing a return this immediately after outputting to console (content block not configured, or content block configured) and see if that fixes the problem:
client.search(globalSettingsQuery, function(err, obj) {
    if (err) {
        console.log(err);
    } else {
        var myresult = (obj.response.docs[0].s_InternalAdviceLinksHtml);
        console.log(myresult.length);
        if (myresult.length === 0) {
            console.log('content block not configured');
        } else {
            console.log('content block configured');
        }
    }
    return this
});
Also, a few extra pointers:
- make use of the Nightwatch test hooks to make your tests easier to read/maintain and to create a separation of concerns: setup goes in the before/beforeEach hooks, and teardown (e.g. browser.end()) goes in the after/afterEach hooks;
- you need not do an explicit browser.end() at the end of your test case. Check this answer for more information on the matter.
Your test-file would become:
module.exports = {
    // > do your setup here <
    before(browser) {
        browser
            .solr_query.global_settings.internalAdviceLinksHtml();
    },
    'links above footer on advice landing page displayed': function (browser) {
        browser
            .url(browser.launch_url + browser.globals.carAdvice)
            .assert.elementPresent('section.seo-internal-links');
    },
    // > do your cleanup here <
    after(browser) {
        browser
            .browserEnd();
    },
};
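If the runner still races ahead because the Solr callback resolves outside Nightwatch's command queue, another option (a sketch, not from the original answer) is to wrap the asynchronous search in Nightwatch's built-in perform command, whose done callback makes the queue wait for the custom command to finish:

var solr = require('solr-client');

exports.command = function () {
    this.perform(function (done) {
        var client = solr.createClient('solr.dev.bauerhosting.com', 8080, 'cms', '/www.parkers.co.uk');
        var globalSettingsQuery = client.createQuery()
            .q({ TypeName: 'Bauer.Parkers.GlobalSettings' })
            .start(0)
            .rows(10);
        client.search(globalSettingsQuery, function (err, obj) {
            if (err) {
                console.log(err);
            } else {
                var myresult = obj.response.docs[0].s_InternalAdviceLinksHtml;
                console.log(myresult.length === 0 ? 'content block not configured' : 'content block configured');
            }
            done();   // signal Nightwatch that the command has finished
        });
    });
    return this;
};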

How to get and append most recent messages from server using jQuery and AJAX?

I'm working on my first simple chat application and this issue has me stuck. I know what I'm trying to do, but I end up overthinking it.
Basically, I have this heroku server going:
http://tiy-fee-rest.herokuapp.com/collections/blabberTalk
Whenever someone sends a message, it is added to this array.
My Issue:
I have it on a set interval so that every 2 seconds, it runs the getNewestMessages function. When this setInterval is working and someone sends a message, it will keep appending the last message they sent every 2 seconds. If I disable the setInterval and simply call the getNewestMessages function myself in a separate browser tab, this doesn't seem to happen. I want to make it so that the most recently sent message isn't constantly re-appended to the DOM when the setInterval is active.
This is the function I'm using to check for recent messages. It's pretty bloated, sorry about that:
getNewestMessages: function() {
    $.ajax({
        url: 'http://tiy-fee-rest.herokuapp.com/collections/blabberTalk',
        method: 'GET',
        success: function (data) {
            // Finds Id of most recent message displayed in the DOM
            var recentId = $('.message').last().data('id');
            var prevMostRecent = 0;
            var newMostRecent = [];
            jQuery.each(data, function(idx, el) {
                if (el._id === recentId) {
                    // if one of the messages on the server has an Id equal to
                    // one of the messages in the DOM, it saves its index in a var
                    prevMostRecent = idx;
                }
            });
            jQuery.each(data, function(idx, el) {
                if (idx < prevMostRecent) {
                    // if there are messages on the server with a lower index than
                    // the most recent message in the DOM, it pushes them to a new
                    // array. Basically, creates a new array of any messages newer
                    // than the last one displayed in the DOM.
                    newMostRecent.push(el);
                }
            });
            for (var i = 0; i < newMostRecent.length; i++) {
                console.log(newMostRecent[i]);
                if (newMostRecent[i]._id === $('.message').last().data('id')) {
                    // My attempt at trying to remove the last DOM message from
                    // the array of newer messages. My main issue was that this
                    // whole function would keep appending the most recent message
                    // over and over again.
                    var result = _.without(newMostRecent, newMostRecent[i]);
                    console.log('MESSAGE TO BE EXCLUDED: ', newMostRecent[i]);
                    // If the array of newer messages contained the most recent
                    // DOM message, it removes it and sends it to be appended.
                    page.appendNewestMessages(result);
                }
            }
            // If the array of newer messages DOESN'T contain the most recent
            // DOM message, it just sends the whole array normally.
            page.appendNewestMessages(newMostRecent);
        },
        error: function (err) {
        }
    });
}
Here is the append function:
appendNewestMessages: function(messagesToAppend) {
console.log(messagesToAppend.reverse());
_.each(messagesToAppend.reverse(), function(el, idx, arr) {
var newMessage = {
content: el.content,
timestamp: el.timestamp,
author: el.author,
userIcon: el.userIcon
}
$.ajax({
url: page.url,
method: 'POST',
data: newMessage,
success: function (data) {
page.addOneMessageToDOM(data);
},
error: function (err) {
console.log("error ", err);
}
});
})
}
Can anyone help me understand how to get the most recent messages from a server and append them to the DOM without any repeats? This has been driving me nuts.
Thanks for any and all help.
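One way to rule out repeats (a sketch, assuming each message returned by the server carries a unique _id, and reusing the question's page.addOneMessageToDOM helper) is to remember which ids have already been rendered and append only the messages the DOM hasn't seen, without POSTing anything back to the server:

var renderedIds = {};   // ids of messages already appended to the DOM

function getNewestMessages() {
    $.ajax({
        url: 'http://tiy-fee-rest.herokuapp.com/collections/blabberTalk',
        method: 'GET',
        success: function (data) {
            data.forEach(function (message) {
                if (!renderedIds[message._id]) {
                    renderedIds[message._id] = true;
                    page.addOneMessageToDOM(message);   // append only; no POST back to the server
                }
            });
        },
        error: function (err) {
            console.log('error ', err);
        }
    });
}

setInterval(getNewestMessages, 2000);

Because re-posting is what duplicates the data on the server, keeping the poller read-only and deduplicating by id avoids the repeated appends.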

Using phantom.js to scrape data

Following on from this question, I am trying to scrape data using phantomjs, modifying a script from here:
My goal is to integrate a working function (see the 2nd code snippet) into the script below in the 1st code snippet. I have tried doing this but keep getting errors. Is there a way I can actually do the integration?
(Note: I'm using phantomjs because the site is an Angular app whose initial HTML doesn't contain any of the data I am looking for, so I need a headless web browser. I need to load the page in memory, wait for Angular to do its thing (a set delay of some sort), and then scrape the rendered DOM.)
The errors (and output) I get when I execute my script (phantomjs scraping.js) are as follows:
console> SPR-ERROR: 103 - Invalid published date console> v6
ReferenceError: Can't find variable: angular
http://stage.inc.com/js/Inc5000ListApp.js?UPDATE1:2
http://www.inc.com/inc5000/index.html:2485
console> SPR-ERROR:103 - Invalid published date (date)
====================================================
Step "0"
====================================================
console>Reached scrapeData
console>
It seems like it is connecting to the desired site. How do I modify the script below to fit the extraction code at the bottom of this question?
var page = new WebPage(),
    url = 'http://www.inc.com/inc5000/index.html',
    stepIndex = 0;

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript console message. The callback may accept up to three arguments:
 * the string for the message, the line number, and the source identifier.
 */
page.onConsoleMessage = function (msg, line, source) {
    console.log('console> ' + msg);
};

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript alert. The only argument passed to the callback is the string for the message.
 */
page.onAlert = function (msg) {
    console.log('alert!!> ' + msg);
};

// Callback is executed each time a page is loaded...
page.open(url, function (status) {
    if (status === 'success') {
        // State is initially empty. State is persisted between page loads and can be used for identifying which page we're on.
        console.log('============================================');
        console.log('Step "' + stepIndex + '"');
        console.log('============================================');
        // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the same folder as this file)
        page.injectJs('jquery-1.6.1.min.js');
        // Our "event loop"
        if (!phantom.state) {
            //initialize();
            scrapeData();
        } else {
            phantom.state();
        }
        // Save screenshot for debugging purposes
        page.render("step" + stepIndex++ + ".png");
    }
});

function scrapeData() {
    page.evaluate(function() {
        console.log('Reached scrapeData');
        var DATA = [];
        $('tr.ng-scope').each(function() {
            var $tds = $(this).find('td');
            DATA.push({
                rank: $tds.eq(0).text(),
                company: $tds.eq(1).text(),
                growth: $tds.eq(2).text(),
                revenue: $tds.eq(3).text(),
                industry: $tds.eq(4).text()
            });
        });
        console.log(DATA);
    });
    phantom.state = parseResults;
    // scraping code here
}

// Step 1
function initialize() {
    page.evaluate(function() {
        console.log('Searching...');
    });
    // Phantom state doesn't change between page reloads
    // We use the state to store the search result handler, ie. the next step
    phantom.state = parseResults;
}

// Step 2
function parseResults() {
    page.evaluate(function() {
        $('#search-result a').each(function(index, link) {
            console.log($(link).attr('href'));
        })
        console.log('Parsed results');
    });
    // If there was a 3rd step we could point to another function
    // but we would have to reload the page for the callback to be called again
    phantom.exit();
}
I know the code below works in the console, but how can I integrate it with the script above to successfully scrape data from multiple pages on the site?
request('http://www.inc.com/inc5000/index.html', function (error, response, html) {
    if (error || response.statusCode != 200) return;
    var $ = cheerio.load(html);
    var DATA = [];
    $('tr.ng-scope').each(function() {
        var $tds = $(this).find('td');
        DATA.push({
            rank: $tds.eq(0).text(),
            company: $tds.eq(1).text(),
            growth: $tds.eq(2).text(),
            revenue: $tds.eq(3).text(),
            industry: $tds.eq(4).text()
        });
    });
    console.log(DATA);
});
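A rough way to do the integration (a sketch, assuming a fixed delay is enough for the Angular app to render the table) is to run the same jQuery extraction inside page.evaluate, return the array back to the PhantomJS context, and only then exit. This would replace the scrapeData function in the first snippet:

function scrapeData() {
    // Give the Angular app a few seconds to render the table before reading the DOM.
    setTimeout(function () {
        var data = page.evaluate(function () {
            var DATA = [];
            $('tr.ng-scope').each(function () {
                var $tds = $(this).find('td');
                DATA.push({
                    rank: $tds.eq(0).text(),
                    company: $tds.eq(1).text(),
                    growth: $tds.eq(2).text(),
                    revenue: $tds.eq(3).text(),
                    industry: $tds.eq(4).text()
                });
            });
            return DATA;   // values returned from evaluate must be JSON-serializable
        });
        console.log(JSON.stringify(data, null, 2));
        phantom.exit();
    }, 5000);
}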
