hope you're having an awesome day.
I'm running a CasperJS scrape across around 100,000 links over the course of a few days (continuously).
After every 500 links or so, CasperJS crashes at a random point. When it is restarted from the last link, however, it continues for roughly another 500.
I was wondering if someone knows of an effective way to refresh, or close and re-instantiate, CasperJS to avoid this burnout. I was thinking of an exit() paired with a wait, but I'm very keen to hear other thoughts!
The script is similar to:
// Shared CasperJS instance used by every step of the scrape.
var casper = require('casper').create({
    verbose: true,
    logLevel: 'error',
    pageSettings: {
        loadImages: false,
        loadPlugins: true,
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11'
    },
    clientScripts: ['vendor/jquery.min.js', 'vendor/lodash.js'],
    viewportSize: {
        width: 1600,
        height: 1000
    }
});

// URLs to visit. The placeholder is a block comment so that it cannot
// swallow the closing bracket the way a trailing `//` comment would.
var linkArray = [ /* Includes 100,000 + links */ ];
/**
 * Queue a CasperJS step that opens `url` and reads the page title and the
 * text of its <body>.
 *
 * @param {string} url - address of the page to open.
 */
function inspectUrl(url) {
    casper.thenOpen(url, function () {
        // Declared with `var` so the values stay local to this step instead
        // of leaking out as implicit globals.
        // NOTE(review): CasperJS documents getTitle(); confirm that
        // getPageTitle() exists in the version in use.
        var title = this.getPageTitle();
        var bodyText = this.fetchText('body');
        // Includes a bunch of other tasks to do.
    });
}
// Entry point: open the landing page, queue one inspection step per link,
// then start the CasperJS run.
casper.start('https://www.google.com.au', function announceBoot() {
    console.log('Booting up CasperJS...');
});

casper.then(function queueAllLinks() {
    var index = 0;
    while (index < linkArray.length) {
        inspectUrl(linkArray[index]);
        index += 1;
    }
});

casper.run();
There is a known PhantomJS memory problem. You should develop a "runner" that runs your CasperJS script with some 400 links, collects the results, then runs another instance of the script with the next portion of links, and so on.
Maybe you can make some CasperJS instances run in parallel, if you need speed.
You can develop such a runner with PhantomJS, using the spawn function.
The function is described briefly in the PhantomJS docs: http://phantomjs.org/api/child_process/
UPDATE:
You can find below a working example of such a runner. The example is very simple, just to demonstrate how one could spawn CasperJS instances and collect their results. In particular, there is no error handling in the example at all. The example has been tested with PhantomJS 2.1.1.
The runner uses Q promises, so first you have to make file package.json with the following content:
{
"dependencies": {
"q": "1.4.1"
}
}
and run installer:
npm install
Then you have to create runner.js:
var Q = require('q');
var childProcess = require('child_process');
// URLs handed to run(); each one is passed to runTask() in turn.
var parserTasks = [
    'http://phantomjs.org/',
    'http://casperjs.org/',
    'https://jquery.com/'
];

// Print the collected results, then terminate PhantomJS.
function reportResults(result) {
    console.log('Tasks result: ' + JSON.stringify(result));
    phantom.exit();
}

run(parserTasks).then(reportResults);
/**
 * Run every task sequentially through runTask(), starting with the LAST
 * element of `tasks` and working towards the first.
 *
 * The input array is only read, never modified, so the caller can reuse it
 * afterwards (the previous implementation emptied it with pop()).
 *
 * @param {Array} tasks - values passed one at a time to runTask().
 * @returns {Promise<Array>} promise for the task results, in execution order
 *   (result of the last task first).
 */
function run(tasks) {
    // Process the task at `index`, then recurse towards index 0.
    function runFrom(index) {
        if (index < 0) {
            return Q([]);
        }
        return runTask(tasks[index]).then(function (result) {
            console.log('result: ' + result);
            return runFrom(index - 1).then(function (results) {
                return [result].concat(results);
            });
        });
    }

    return runFrom(tasks.length - 1);
}
/**
 * Spawn `casperjs parser.js <task>` and collect everything the child writes
 * to stdout.
 *
 * @param {string} task - argument forwarded to parser.js.
 * @returns {Promise<string>} promise resolved with the accumulated stdout
 *   text once the child emits "exit".
 */
function runTask(task) {
    var launch = childProcess.spawn;
    var deferred = Q.defer();
    var output = '';

    var worker = launch('casperjs', ['parser.js', task]);
    console.log("spawn run: " + task);

    function appendChunk(chunk) {
        output += chunk;
    }

    function finish() {
        deferred.resolve(output);
    }

    worker.stdout.on("data", appendChunk);
    worker.on("exit", finish);

    return deferred.promise;
}
and parser.js
// parser.js: open the URL given as the first CLI argument and echo the
// page title, then exit.
var casper = require('casper').create();
var targetUrl = casper.cli.args[0];
var pageTitle;

casper.start();

casper.thenOpen(targetUrl, function readTitle() {
    pageTitle = this.getTitle();
});

casper.run(function finish() {
    this.echo(pageTitle).exit();
});
You can execute the runner as follows, assuming that the phantomjs executable is somewhere on your PATH.
phantomjs runner.js
The output should be the following:
spawn run: https://jquery.com/
result: jQuery
spawn run: http://casperjs.org/
result: CasperJS, a navigation scripting and testing utility for PhantomJS and SlimerJS
spawn run: http://phantomjs.org/
result: PhantomJS | PhantomJS
Tasks result: ["jQuery\n","CasperJS, a navigation scripting and testing utility for PhantomJS and SlimerJS\n","PhantomJS | PhantomJS\n"]
Related
Hello guys I'm kinda new to js and protractor and I just found out it can't create and modify files, so the question I want to ask is:
Is it possible to manually write test cases logic fails to a text file for example:
I know the code is not correct, but you will get the idea. I know about jasmine-reporters and its XML file output, but it just prints console errors; I want a custom output like the one below:
// Spec that searches on Google and, when the resulting page title is not the
// expected one, writes the current URL to a text file.
describe('File output test', function() {
    // Protractor specs run under Node.js, so the built-in fs module is
    // available for writing files (there is no `File` class to instantiate).
    var fs = require('fs');
    // Forward slashes avoid accidental escapes such as "\t" in "\test.txt".
    var txtFile = 'C:/Users/y/Desktop/test.txt';

    it('should have a title', function() {
        browser.ignoreSynchronization = true;
        browser.get('https://www.google.com');
    });

    it('Tests output file', function() {
        var searchText = $('#lst-ib');
        searchText.sendKeys('Testt');
        searchText.sendKeys(protractor.Key.ENTER);
        browser.sleep(3000);
        // getTitle() and getCurrentUrl() return promises; compare and
        // concatenate the resolved values, not the promise objects.
        browser.getTitle().then(function (title) {
            if (title === 'Test') {
                return;
            }
            return browser.getCurrentUrl().then(function (url) {
                // Overwrites the file, matching the original "w" open mode.
                fs.writeFileSync(txtFile, "Error at " + url + "\n");
            });
        });
    });
});
conf file pretty basic:
// Protractor configuration: Jasmine framework, a Selenium server address on
// localhost, and a single spec file.
var config = {
    framework: 'jasmine',
    seleniumAddress: 'http://localhost:4444/wd/hub',
    specs: ['spec.js']
};

exports.config = config;
So, for now, I simply want to check the page title, and if it is different from the expected one, I want to save the URL in an output file so that, when the test ends, I can check exactly where something I didn't want happened. I hope I am not talking nonsense.
Protractor runs in Node.js environment. So everything, that Node.js has, is available to you. Such as "fs" module. So you can manually save file every time, or (as a better option), write custom Jasmine reporter. Your reporter would expose some variable or function in global namespace to register custom errors and write them into a file after test execution.
Nevermind i found the answer to my question here is my sample code i used to test it
var fs = require('fs-extra');
// Log file that receives one line per failed login attempt.
var file = 'C:/Users/y/Desktop/test/New folder/output.txt';
// Sequence number prefixed to each logged line.
var counter = 1;

// Spec that attempts a login and appends a line to `file` when the page
// title afterwards is not 'Facebook'.
describe('File output test', function() {
    it('should have a title', function() {
        browser.ignoreSynchronization = true;
        browser.get('https://www.facebook.com');
    });

    it('Tests output file', function() {
        // Declared locally so they do not become implicit globals.
        var email = 'dame#hotmail.com';
        var pass = 'test123';
        var enterMail = $('#email');
        enterMail.sendKeys(email);
        var enterPass = $('#pass');
        enterPass.sendKeys(pass);
        enterPass.sendKeys(protractor.Key.ENTER);
        browser.sleep(3000);
        // getTitle() returns a promise, so the comparison has to happen on
        // the resolved title; comparing the promise itself is always unequal.
        browser.getTitle().then(function (title) {
            if (title === 'Facebook') {
                return;
            }
            fs.appendFile(file, counter + '. ' + 'Error at login using: (' + email + ') as email and (' + pass + ') as password.' + "\n", function (err) {
                console.log(err); // => null
            });
            counter += 1;
        });
    });
});
I found a module, fs-extra, which allows creating and editing files, and I managed to create and write my manual output to a file. Here is the link to fs-extra, in case someone needs it: https://github.com/jprichardson/node-fs-extra#mkdirsdir-callback. Cheers!
I'm new to nodejs and jquery, and I'm trying to update one single html object using a script.
I am using a Raspberry pi 2 and a ultrasonic sensor, to measure distance. I want to measure continuous, and update the html document at the same time with the real time values.
When I try to run my code, it behaves like a server and not a client. Everything that I console.log() is printed in the terminal and not in the browser's console. I currently run my code with "sudo node surveyor.js", but nothing happens in the HTML document. I have linked it properly to the script. I have also tried document.getElementsByTagName("h6").innerHTML = distance.toFixed(2), but the error is "document is not defined".
Is there any easy way to fix this?
My code this far is:
var statistics = require('math-statistics');
var usonic = require('r-pi-usonic');
var fs = require("fs");
var path = require("path");
var jsdom = require("jsdom");
var htmlSource = fs.readFileSync("../index.html", "utf8");
/**
 * Initialise the ultrasonic library and start an endless sampling loop.
 *
 * Readings are collected every `config.delay` ms; each time `config.rate`
 * readings have accumulated they are handed to print() and a new batch
 * starts. If initialisation reports an error, only 'error' is logged.
 *
 * @param {Object} config - echoPin, triggerPin, timeout, delay and rate.
 */
var init = function(config) {
    usonic.init(function (error) {
        if (error) {
            console.log('error');
            return;
        }

        var sensor = usonic.createSensor(config.echoPin, config.triggerPin, config.timeout);
        //console.log(config);
        var samples = null;

        function scheduleNext() {
            var batchComplete = samples !== null && samples.length === config.rate;
            if (batchComplete) {
                print(samples);
            }
            // Start a fresh batch on the first pass and after each print.
            if (samples === null || batchComplete) {
                samples = [];
            }
            setTimeout(function takeReading() {
                samples.push(sensor());
                scheduleNext();
            }, config.delay);
        }

        scheduleNext();
    });
};
/**
 * Report the median of a batch of readings on stdout and, for a valid
 * reading, substitute it for the <h6> element of a freshly parsed copy of
 * htmlSource, logging the resulting document source.
 *
 * A negative median is reported as a measurement timeout.
 *
 * @param {number[]} distances - batch of raw sensor readings.
 */
var print = function(distances) {
    var distance = statistics.median(distances);

    // clearLine()/cursorTo() are only present when stdout is a terminal;
    // calling them while output is piped or redirected would throw.
    if (process.stdout.isTTY) {
        process.stdout.clearLine();
        process.stdout.cursorTo(0);
    }

    if (distance < 0) {
        process.stdout.write('Error: Measurement timeout.\n');
        return;
    }

    var formatted = distance.toFixed(2);
    process.stdout.write('Distance: ' + formatted + ' cm');
    call_jsdom(htmlSource, function (window) {
        var $ = window.$;
        $("h6").replaceWith(formatted);
        console.log(documentToSource(window.document));
    });
};
/**
 * Serialise a document as its doctype string followed by its innerHTML.
 *
 * @param {Object} doc - document exposing `doctype` and `innerHTML`.
 * @returns {string} doctype text (empty when the document has no doctype)
 *   concatenated with `doc.innerHTML`.
 */
function documentToSource(doc) {
    // The non-standard window.document.outerHTML also exists,
    // but currently does not preserve source code structure as well
    // The following two operations are non-standard
    // A document without a doctype has `doctype === null`; treat it as empty
    // instead of throwing on `.toString()`.
    var doctypeText = doc.doctype ? doc.doctype.toString() : '';
    return doctypeText + doc.innerHTML;
}
/**
 * Parse `source` with jsdom.env, requesting the jQuery script, and hand the
 * resulting window to `callback` on the next tick.
 *
 * @param {string} source - markup passed to jsdom.env.
 * @param {Function} callback - invoked with the jsdom window.
 * @throws {Error} from the deferred tick when jsdom reports errors.
 */
function call_jsdom(source, callback) {
    var scripts = [ 'jquery-1.7.1.min.js' ];

    function onEnvReady(errors, window) {
        process.nextTick(function deliverWindow() {
            if (errors) {
                throw new Error("There were errors: "+errors);
            }
            callback(window);
        });
    }

    jsdom.env(source, scripts, onEnvReady);
}
// Settings for the measurement loop, passed to init() below.
var sensorConfig = {
    echoPin: 15,    // Echo pin
    triggerPin: 14, // Trigger pin
    timeout: 1000,  // Measurement timeout in µs
    delay: 60,      // Measurement delay in ms
    rate: 5         // Measurements per sample
};

init(sensorConfig);
Node.js is a server-side implementation of JavaScript. It's ok to do all the sensors operations and calculations on server-side, but you need some mechanism to provide the results to your clients. If they are going to use your application by using a web browser, you must run a HTTP server, like Express.js, and create a route (something like http://localhost/surveyor or just http://localhost/) that calls a method you have implemented on server-side and do something with the result. One possible way to return this resulting data to the clients is by rendering an HTML page that shows them. For that you should use a Template Engine.
Any DOM manipulation should be done on client-side (you could, for example, include a <script> tag inside your template HTML just to try and understand how it works, but it is not recommended to do this in production environments).
Try searching google for Node.js examples and tutorials and you will get it :)
I've got a Node.js CLI I've been building using the Commander Library. Here's the code from the main execution file.
#!/usr/bin/env node
var program = require('commander');
var scmStash = require('./lib/hdqc/scmStash');
// Empty object; nothing in the code shown here reads or fills it.
var command = {};
// Declare the CLI: a version string and two flags (-P/--Projects and
// -R/--Repositories), then parse the process arguments. After parsing, the
// code below checks `program.Projects`.
program
.version('0.0.1')
.option('-P, --Projects', 'List Projects')
.option('-R, --Repositories', 'List All Repositories on Server.')
.parse(process.argv);
/**
 * Request the project listing from scmStash and print one line per project:
 * its index, name and link URL.
 *
 * The callback reads `data.size` as the number of entries and `data.values`
 * as the list of projects.
 */
function listProjects() {
    scmStash.getProjectListing(function printProjects(data) {
        var index = 0;
        while (index < data.size) {
            var project = data.values[index];
            console.log(' ' + index + ' ' + project.name + ' # ' + project.link.url);
            index += 1;
        }
    });
}
// Run the project listing only when the Projects flag was parsed as truthy.
(function dispatch(options) {
    if (!options.Projects) {
        return;
    }
    console.log(' - Projects');
    listProjects();
}(program));
I've been building this in WebStorm, and when I call the command using the node.js running everything works perfectly. for example, if I run the WebStorm runner executing the ./strack -P command to output the project, the output looks like this...
node strack -P
- Projects
0 Business Insights # /projects/BI
1 Platform # /projects/HDP
2 H # /projects/H
3 QC Application Code # /projects/QCCODE
4 QC Design # /projects/QCDESIGN
5 QC Reports # /projects/QCREP
6 Sandbox # /projects/SAN
7 Systemic Automation Tools # /projects/SAT
8 The Swamp # /projects/SWAMP
However when I run the same 'strack' command from the standard bash (inside WebStorm or outside of WebStorm in iTerm or such) then the following output is displayed.
23:11 $ node strack -P
- Projects
As I wrote up this question, I realized the dilemma, as so often happens when typing up a Stack Overflow question. The other call, the one that prints out the projects themselves, is asynchronous: the app fires off that call, executes the remaining lines of code and finishes before the projects are even returned and can be printed to the console. I'm not sure what WebStorm is doing to keep the console attached to the running process, but I'd love to have that work for my CLI. Any ideas, thoughts, or suggestions on how I should redesign this application to actually print out the projects to the command line?
All of the code is available on the github repo here.
I think the issue is with your array boundaries in your for loop. data.size is probably something you're remembering from one of the 8 other languages you know, lol. BUT arrays don't have it in JS; you're looking for data.length. Try this: it's trimmed down and has a mock for the scmStash object, but I think you'll see what I mean:
var command, listProjects, program, scmStash;
program = require('commander');
// In-memory stand-in for the real scmStash module: each method passes a
// freshly built, hard-coded list to the callback and returns whatever the
// callback returns.
scmStash = {
    getRepositories: function(cb) {
        var repositories = [
            { name: 'a', cloneUrl: 'b' },
            { name: 'c', cloneUrl: 'd' }
        ];
        return cb(repositories);
    },
    getProjectListing: function(cb) {
        var projects = [
            { name: "proj1", link: { url: "http://blah" } },
            { name: "proj2", link: { url: "http://bluh" } }
        ];
        return cb(projects);
    }
};
command = {};

/**
 * Print one line per project (index, name, link URL) from the listing that
 * scmStash.getProjectListing passes to the callback.
 *
 * @returns whatever getProjectListing returns; the callback itself returns
 *   an array holding the return value of each console.log call.
 */
listProjects = function() {
    return scmStash.getProjectListing(function(projects) {
        var logged = [];
        var count = projects.length;
        for (var index = 0; index < count; index += 1) {
            var entry = projects[index];
            var line = ' ' + index + ' ' + entry.name + ' # ' + entry.link.url;
            logged.push(console.log(line));
        }
        return logged;
    });
};
Output:
$ node .temp/adron.js -P
- Projects
0 proj1 # http://blah
1 proj2 # http://bluh
Also, there's an optimization to be had by storing the length of the array in a variable before iterating it but it's very minor. Worry about that when you have a million elements in an array.
There is an ES7 recommendation for async/await syntax.
In the meanwhile, there are plenty of flow control packages available. You might consider the 'async' NPM package.
I'm using Selenium's node.js API to run PhantomJS instances against a series of web pages. The code I use to execute the actions on the pages work fine, but it seems only one instance of Selenium/PhantomJS can run at a time. This function is called multiple times from the same module and steps through pages in a webshop where the pagination is handled client side (which is why I need the Selenium/PhantomJS environment - to extract data from each page).
Once again, the code in and of itself works fine, but it can't execute in parallel. What could be causing this?
/**
 * Crawl one paginated page with a dedicated PhantomJS WebDriver instance.
 *
 * Loads `page.url`, clicks away the "#overlay-the-new" overlay when it is
 * displayed, collects the page source of the pagination pages, passes each
 * collected source to `parsePage(page.url, source)`, quits the driver and
 * finally calls `done()`.
 *
 * @param crawler   - not referenced anywhere in this function.
 * @param page      - object whose `url` property is the address to load.
 * @param parsePage - called once per collected page source.
 * @param done      - called with no arguments after the driver has quit.
 */
module.exports = function (crawler, page, parsePage, done) {
"use strict";
var _ = require("lodash"),
format = require("util").format,
path = require("path"),
webdriver = require("selenium-webdriver"),
By = webdriver.By,
until = webdriver.until;
// Path to the phantomjs launcher under node_modules/.bin; ".cmd" is appended
// on Windows when the capability is set below.
var phantomPath = path.resolve(__dirname, "../node_modules/.bin/phantomjs"),
isWin = process.platform === "win32";
// A new driver is built on every invocation of this function.
var driver = new webdriver.Builder()
.withCapabilities({
"phantomjs.binary.path": isWin ? phantomPath + ".cmd" : phantomPath
})
.forBrowser("phantomjs")
.build();
var windowHandle = new webdriver.WebDriver.Window(driver);
windowHandle.setSize(1100, 1000);
// Walks the pagination recursively and resolves with the array of collected
// page sources.
var getAllPagesContent = function (driver) {
var pagesContent = [],
pageNo = 1;
var getNextPage = function () {
var nextPageLink;
// The last ".pagination li" element is treated as the link to the next page.
return driver.findElements(By.css(".pagination li")).then(function (elements) {
return elements[elements.length - 1];
}).then(function (element) {
nextPageLink = element;
return element.getAttribute("class");
}).then(function (className) {
// An "active" class on that last item is taken to mean the last page.
return _.includes(className, "active");
}).then(function (isLastPage) {
// `false` stands in for "no content" when the last page has been reached.
return (!isLastPage) ? driver.getPageSource() : false;
}).then(function (content) {
if (content)
pagesContent.push(content);
content && console.log("Got page %d", pageNo++);
// NOTE(review): the link is clicked and its staleness awaited even when
// `content` is false (last page) - confirm this is intended.
return nextPageLink.findElement(By.css("a")).then(function (element) {
return element.click();
}).then(function () {
// Wait up to 10 seconds for the clicked pagination item to go stale.
return driver.wait(until.stalenessOf(nextPageLink), 10 * 1000);
}).then(function () {
// Recurse while content was found; otherwise resolve with everything collected.
return content ? getNextPage() : pagesContent;
});
});
};
return getNextPage();
};
// Watchdog: after 60 seconds log a message, quit the driver and call done().
// NOTE(review): the promise chain below is not cancelled when this fires, and
// no rejection handler is attached to that chain in the code shown - confirm
// that done() cannot run twice and that failures are reported.
var processTimeout = setTimeout(function () {
console.log("PhantomJS for page %s took too long to execute", page.url);
driver.quit().then(done);
}, 60 * 1000);
driver.get(page.url).then(function () {
var pageOverlay = driver.findElement(By.css("#overlay-the-new"));
return pageOverlay.isDisplayed().then(function (visible) {
if (visible) {
// The click's own promise is not returned; only the wait below is chained.
pageOverlay.click();
return driver.wait(until.elementIsNotVisible(pageOverlay), 10000);
}
}).then(function () {
return getAllPagesContent(driver);
});
}).then(function (contents) {
// Finished in time: cancel the watchdog before parsing the results.
clearTimeout(processTimeout);
console.log("Got %d pages for %s", contents.length, page.url);
_.forEach(contents, function (pageContent) {
parsePage(page.url, pageContent);
});
return driver.quit();
}).then(function () {
done();
});
}
Although PhantomJS is now deprecated you can still run it in parallel isolated Docker containers by using Selenoid. There is a ready to use image with latest release here: https://hub.docker.com/r/selenoid/phantomjs/tags/
Parallel execution with Selenium tends to be done using Remote WebDrivers and the Selenium Grid2 Framework.
This tutorial at WeDoQA seems to be the sort of thing you want. At a brief glance it has each test in a separate class, while a central test base class points towards Grid2's hub, which then (in the tutorial) executes the tests in parallel using a Firefox driver. You could easily retool this to use phantomjs, but you might have to rework your test structure.
It seems you're only using one driver. I'd initialize a second driver, then use threading to run in parallel. I think this could get the job done.
Use Thread for running in parallel or you can use any test framework which can take care of running the tests in parallel.
I am new to casperJS. I have installed casperJS 1.0.4 and phantomJS 1.8.2 on windows 8.
My objective is to scrape some data from the web. I want to open this webpage and fetch the list of towns in Vermont. I replicated the code given by Victor W Yee. When I run the code, it opens the desired page (I take a snapshot of it as verification), but when I try to fetch data from the table I get an error on this line:
var town_names_info = this.getElementsInfo(town_selector);
Error says:
TypeError: 'undefined' is not a function(evaluating'this.getElementsInfo(town_selector)')
F:/Trial Codes/intro to casper_JS/Vermont/vermont.js:21
F:/Trial Codes/intro to casper_JS/Vermont:1335 in runStep
F:/Trial Codes/intro to casper_JS/Vermont:332 in checkStep
Any suggestions ??
My whole code is:
var utils = require('utils');
// Scrape script: open the Wikipedia list of Vermont towns, capture a
// screenshot, then dump the text of every element matching town_selector.
var casper = require('casper').create({
    verbose: false,
    logLevel: 'debug'
});
var url = 'http://en.wikipedia.org/wiki/List_of_towns_in_Vermont';
var town_selector;

casper.start(url, function captureLandingPage() {
    this.capture("result1.png");
    this.echo("* "+this.getTitle()+" *");
});

casper.then(function collectTownNames() {
    // CSS selector for the cells to read.
    // NOTE(review): it matches on the table id; confirm that the page really
    // uses "sortable wikitable" as an id.
    town_selector = 'table[id="sortable wikitable"] tbody tr td:nth-of-type(2)';

    // One info object per matching element.
    var town_names_info = this.getElementsInfo(town_selector);

    // Keep only the text of each element.
    var town_names = town_names_info.map(function (info) {
        return info.text;
    });

    // Dump the town_names array to screen
    utils.dump(town_names);
});

casper.run(function finish() {
    this.exit();
});
getElementsInfo() was added in CasperJS version 1.1 (note the green note in the page). You can use 1.1.0-beta3 because this "beta" version is actually stable. While you're at it updating, you should use a more up-to-date version of PhantomJS such as 1.9.7 or 1.9.8 (has some problems with CasperJS).