I coded this, but it's still non-sequential. I hoped with the functions it would wait until the actual request finishes until a new one is called .... but that doesn't work.
Problem 1: The page.open() calls are not sequential as you can see here:
6 protocol: https: type: Content
7 protocol: https: type: Content
8 protocol: https: type: Content
9 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/
10 protocol: https: type: Content
11 protocol: https: type: Content
12 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/anrichte/
LINE: https://www.roller.de/einrichten/arbeitsstuhl/
LINE: https://www.roller.de/einrichten/arbeitstisch/
LINE: https://www.roller.de/einrichten/armlehnstuehle/
LINE: https://www.roller.de/einrichten/badezimmermoebel
LINE: https://www.roller.de/einrichten/bistrostuehle/
LINE: https://www.roller.de/einrichten/buecherregal/
13 protocol: https: type: Content
14 protocol: https: type: Content
15 protocol: https: type: Content
16 protocol: https: type: Content
LINE: https://www.roller.de/einrichten/buerocontainer/
LINE: https://www.roller.de/einrichten/bueroregale/
17 protocol: https: type: Content
18 protocol: https: type: Content
The LINE: should only be printed once per request, but it appears several time without the page.open result, resulting in an early stream.atEnd() = true. That should be impossible if it's sequential.
Problem 2: The last line is not taken, when I have a .txt file with 100 links (1 per line), 99 are printed, one is not
Problem 3: it crashes when I give it a list with 1000 urls
Problem 4: 10 links = 10 prints, 100 links = 98 prints and stream.atEnd() does appear several times, 500 links = 497-498 prints + stream.atEnd() problem, 1000 links = Crash
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('100sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
function nextPage() {
if (stream.atEnd()) {
//stream.close();
console.log("STREAM END: " + stream.atEnd());
console.log("FILE ENDS HERE");
//phantom.exit();
}
if (!stream.atEnd()) {
var line = stream.readLine();
console.log("LINE: " + line);
getRequest(line);
}
}
function getRequest(line2) {
//console.log(line);
var page = webPage.create();
page.settings.loadImages = false;
page.open(line2, function() {});
//console.log("page.open() " + line2);
//console.log("opened " + line2);
page.onResourceRequested = function(requestData, request) {
//console.log("BEFORE: " + requestData.url);
var match = requestData.url.match(/example.com\/ca/g)
//console.log("Match: " + match);
//console.log(request.url);
if (match != null) {
hasFound = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t['groups'] + " " + t['campID']);
i++;
//console.log(targetJSON);
request.abort;
}
};
page.onLoadFinished = function(status) {
if (!hasFound) {
console.log(i + " :NOT FOUND: " + line2);
i++;
}
//request.abort();
page.close();
nextPage();
}
}
nextPage();
Now it works with this code, iFrames seems to trigger onLoadFinished() twice, so I check that with hasOnLoadFinished to prevent the multiple entries in the function (using multiple page.open() at once is a really bad idea in PhantomJS).
Be aware that 2.0 will crash with too many links / too many urls (in my case 120-180) for unknown reasons (most times no error message, rare times a "QThread::start: Failed to create thread ()".
To prevent that, use the 1.9.8 version instead of 2.0, seems to be a bug there, filled a crash report with dumps on Github.
/edit crashed without an error message after 3836 links with 1.9.8 links ............... PHANTOM.
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('linklist.de.txt', 'r');
var webPage = require('webpage');
var i = 1;
var hasFound = Array();
var hasonLoadFinished = Array();
function handle_page(link) {
var page = webPage.create();
page.settings.loadImages = false;
page.open(link, function() {});
page.onResourceRequested = function(requestData, request) {
var match = requestData.url.match(/example.com\/searchmeI'maString/g)
if (match != null) {
hasFound[link] = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t + " " + t['id']);
//console.log(targetJSON);
//console.log("");
request.abort;
} else {
request.abort;
return;
}
};
page.onLoadFinished = function(status) {
if (!hasonLoadFinished[link]) {
hasonLoadFinished[link] = true;
//console.log(" " + status + " " + link);
//console.log("onLoadFinished()")
//setTimeout(function(){/* Look mah! No name! */},1000);
if (!hasFound[link]) {
console.log(i + " :NOT FOUND: " + link);
console.log("");
}
i++;
page.close();
nextPage();
}
}
};
function nextPage() {
var link = stream.readLine();
if (!link) {
end = Date.now();
console.log("");
console.log(((end - start) / 1000) + " Sekunden");
console.log("FILE ENDS HERE!!!");
phantom.exit(0);
}
hasFound[link] = false;
hasonLoadFinished[link] = false;
handle_page(link);
}
start = Date.now();
nextPage();
Related
I am currently trying to replace the name of a file in the Mid Server after a scheduled export.
The idea here is that the file goes with the name in the format "file_name_datetime" and the customer needs "datetime_file_name" for the file to be correctly read by another system.
My main idea was to rename the file after the export to the correct format, but if there is a way of changing the file name to the required one I could do that also.
I would love to hear from you guys as I have no idea how can I do this.
Thanks in advance.
If anyone is interested in the answer, see below:
Script include:
initialize: function() {
this.filePath = gs.getProperty('directory_path');
this.midServer = gs.getProperty('midserver');
this.authMidServerBase64 = gs.getProperty('authmidserver');
},
nameChange: function(exportSetName) {
var exportGr = new GlideRecord("sys_export_set_run");
exportGr.addEncodedQuery("set.nameSTARTSWITH" + exportSetName);
exportGr.orderByDesc("completed");
exportGr.query();
if (exportGr.next()) {
var attachSysID = exportGr.ecc_agent_attachment.sys_id;
}
var attachGr = new GlideRecord("sys_attachment");
attachGr.addEncodedQuery("table_sys_idSTARTSWITH" + attachSysID);
attachGr.query();
if (attachGr.next()) {
var attachName = attachGr.file_name;
var attachDate = attachName.match((/\d+/));
var newName = attachDate + '_' + exportSetName + '.csv';
}
var jspr = new JavascriptProbe(this.midServer);
jspr.setName('FileNameChange'); // This can be any name
jspr.setJavascript('var ddr = new MidServer_script_include(); res = ddr.execute();');
jspr.addParameter("verbose", "true");
jspr.addParameter("skip_sensor", "true"); // prevent Discovery sensors running for the ECC input
jspr.addParameter("filename", this.filePath + "\\" + attachName);
jspr.addParameter("filePath", this.filePath);
jspr.addParameter("newName", this.filePath + "\\" + newName);
jspr.addParameter("operation", "rename");
return jspr.create();
},
Mid Server Script include:
initialize: function() {
/**
*** Set up the Packages references
**/
this.File = Packages.java.io.File;
this.FileOutputStream = Packages.java.io.FileOutputStream;
this.FileInputStream = Packages.java.io.FileInputStream;
this.Path = Packages.java.nio.file.Path;
this.Paths = Packages.java.nio.file.Paths;
this.Files = Packages.java.nio.file.Files;
this.StandardCopyOption = Packages.java.nio.file.StandardCopyOption;
/**
/* Set up the parameters
**/
this.verbose = probe.getParameter("verbose");
this.filePath = probe.getParameter("filePath");
this.filename = probe.getParameter("filename");
this.operation = probe.getParameter("operation");
this.newName = probe.getParameter("newName");
result = "initialize complete";
},
execute: function() {
if (this.operation == 'rename') {
this.fileRename(this.filename, this.newName);
}
return result;
},
fileRename: function(fileName, newName) {
result+= "\r\n Renaming file.";
this._debug(result);
try {
var res = this._moveFile(fileName, newName);
} catch (e) {
result += "\r\n Erro no renomeamento do ficheiro: " + e;
this._debug(result);
}
},
_moveFile: function(initialPath, targetPath) {
try {
this._debug("Initiating file move function");
var inPath = this.Paths.get(initialPath);
var tgPath = this.Paths.get(targetPath);
var res = this.Files.move(inPath, tgPath, this.StandardCopyOption.REPLACE_EXISTING);
result += "File successfully moved from: " + initialPath + " to: " + targetPath + " \r\n Result: " + res;
this._debug(result);
} catch (e) {
this._debug('Error:' + e);
}
},
_debug: function(m) {
if (this.verbose == "true") {
ms.log("::: Mid Server script include logger ::: " + m);
}
},
https://community.servicenow.com/community?id=community_question&sys_id=a56b38a6db326490fa192183ca961987
In trying to get a hang of node.js asynchronous coding style, I decided to write a program that would read a text file containing a bunch of URLS to download and download each file. I started out writing a function to download just one file (which works fine), but having trouble extending the logic to download multiple files.
Here's the code:
var http = require("http"),
fs = require("fs"),
input = process.argv[2],
folder = "C:/Users/Wiz/Downloads/",
regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
urls = null,
url = "",
filename = "";
fs.readFile(input, "utf8", function(e, data) {
console.log("Reading file: " + input);
if (e) console.log("Got error:" + e.message);
urls = data.split("\n");
for (var i = urls.length; i--;) {
url = urls[i];
if (!url.match(regex)) continue;
filename = folder + url.substring(url.lastIndexOf('/') + 1);
downloadQueue.addItem(url, filename);
}
});
var downloadQueue = {
queue: [],
addItem: function(p_sSrc, p_sDest) {
this.queue.push({
src: p_sSrc,
dest: p_sDest
});
if (this.queue.length === 1) {
this.getNext();
}
},
getNext: function() {
var l_oItem = this.queue[0];
http.get(l_oItem.src, function(response) {
console.log("Downloading: " + l_oItem.dest);
var file = fs.createWriteStream(l_oItem.dest);
response.on("end", function() {
file.end();
console.log("Download complete.");
downloadQueue.removeItem();
}).on("error", function(error) {
console.log("Error: " + error.message);
fs.unlink(l_oItem.dest);
});
response.pipe(file);
});
},
removeItem: function() {
this.queue.splice(0, 1);
if (this.queue.length != 0) {
this.getNext();
} else {
console.log("All items downloaded");
}
}
};
How do I structure the code so that the completion of the first download can signal the initiation of the next one. Please note that this exercise is just for learning purposes, to understand how asynchronous coding works. In practice, I'm sure there are much better tools out there to download multiple files.
Try simple at first, it look like you copy paste codes and quite don't understand what they do.
Do a simple loop, that get the url, and print something.
var http = require('http');
URL = require('url').parse('http://www.timeapi.org/utc/now?format=%25F%20%25T%20-%20%25N')
URL['headers'] = {'User-Agent': 'Hello World'}
// launch 20 queries asynchronously
for(var i = 0; i < 20; i++) {
(function(i) {
console.log('Query ' + i + ' started');
var req = http.request(URL, function(res) {
console.log('Query ' + i + ' status: ' + res.statusCode + ' - ' + res.statusMessage);
res.on('data', function(content){
console.log('Query ' + i + ' ended - ' + content);
});
});
req.on('error', function(err) {
console.log('Query ' + i + ' return error: ' + err.message);
});
req.end();
})(i);
}
All the urls will be fetched asynchronously. You can observe that the response does not arrive in order, but are still processed correctly.
The difficulty with async is not to do the things is parallel, because you just write like a single task, and execute multiple time. It becomes complicated when you need for instance to wait for all tasks to finished before continuing. And for that, have a look at promises
Here is what I started out with. Figuring that each download was invoked asynchronously, they would all be independent of each other.
var http = require("http"),
fs = require("fs"),
input = process.argv[2],
folder = "C:/Users/Wiz/Downloads/",
regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
urls = null,
url = "",
filename = "";
fs.readFile(input, "utf8",
function(e, data) {
console.log("Reading file: " + input);
if (e) console.log("Got error:" + e.message);
urls = data.split("\n");
for (var i = urls.length; i--;) {
url = urls[i];
if (!url.match(regex)) continue;
filename = folder + url.substring(url.lastIndexOf('/') + 1);
http.get(url, function(response) {
var file = fs.createWriteStream(filename);
response.on("end", function() {
file.end();
});
response.pipe(file);
})
}
});
I've just started using CasperJS so I'm pretty stuck on an issue.
What I want to do is load a URL (Login protected), find a list of links (Which change the theme of the forum, specifically this part: ".styleChooser .overlayScroll li a"), click each of them and screenshot the result of the page after the click at two resolutions.
My code is currently just a grouping of other suggestions around the net trying to get this working, however I believe all the necessary code is basically there I just can't get it working. Any help would be very much appreciated!
var casper = require("casper").create();
var screenshotUrl = "URL Here";
var screenshotPaths = "rivals";
function getLinks() {
var links = document.querySelectorAll('.styleChooser .overlayScroll li a');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
screenshotNow = new Date(),
screenshotDateTime = screenshotNow.getFullYear() + pad(screenshotNow.getMonth() + 1) + pad(screenshotNow.getDate()),
viewports = [
{
'name': 'smartphone-portrait',
'viewport': {width: 320, height: 480}
},
{
'name': 'desktop-standard',
'viewport': {width: 1280, height: 1024}
}
];
i = -1;
casper.start();
casper.setHttpAuth('Username', 'Password');
casper.thenOpen(screenshotUrl, function(response) {
var linksArray = this.evaluate(getLinks);
this.eachThen(linksArray, function(response) {
var url = response.data;
this.each(viewports, function(casper, viewport) {
this.then(function() {
this.viewport(viewport.viewport.width, viewport.viewport.height);
});
this.thenOpen(url, function() {
this.wait(5000);
});
casper.then(function(){
casper.echo('Screenshot for '+ screenshotPaths + '/' + "homepage " + viewport.name + ' (' + viewport.viewport.width + 'x' + viewport.viewport.height + ')', 'info');
casper.capture('screenshots/' + screenshotPaths + '/' + screenshotDateTime + '/' + "homepage" + '/' + viewport.name + '-' + viewport.viewport.width + 'x' + viewport.viewport.height + '.png', {
top: 0,
left: 0,
width: viewport.viewport.width,
height: viewport.viewport.height
});
});
});
});
++i;
}); // error is here
casper.run();
function pad(number) {
var r = String(number);
if ( r.length === 1 ) {
r = '0' + r;
}
return r;
}
And this is the error:
C:\xampp\htdocs\caspertest>casperjs newestcasper.js
CasperError: You can only define a step as a function
C:/casperjs/modules/casper.js:1755 in then
C:/xampp/htdocs/caspertest/newestcasper.js:52
Unsafe JavaScript attempt to access frame with URL about:blank from frame with U
RL file:///C:/casperjs/bin/bootstrap.js. Domains, protocols and ports must match
.
If you want to open the page that you want to take a screenshot of, you should use thenOpen. Right now, you use then which is only a step function without opening anything.
casper.thenOpen(screenshotUrl, function(response) {
I'm trying to scrape a number of pages that have a standard format. I've been able to use Phantomjs to successfully scrape a single page, but when I try to iterate over multiple ones, the asynchronous processing makes things hang up. What's the proper way to tell Casper/Phantom to wait?
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output file
f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();
// this is the unique identifier for the locations. For now, I just have three datapoints
var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"];
/// this code will be used to loop through the different locations. For now, set to look at only one.
for (q= 0; q < 1; q++) {
var processing = false;
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ EPAID: "+ $value[0] + ", " +
"Name: "+ $value[1] + ", " +
"City: "+ $value[4] + ", " +
"State: "+ $value[6] + ", " +
"ZipCode: "+ $value[8] + ", " +
"Latitude: "+ $value[14] + ", " +
"Longitude: "+ $value[16] + " }" ;
return $string;
});
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
processing = true;
console.log("writing to file");
phantom.exit();
});
}
// right here it should delay until the previous page is completed
// while (!processing) {
// setTimeout(function(){ console.log("waiting....");},1000);
// }
};
}
console.log("finished all pages");
If you switched to using casperJS, it is as simple as changing your page.open() into page.thenOpen(). (This CasperJS - How to open up all links in an array of links question looks very similar to yours?)
If you wanted to stick with PhantomJS you need to start the next page load in the onSuccess callback of the previous load. This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)
An alternative approach is to create the page object inside the loop. However that is not quite answering your question, as then they will run in parallel. But you could use setTimeout to stagger each once to avoid a burst of activity if you have hundreds of URLs!
Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).
With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output f
// this is the unique identifier for the locations.
var EPAID = ["NED981713837",... , "FLD049985302", "NJD986643153"];
f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();
var count = 0;
var target = 1400;
var written = [];
function yourFunction(){
if (count < target) {
process(count);
count++;
setTimeout(yourFunction, 5000);
} else {
console.log("exiting");
phantom.exit();
return;
}
}
function process(counter){
var processing = false;
console.log("Beginning record #" + counter);
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ \"EPAID\": \""+ $value[0] + "\", " +
"\"Name\": \""+ $value[1] + "\", " +
"\"City\": \""+ $value[4] + "\", " +
"\"State\": \""+ $value[6] + "\", " +
"\"ZipCode\": \""+ $value[8] + "\", " +
"\"Latitude\": "+ $value[14] + ", " +
"\"Longitude\": "+ $value[16] + " }," ;
return $string;
});
if (written[counter] === undefined) {
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
written[counter] = true;
console.log("Writing to file #"+ counter);
}
});
}
};
}
console.log("Start...");
yourFunction();
screenshot.js
var page = require("webpage").create();
var homePage = "http://www.google.com/";
page.open(homePage);
page.onLoadFinished = function(status) {
var url = page.url;
console.log("Status: " + status);
console.log("Loaded: " + url);
page.render("google.png");
phantom.exit();
};
Terminal:
bin/phantomjs screenshot.js
Question:
Is there any way that I can send phantomjs the URL (value of var homePage above) somehow outside of screenshot.js so that its not hard coded inside the script?
Add the url to the command line
bin/phantomjs screenshot.js http://www.google.com/
Here you have an example from the docs :https://github.com/ariya/phantomjs/blob/master/examples/arguments.js
var system = require('system');
if (system.args.length === 1) {
console.log('Try to pass some args when invoking this script!');
} else {
system.args.forEach(function (arg, i) {
console.log(i + ': ' + arg);
});
}
phantom.exit();