I'm trying to scrape a number of pages that have a standard format. I've been able to use Phantomjs to successfully scrape a single page, but when I try to iterate over multiple ones, the asynchronous processing makes things hang up. What's the proper way to tell Casper/Phantom to wait?
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output file
f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();
// this is the unique identifier for the locations. For now, I just have three datapoints
var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"];
/// this code will be used to loop through the different locations. For now, set to look at only one.
for (q= 0; q < 1; q++) {
var processing = false;
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ EPAID: "+ $value[0] + ", " +
"Name: "+ $value[1] + ", " +
"City: "+ $value[4] + ", " +
"State: "+ $value[6] + ", " +
"ZipCode: "+ $value[8] + ", " +
"Latitude: "+ $value[14] + ", " +
"Longitude: "+ $value[16] + " }" ;
return $string;
});
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
processing = true;
console.log("writing to file");
phantom.exit();
});
}
// right here it should delay until the previous page is completed
// while (!processing) {
// setTimeout(function(){ console.log("waiting....");},1000);
// }
};
}
console.log("finished all pages");
If you switched to using casperJS, it is as simple as changing your page.open() into page.thenOpen(). (This CasperJS - How to open up all links in an array of links question looks very similar to yours?)
If you wanted to stick with PhantomJS you need to start the next page load in the onSuccess callback of the previous load. This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)
An alternative approach is to create the page object inside the loop. However that is not quite answering your question, as then they will run in parallel. But you could use setTimeout to stagger each once to avoid a burst of activity if you have hundreds of URLs!
Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).
With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output f
// this is the unique identifier for the locations.
var EPAID = ["NED981713837",... , "FLD049985302", "NJD986643153"];
f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();
var count = 0;
var target = 1400;
var written = [];
function yourFunction(){
if (count < target) {
process(count);
count++;
setTimeout(yourFunction, 5000);
} else {
console.log("exiting");
phantom.exit();
return;
}
}
function process(counter){
var processing = false;
console.log("Beginning record #" + counter);
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ \"EPAID\": \""+ $value[0] + "\", " +
"\"Name\": \""+ $value[1] + "\", " +
"\"City\": \""+ $value[4] + "\", " +
"\"State\": \""+ $value[6] + "\", " +
"\"ZipCode\": \""+ $value[8] + "\", " +
"\"Latitude\": "+ $value[14] + ", " +
"\"Longitude\": "+ $value[16] + " }," ;
return $string;
});
if (written[counter] === undefined) {
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
written[counter] = true;
console.log("Writing to file #"+ counter);
}
});
}
};
}
console.log("Start...");
yourFunction();
Related
I am currently trying to replace the name of a file in the Mid Server after a scheduled export.
The idea here is that the file goes with the name in the format "file_name_datetime" and the customer needs "datetime_file_name" for the file to be correctly read by another system.
My main idea was to rename the file after the export to the correct format, but if there is a way of changing the file name to the required one I could do that also.
I would love to hear from you guys as I have no idea how can I do this.
Thanks in advance.
If anyone is interested in the answer, see below:
Script include:
initialize: function() {
this.filePath = gs.getProperty('directory_path');
this.midServer = gs.getProperty('midserver');
this.authMidServerBase64 = gs.getProperty('authmidserver');
},
nameChange: function(exportSetName) {
var exportGr = new GlideRecord("sys_export_set_run");
exportGr.addEncodedQuery("set.nameSTARTSWITH" + exportSetName);
exportGr.orderByDesc("completed");
exportGr.query();
if (exportGr.next()) {
var attachSysID = exportGr.ecc_agent_attachment.sys_id;
}
var attachGr = new GlideRecord("sys_attachment");
attachGr.addEncodedQuery("table_sys_idSTARTSWITH" + attachSysID);
attachGr.query();
if (attachGr.next()) {
var attachName = attachGr.file_name;
var attachDate = attachName.match((/\d+/));
var newName = attachDate + '_' + exportSetName + '.csv';
}
var jspr = new JavascriptProbe(this.midServer);
jspr.setName('FileNameChange'); // This can be any name
jspr.setJavascript('var ddr = new MidServer_script_include(); res = ddr.execute();');
jspr.addParameter("verbose", "true");
jspr.addParameter("skip_sensor", "true"); // prevent Discovery sensors running for the ECC input
jspr.addParameter("filename", this.filePath + "\\" + attachName);
jspr.addParameter("filePath", this.filePath);
jspr.addParameter("newName", this.filePath + "\\" + newName);
jspr.addParameter("operation", "rename");
return jspr.create();
},
Mid Server Script include:
initialize: function() {
/**
*** Set up the Packages references
**/
this.File = Packages.java.io.File;
this.FileOutputStream = Packages.java.io.FileOutputStream;
this.FileInputStream = Packages.java.io.FileInputStream;
this.Path = Packages.java.nio.file.Path;
this.Paths = Packages.java.nio.file.Paths;
this.Files = Packages.java.nio.file.Files;
this.StandardCopyOption = Packages.java.nio.file.StandardCopyOption;
/**
/* Set up the parameters
**/
this.verbose = probe.getParameter("verbose");
this.filePath = probe.getParameter("filePath");
this.filename = probe.getParameter("filename");
this.operation = probe.getParameter("operation");
this.newName = probe.getParameter("newName");
result = "initialize complete";
},
execute: function() {
if (this.operation == 'rename') {
this.fileRename(this.filename, this.newName);
}
return result;
},
fileRename: function(fileName, newName) {
result+= "\r\n Renaming file.";
this._debug(result);
try {
var res = this._moveFile(fileName, newName);
} catch (e) {
result += "\r\n Erro no renomeamento do ficheiro: " + e;
this._debug(result);
}
},
_moveFile: function(initialPath, targetPath) {
try {
this._debug("Initiating file move function");
var inPath = this.Paths.get(initialPath);
var tgPath = this.Paths.get(targetPath);
var res = this.Files.move(inPath, tgPath, this.StandardCopyOption.REPLACE_EXISTING);
result += "File successfully moved from: " + initialPath + " to: " + targetPath + " \r\n Result: " + res;
this._debug(result);
} catch (e) {
this._debug('Error:' + e);
}
},
_debug: function(m) {
if (this.verbose == "true") {
ms.log("::: Mid Server script include logger ::: " + m);
}
},
https://community.servicenow.com/community?id=community_question&sys_id=a56b38a6db326490fa192183ca961987
I am new to the world of * AppScript * I am currently designing a ** WepApp ** which is made up of html lists that connect to Mysql, when I individually test my lists paint correctly and their icons modify and update the data, without However ** the problem is ** when I join all my lists and call them through their corresponding url only the last one paints me the others are blank. For example of 10 lists I call # 2 the log tells me to call 10; I call 5 the same thing happens and if I call 10 it paints the data and they are allowed to be modified.
Within what I have searched I find that my problem lies in the way I render my pages but I cannot find the right path, so I ask for your support.
function doGet(e) {
var template ;
var view = e.parameters.v;
if(view == null){
template = HtmlService.createTemplateFromFile("Index");
}if(view == "Index"){
template = HtmlService.createTemplateFromFile("Index");
}if(view != null && view != "Index"){
template = HtmlService.createTemplateFromFile(view);
}
return template.evaluate()
.setTitle('Documental')
.setSandboxMode(HtmlService.SandboxMode.IFRAME)
.setXFrameOptionsMode(HtmlService.XFrameOptionsMode.ALLOWALL);
}
function getTemplate(view){
return HtmlService.createTemplateFromFile(view);
}
and with this JavaScript method I connect my appscript code to pass it to my html
window.onload = function () {
google.script.run
.withSuccessHandler(run_This_On_Success)
.withFailureHandler(onFailure)
.readAreaPRE();
};
function onFailure(error) {
var div = document.getElementById("output");
div.innerHTML = "ERROR: " + error.message;
}
function run_This_On_Success (readAreaPRE) {
let table = $("#selectTable");
table.find("tbody tr").remove();
table.append("<tr><td>" + "</td><td>" + "</td></tr>");
readAreaPRE.forEach(function (e1, readAreaPRE) {
table.append(
"<tr><td>" +
e1[0] +
"</td><td>" +
e1[1] +
"</td><td>" +
"<p><a class='modal-trigger' id=" + e1[0] + " href='#modal1' onclick='capturaid("+e1[0]+",'"+ e1[1]+"')'><i class='material-icons'>edit</i></a>" +
"<a class='modal-trigger' href='#modal3' onclick='capturaidsup("+e1[0]+")'><i class='material-icons'>delete</i></a></p>" +
"</td></tr>"
);
});
};
function capturaidsup(dato1){
$("#delAreaPRE").val(dato1)
}
function capturaid(item1,item2) {
$("#uptAreaPRE1").val(item1);
$("#uptAreaPRE2").val(item2);
}
here is my function: readArePRE
function readAreaPRE() {
var conn = Jdbc.getCloudSqlConnection(url, user, contra);
var stmt = conn.createStatement();
stmt.setMaxRows(1000);
var results = stmt.executeQuery(
"CALL `BD_CENDOC_COL`.`sp_lee_tb_ref_AreasPRE`()"
);
var numCols = results.getMetaData().getColumnCount();
var rowString = new Array(results.length);
var i = 0;
while (results.next()) {
var id_areaPRE = results.getInt("id_areaPRE");
var AreaPRE = results.getString("AreaPRE");
rowString[i] = new Array(numCols);
rowString[i][0] = id_areaPRE;
rowString[i][1] = AreaPRE;
i++;
}
return rowString;
conn.close();
results.close();
}
Thank you in advance, any correction to ask my question will be welcome.
I'm looking for a way to send many requests to an api using a different api url each time.
An example url for my project is:
http://api.bandsintown.com/artists/Hippo%20Campus/events.json?lapi_version=2.0&app_id=music_matcher
I'm using an HTTP request to pull the JSON info into my script and works perfectly...the first time. However, I want to be able to call it 50-100 ish times (max) in a loop with different artist names in the url (I'm using the BandsInTown API). For some reason, when I try to use a loop to call the http request multiple times, only one output appears and it is unpredictable which element in the order it will be (it's usually the output associated with the first or second element in the array). This is what my code looks like:
// HTTP GET call to BandsInTown API
function httpGetAsync(theUrl, callback) { //theURL or a path to file
var httpRequest = new XMLHttpRequest();
httpRequest.onreadystatechange = function() {
if (httpRequest.readyState == 4 && httpRequest.status == 200) {
var data = JSON.parse(httpRequest.responseText);
if (callback) {
callback(data);
}
}
else {
alert("error loading JSON doc");
}
};
httpRequest.open('GET', theUrl, true);
httpRequest.send(null);
}
//extracts data from api for each artist
function parseEvent(artist) {
var url = "http://api.bandsintown.com/artists/" + artist + "/events.json?lapi_version=2.0&app_id=music_matcher";
httpGetAsync(url, function(data) {
var numEvents = Object.keys(data).length;
//var events = [];
for (var j = 0; j < numEvents; j++) {
document.write(data[j].venue.name + "-> ");
document.write("LAT:" + data[j].venue.latitude + " " + "LNG:" + data[j].venue.longitude);
document.write("ARTIST: " + data[j].artists[0].name);
document.write("DATE: " + data[j].datetime);
document.write(" " + j + " ");
}
});
}
var artists = ["Drake", "Mac Demarco", "Hippo Campus", "STRFKR"];
for (var i = 0; i < artists.length; i++) {
parseEvent(artists[i]);
document.write(" ---NEXT ARTIST--- ");
}
So I can't tell exactly what's going on but things are acting weird with my current code. I don't have a whole lot of javascript and web development experience yet so any help is appreciated! I was preferably looking for a way to implement this with pure javascript. I have had trouble figureing out how to handle Node.js and/or JQuery in Eclipse Neon (the IDE I am using)
You have implemented closure pretty well so clearly this isn't a problem of success callback of one function overwriting response of all others.But now when you look at document.write() it all gets clear, this function first wipes your whole content clean then it writes whatever you told it to .That's why you hardly see anyone use it
`document.write('a');`
`document.write('b');`
`document.write('c');` // a and b are gone you would see only 'c'
So after loop gets over you would only see the output of the last call.Though it's mostly random as to which call would finish last it mostly biased towards some particular value due to the the way servers are tuned.
So better approach is to use some <div> or something and pour your results into it like this one
<div id="op"></div>
and
function parseEvent(artist) {
var url = "http://api.bandsintown.com/artists/" + artist + "/events.json?lapi_version=2.0&app_id=music_matcher";
httpGetAsync(url, function(data) {
var numEvents = Object.keys(data).length;
var op = document.getElementById('op');
op.innerHTML = op.innerHTML + " <br><br> <h2>---NEXT ARTIST---<h2> <br>";
//var events = [];
for (var j = 0; j < numEvents; j++) {
op.innerHTML = op.innerHTML + "<br>" + data[j].venue.name + "-> ";
op.innerHTML = op.innerHTML + "<br>" + "LAT:" + data[j].venue.latitude + " " + "LNG:" + data[j].venue.longitude ;
op.innerHTML = op.innerHTML + "<br>" +"ARTIST: " + data[j].artists[0].name;
op.innerHTML = op.innerHTML + "<br>" +"DATE: " + data[j].datetime;
op.innerHTML = op.innerHTML + "<br>" + " " + j + " <br>";
}
});
}
var artists = ["Drake", "Hippo Campus", "STRFKR","Mac Demarco"];
for (var i = 0; i < artists.length; i++) {
parseEvent(artists[i]);
}
I'm new to HTML/javascript and I want to make something that displays Last.FM current playing songs, into a div on html, which displays it in text, I have a code that sends the current song through a chat on www.irccloud.com, and I was wondering If you could change it so that It could get received and put into a DIV on a page, the code is below:
and var r is the completed code, so how would I do something in the div that picks up the source as the link above and then grabs var r from it? If so, how would I do it??
I have tried the following code here
Sorry if I do not make sense.
(function () {
var e = "DeviousRunner";
window.lfmRecentTrack = function (t) {
var n = (new Array).concat(t.recenttracks.track)[0];
var album, spurl;
if (n.album["#text"]) {
album = " (from " + n.album["#text"] + ")";
} else {
album = "";
}
try {
var spotify = new XMLHttpRequest();
spotify.open("GET", "https://ws.spotify.com/search/1/track.json?q=" + encodeURIComponent(n.artist["#text"] + " - " + n.name), false);
spotify.send();
var spotresp = JSON.parse(spotify.responseText);
if (spotresp["tracks"].length > 0) {
//var urisplit = spotresp["tracks"][0]["href"].split(":");
//spurl = " https://open.spotify.com/" + urisplit[1] + "/" + urisplit[2];
spurl = spotresp["tracks"][0]["href"];
} else {
console.log("spotify: couldn't get url");
spurl = "";
}
} catch(e) {
console.log("spotify: " + e.message);
spurl = "";
}
var r = "is listening to " + n.name + " by " + n.artist["#text"] + " " + album + " (" + spurl + ")";
}
var n = document.createElement("script");
n.setAttribute("type", "text/javascript");
n.setAttribute("src", "https://ws.audioscrobbler.com/2.0/?method=user.getrecenttracks&user=" + e + "&api_key=dd5fb083b94a7196cf696b9d7d11bc63&limit=1&format=json&callback=window.lfmRecentTrack");
document.body.appendChild(n)
})();
I updated your FIDDLE,
by moving this:
var element = document.getElementById("rss");
element.innerHTML = r;
inside the function...
hope this is useful for you
I've been working on a script which collates the scores for a list of user from a website. One problem is though, I'm trying to load the next page in the while loop, but the function is not being loaded...
casper.then(function () {
var fs = require('fs');
json = require('usernames.json');
var length = json.username.length;
leaderboard = {};
for (var ii = 0; ii < length; ii++) {
var currentName = json.username[ii];
this.thenOpen("http://www.url.com?ul=" + currentName + "&sortdir=desc&sort=lastfound", function (id) {
return function () {
this.capture("Screenshots/" + json.username[id] + ".png");
if (!casper.exists(x("//*[contains(text(), 'That username does not exist in the system')]"))) {
if (casper.exists(x('//*[#id="ctl00_ContentBody_ResultsPanel"]/table[2]'))) {
this.thenEvaluate(tgsagc.tagNextLink);
tgsagc.cacheCount = 0;
tgsagc.
continue = true;
this.echo("------------ " + json.username[id] + " ------------");
while (tgsagc.
continue) {
this.then(function () {
this.evaluate(tgsagc.tagNextLink);
var findDates, pageNumber;
pageNumber = this.evaluate(tgsagc.pageNumber);
findDates = this.evaluate(tgsagc.getFindDates);
this.echo("Found " + findDates.length + " on page " + pageNumber);
tgsagc.checkFinds(findDates);
this.echo(tgsagc.cacheCount + " Caches for " + json.username[id]);
this.echo("Continue? " + tgsagc["continue"]);
this.click("#tgsagc-link-next");
});
}
leaderboard[json.username[id]] = tgsagc.cacheCount;
console.log("Final Count: " + leaderboard[json.username[id]]);
console.log(JSON.stringify(leaderboard));
} else {
this.echo("------------ " + json.username[id] + " ------------");
this.echo("0 Caches Found");
leaderboard[json.username[id]] = 0;
console.log(JSON.stringify(leaderboard));
}
} else {
this.echo("------------ " + json.username[id] + " ------------");
this.echo("No User found with that Username");
leaderboard[json.username[id]] = null;
console.log(JSON.stringify(leaderboard));
}
});
while (tgsagc.continue) {
this.then(function(){
this.evaluate(tgsagc.tagNextLink);
var findDates, pageNumber;
pageNumber = this.evaluate(tgsagc.pageNumber);
findDates = this.evaluate(tgsagc.getFindDates);
this.echo("Found " + findDates.length + " on page " + pageNumber);
tgsagc.checkFinds(findDates);
this.echo(tgsagc.cacheCount + " Caches for " + json.username[id]);
this.echo("Continue? " + tgsagc["continue"]);
return this.click("#tgsagc-link-next");
});
}
Ok, looking at this code I can suggest a couple of changes you should make:
I don't think you should be calling return from within your function within then(). This maybe terminating the function prematurely. Looking at the casperjs documentation, the examples don't return anything either.
Within your while loop, what sets "tgsagc.continue" to false?
Don't use "continue" as a variable name. It is a reserved word in Javascript used for terminating an iteration of a loop. In your case this shouldn't be a problem, but its bad practice anyhow.
Don't continually re-define the method within your call to the then() function. Refactor your code so that it is defined once elsewhere.
We ended up having to scope the function, so it loads the next page in the loop.
This is mainly because CasperJS is not designed to calculate scores, and it tries to asynchronously do the calculation, missing the required functions