Scraping table from website, with javascript:subOpen href link - javascript

I would like to scrape for each link on this page the page details page behind.
I can get all informations on this page: PAGE
However, I would like to get all info's on the details page, but the href link looks like that, for example:
href="javascript:subOpen('9ca8ed0fae15d43dc1257e7300345b99')"
Here is my sample spreadsheet using the ImportHTML function to get the general overview.
Google Spreadsheet
Any suggestions how to get the details pages?
UPDATE
I implemented the method the following:
function doGet(e){
var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
var d = document.createElement('div'); //assuming you can do this
d.innerHTML = feed;//make the text a dom structure
var arr = d.getElementsByTagName('a') //iterate over the page links
var response = "";
for(var i = 0;i<arr.length;i++){
var atr = arr[i].getAttribute('onclick');
if(atr) atr = atr.match(/subOpen\((.*?)\)/) //if onclick calls subOpen
if(atr && atr.length > 1){ //get the id
var detail = UrlFetchApp.fetch(base + '0/'+atr[1]).getContentText();
response += detail//process the relevant part of the content and append to the reposnse text
}
}
return ContentService.createTextOutput(response);
}
However, I get an error when running the method:
ReferenceError: "document" is not defined. (line 6, file "")
What is the document an object of?
I have update the Google Spreadsheet with a webapp.

You can use Firebug in order to inspect the page contents and javascript. For instance you can find that subOpen is actually an alias to subOpenXML declared in xmlhttp01.js.
function subOpenXML(unid) {/*open found doc from search view*/
if (waiting) return alert(bittewar);
var wState = dynDoc.getElementById('windowState');
wState.value = 'H';/*httpreq pending*/
var last = '';
if (unid==docLinks[0]) {last += '&f=1'; thisdocnum = 1;}
if (unid==docLinks[docLinks.length-1]) {
last += '&l=1';
thisdocnum = docLinks.length;
} else {
for (var i=1;i<docLinks.length-1;i++)
if (unid==docLinks[i]) {thisdocnum = i+1; break;}
}
var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
httpreq.open('GET', // &rand=' + Math.random();
/*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
httpreq.onreadystatechange=onreadystatechange;
// httpreq.setRequestHeader('Accept','text/xml');
httpreq.send(null);
waiting = true;
title2src = firstTextChild(dynDoc.getElementById('title2')).nodeValue;
}
So, after copying the function source and modifying it in firebug's Console tab to add a console.log(url) before the http call, like this:
var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
console.log(url)
httpreq.open('GET', // &rand=' + Math.random();
/*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
You can execute the function declaration in firebug's Console tab and overwrite subOpen with the modified source.
Clickin in the link then will show that the invoked url is composed of the id passed as parameter to subOpen prefixed by '0/', so in the example you posted it would be a GET to:
http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/0/1fd2313c2e0095bfc1257e49004170ca?OpenDocument&f=1&bm=2
You could also verify this by opening the Network tab in firebug and clicking the link.
Therefore, in order to scrape the details page you'd need to
Parse the id passed to subOpen
Make a GET call to '0/'
Parse the request response
Looking the request response in firebug's Network Tab shows that probably you'll need to do similar parsing to actually get the showed contents, but I haven't looked deep into it.
UPDATE
The importHTML function is not suitable for the kind of scraping you want. Google's HTML or Content Services are better suited for this. You'll need to create a web app and implement the doGet function:
function doGet(e){
var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
var response = "";
var match = feed.match(/subOpen\('.*?'\)/g)
if(match){
for(var i = 0; i < match.length;i++){
var m = match[i].match(/\('(.*)'\)/);
if(m && m.length > 1){
var detailText = UrlFetchApp.fetch(base + '0/'+m[1]);
response += //dosomething with detail text
//and concatenate in the response
}
}
}
return ContentService.createTextOutput(response);
}

Related

Use GAS to Fetch URL, Then Take Screenshot (or Convert HTML to PDF/JPG)

My Goal
I am, in short, attempting to create a script that visits a list of URLs and takes a screenshot of what is on the URL. For context, my goal is to save a snapshot that shows an image is listed on free stock photo sites (like pexels.com and unsplash.com).
My Code So Far
Here is what I have so far:
function stockDatabasePDF(){
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getSheetByName('stock db');
//make the pdf from the sheet
var data = sheet.getRange('A2:A').getValues(); //these are your URLs to check
for ( var i = 0; i < data.length; i++ ) {
if ( data[i][0] !== "" ) { //this means if your data (urls) are NOT blank
var theurl = data[i][0];
var token = ScriptApp.getOAuthToken();
var docurl = UrlFetchApp.fetch(theurl, { headers: { 'Authorization': 'Bearer ' + token } });
var pdf = docurl.getBlob().setName('name').getAs('application/pdf');
}
//save the file to folder on Drive
var fid = '1eHvWjIYyOeB9MQDwovzyXxx8CEIK5aOt';
var folder = DriveApp.getFolderById(fid);
var pdfs = folder.createFile(pdf).getUrl(); //both creates the file and copies the url
}
}
Just FYI... What is in range A2:A is the URLs to the stock photo sites, for example:
https://www.pexels.com/photo/person-riding-brown-horse-3490257/
https://unsplash.com/photos/Cz_SNZZyHgI
The issue
This script seems to ALMOST work. But it runs into one of two problems:
I get a 403 response code (forbidden request)
The screenshot takes, but no images/css/etc. is included.
Here is an example of that
To Close
Any help here would be greatly appreciated. Please let me know if I left anything out or if anyone has any questions. Thank you all!

How to update URL address using ASP when based on JSON object

I'm making an application of a list of races in a table, this information is gotten from a JSON api. Upon selecting a race in a table I need a window to pop up with the name of the race in the url address bar, so something like asp?=race_name at the end.
How would I go about completing this?
I've looked this up but can't seem to find an answer, my guess is I'm googling the wrong thing as I'm not good at explaining
for (let i = 0; i < jsonObj.length; i++) {
document.getElementById(jsonObj[i].id).onclick = function () {
var newWin = window.open("","","width=700,height=700");
newWin.onload = function () {
var doc = newWin.document; // Create new document in window
var raceName = document.createElement('h3');
var distance = document.createElement('p');
var location = document.createElement('p');
var curPos = document.createElement('p');
var difference = document.createElement('p');
raceName.textContent = jsonObj[i].race_name;
distance.textContent = jsonObj[i].distance;
location.textContent = jsonObj[i].venue_name;
doc.body.appendChild(raceName);
doc.body.appendChild(distance);
doc.body.appendChild(location);
}
}
Here the window pops up and includes the correct information, I now just need to include code that would update the url address to include race name.

How do I get $.getJSON to work on JSONP

I know it looks like a lot but my question pertains to a single line of (bolded) code. I know my event handler is set up correctly. I know my url is what it should be by this point, except for the ?callback=? part (I read in another post that by putting this at the end of the url passed to $.getJSON, the getJSON becomes capable of working with JSONP, and according to their API page wiki uses JSONP). I also know for certain that the domMod function NEVER RUNS, NOT EVEN THE FIRST LINE OF IT. So don't worry about the other parts of my code, just please if you can tell me why my $.getJSON is not calling the function, I am really new to this stuff. The error message I get back is
wikiViewer.html:1 Refused to execute script from
'https://en.wikipedia.org/w/api.php?format=json&action=query&generator=searc…=jordan?callback=jQuery111107644474213011563_1454965359373&_=1454965359374'
because its MIME type ('application/json') is not executable, and
strict MIME type checking is enabled.
(function(){
var searchBtn = document.getElementById('search');
//var body = document.getElementsByTagName('body')[0];
var input = document.getElementById("input");
var bodyDiv = document.getElementById('bodyDiv')
$(document).ready(function(){
searchBtn.addEventListener('click', searchWiki);
function searchWiki(){
bodyDiv.innerHTML = "";
var url = 'https:\/\/en.wikipedia.org/w/api.php?format=json&action=query&generator=search&gsrnamespace=0&gsrlimit=10&prop=pageimages|extracts&pilimit=max&exintro&explaintext&exsentences=1&exlimit=max&gsrsearch='
if (input.value === ""){
return;
}
var searchTerm = input.value.replace(/\s/g, '%20');
url = url + searchTerm + "?callback=?";
**$.getJSON(url, domMod)** //change fileName to be whatever we wish to search
function domMod(json){ //what to do with dom based on json file NOTE WE NEED TO FIRST CHECK ANDREMOVE PREVIOUS SEARCH CONTENT
var entry;
if (!json.hasOwnProperty(query)){
return;
}
if (!json.query.hasOwnProperty(pages)){
return;
}
json = json.query.pages;
var keys = Object.keys(json);
var keysLength = keys.length;
for (var i = 0; i < keysLength; i++){
entry = json[keys[i]];
var outterDiv = document.createElement('div');
outterDiv.className = "entry";
var imageDiv = document.createElement('div');
imageDiv.className = "entryImg";
var entryDiv = document.createElement('div');
entryDiv.className = "entryTxt";
outterDiv.appendChild(imageDiv);
outterDiv.appendChild(entryDiv);
entryDiv.innerHTML = '<h2>' + entry.title + '</h2>' + '<p>' + entry.extract + '</p>'
if (entry.hasOwnProperty('thumbnail')){ //add image to our imageDiv child of entryDiv
imageDiv.style.backgroundImage = "url('" + entry.thumbnail.source + "')"
}
bodyDiv.appendChild(outterDiv); //appendChild to the Body
}
}
}
});
}())
You already have a query string started in url using ? but are adding another ? when you do:
url = url + searchTerm + "?callback=?";
Change to
url = url + searchTerm + "&callback=?";
Works fine when I sent term "food"
DEMO

createFile() in google Apps Script is not functioning properly

I am trying to create a file. It works fine when I run the following code segment from the debugger in apps script. However, when I run it real time from the spreadsheet, it says I do not have permission to call createfile. Everything that is logged is identical. The issue is not I do not have authority as I am the only one in the spreadsheet and am the owner. The purpose of the CSV is to move it from my google drive into data for BigQuery
function saveAsCSV(row) { //Doc to Csv
//row = 3; //when this is uncommented and ran from the debugger, it works.
try{
var fileName= Date.now()
fileName = fileName + ".csv";
var csvFile = convertRangeToCsvFile_(fileName,row);
Logger.log(csvFile); //Both times ran on the spreadsheet and from debug equals the same.
DriveApp.createFile(fileName, csvFile);
SpreadsheetApp.getActiveSpreadsheet().getSheetByName("New and Open").getRange("J" + row.toString()).setValue("");
loadCsv(fileName);
}
catch(e){Logger.log("B" + e.message);} //No permission to create file
}
function convertRangeToCsvFile_(csvFileName, r) {
var ws = SpreadsheetApp.getActiveSpreadsheet();
try {
//var data = ws.getValues();
var csvFile = undefined;
var csv = "";
var row = r;
var datArray = Create2DArray(1,19);
datArray[0][0] = ws.getRange("A" + row.toString()).getValue().toString().toUpperCase();
datArray[0][1] = ws.getRange("B"+row.toString()).getValue().toString().toUpperCase();
datArray[0][2] = ws.getRange("C"+row.toString()).getValue().toString().toUpperCase();
datArray[0][3] = ws.getRange("D"+row.toString()).getValue().toString().toUpperCase();
datArray[0][4] = ws.getRange("E"+row.toString()).getValue().toString().toUpperCase();
datArray[0][5] = ws.getRange("F"+row.toString()).getValue().toString().toUpperCase();
datArray[0][6] = ws.getRange("G"+row.toString()).getValue().toString().toUpperCase();
datArray[0][7] = ws.getRange("H"+row.toString()).getValue().toString().toUpperCase();
datArray[0][8] = ws.getRange("I"+row.toString()).getValue().toString().toUpperCase();
datArray[0][9] = new Date(ws.getRange("K"+row.toString()).getValue().toString()).getHours();
datArray[0][10] = new Date(ws.getRange("K"+row.toString()).getValue().toString()).getMinutes();
datArray[0][11] = new Date(ws.getRange("L"+row.toString()).getValue().toString()).getHours();
datArray[0][12] = new Date(ws.getRange("L"+row.toString()).getValue().toString()).getMinutes();
datArray[0][13] = new Date(ws.getRange("M"+row.toString()).getValue().toString()).getHours();
datArray[0][14] = new Date(ws.getRange("M"+row.toString()).getValue().toString()).getMinutes();
datArray[0][15] = new Date(ws.getRange("N"+row.toString()).getValue().toString()).getTime();
datArray[0][16] = new Date(ws.getRange("N"+row.toString()).getValue().toString()).getFullYear();
datArray[0][17] = new Date(ws.getRange("N"+row.toString()).getValue().toString()).getMonth();
datArray[0][18] = new Date(ws.getRange("N"+row.toString()).getValue().toString()).getDate();
for(var i = 0; i < 19; i++){
if(datArray[0][i] == ""){if(i > 9){datArray[0][i] = 0;} else{datArray[0][i] = "nil";} }
if(i < 18){csv += '"' + datArray[0][i] + '"' + ",";}
else{ csv += '"' + datArray[0][i] + '"'; }
}
Logger.log("A " + csv);
Logger.log(csv + "\n" + datArray[0].join(","));
csvFile = csv;
return csvFile;
}
catch(err) {
Logger.log("C" + err);
Browser.msgBox(err);
}
}
You mention in your comment on my answer that you are using onEdit to trigger the script. Since this is a Simple Trigger, your current approach will not work. When you use simple triggers to run an Apps Script, it runs in a sandbox with reduced permissions.
See: https://developers.google.com/apps-script/guides/triggers/#restrictions
The best I can recommend is create a custom menu option with a UI popup asking for the row number to export. If the code is triggered from a menu by the user, it runs with full permission to access that users account.
Depending on your use-case, a scheduled trigger might work too. It could run every 10 minutes or every Hour and export any changes to the spreadsheet. In this case the Apps Script runs as you, with permission to access your account, and the resulting CSV would be created on your drive.
Details on how to create a custom menu: https://developers.google.com/apps-script/guides/triggers/#onopen
Details on how to create a form for the user: https://developers.google.com/apps-script/guides/ui-service
Details on time driven triggers: https://developers.google.com/apps-script/guides/triggers/installable#time-driven_triggers

Using XMLHttpRequest to get words from another website

As of current I am learning to use JavaScript to create web applications. I have just finished developing a hangman game (code will be provided later on). I have used an array of words to get a random word to play with. But as a next step I want to use an XMLHttpRequest to get a random word from a separate website, I was wondering if someone could point me towards a tutorial or give me some information on how to start!
Thanks in advance!
<script type="text/javascript">
var myWords = new Array("first", "hello", "goodbye", "random", "word", "last");
var item = myWords[Math.floor(Math.random() * myWords.length)];
var length = item.length;
var guessedLetters = "";
var error = 0;
function partialWords(item, letters) {
var returnLetter = "";
for (i = 0; i < item.length; i++) {
if (letters.indexOf(item[i]) !== -1) {
returnLetter = returnLetter + item[i];
} else {
returnLetter = returnLetter + '_';
}
}
return returnLetter;
}
function load() {
var input = document.getElementById("hangmanID").value;
var myWords2 = (item.indexOf(input) >= 0);
if (myWords2 === false) {
console.log("That letter is not in the word");
document.getElementById("hangmanID").value = "";
document.getElementById("error").innerHTML = "That letter was wrong!";
document.getElementById("success").innerHTML = "";
error++;
if (error > 0) {
document.getElementById('hangmanImg').innerHTML = "<img src='assets/" + error + ".png'>";
} else {
document.getElementById('hangmanImg').innerHTML = "No Errors yet!";
}
} else {
console.log("That letter is correct");
var string = item.indexOf(input, 0);
console.log(string);
document.getElementById("hangmanID").value = "";
document.getElementById("success").innerHTML = "That letter was right!";
document.getElementById("error").innerHTML = "";
}
guessedLetters = guessedLetters + input;
document.getElementById('hangman').innerHTML = partialWords(item, guessedLetters);
document.getElementById("lettersUsed").innerHTML = guessedLetters;
}
</script>
UPDATE:
PLEASE NOTE THAT I AM ALLOWED TO USE JSONP
Due to same-origin-policy, XMLHttpRequest is not normally allowed to fetch data from other domains. There are work-arounds such as CORS or using a proxy on your domain or using an embedded flash or java applets.
However, JSONP is a different story. That's because JSONP does not technically return data. JSONP returns a javascript file. As such, getting data using JSONP simply requires you to add a script tag to your page:
<script src="http://other.server.com/path/to/jsonp/data"></script>
To do it programmatically:
var jsonp = document.createElement('script');
jsonp.src = "http://other.server.com/path/to/jsonp/data";
document.body.appendChild(jsonp);
The problem with this is that script tags don't return anything. To solve this, the JSONP protocol passes a function name to the server so that the server will wrap that function around the JSON data.
For example, if your regular JSON data looks like this:
{"result":"something"}
The JSONP equivalent would look something like this:
callback({"result":"something"})
So, to take the original example, our new code would now be:
function processResult (obj) {
console.log(obj);
}
var jsonp = document.createElement('script');
jsonp.src = "http://other.server.com/path/to/jsonp/data?jsonp=processResult";
document.body.appendChild(jsonp);
Notice how we're passing the name of the function to handle the return value in the query param of the URL.
Note that while in this example the parameter is "jsonp" the server may implement it using some other name. Another common one is "callback", as in callback=processResult. Read the API documentation of the server you're connecting to.

Categories

Resources