How to save the current webpage with casperjs/phantomjs?

Is there a way to save the current webpage by using CasperJS or PhantomJS?
I tried to get the HTML and save it into a file, but the resulting file looked a lot different from a screenshot taken at the same time (with casper.capture). Is there a way to do this?

Andrey Borisko suggested using the disk cache to retrieve the resources. My solution is not as efficient, but you don't need to decompress text files.
I use XMLHttpRequest to retrieve all resources after registering them with the resource.received event handler, and then filter the resources into images, CSS and fonts. The current limitation is that remote resource paths containing something like ../ or ./ are not handled correctly.
I retrieve the current page content with getHTML and iterate over all captured resources, replacing the path used in the markup (identified by a portion of the complete resource URL) with a randomly generated file name. The file extension is derived from the content type of the resource, converted using mimeType from this gist.
Since CSS files may contain background images or fonts, they have to be processed before being saved to disk. The provided loadResource function loads a resource but does not save it.
Since XMLHttpRequest is used to download the resources, the script has to be invoked with the --web-security=false flag:
casperjs script.js --web-security=false
script.js
var casper = require("casper").create();
var utils = require('utils');
var fs = require('fs');
var mimetype = require('./mimetype'); // URL provided below
var cssResources = [];
var imgResources = [];
var fontResources = [];
var resourceDirectory = "resources";
var debug = false;
fs.removeTree(resourceDirectory);
casper.on("remote.message", function(msg){
this.echo("remote.msg: " + msg);
});
casper.on("resource.error", function(resourceError){
this.echo("res.err: " + JSON.stringify(resourceError));
});
casper.on("page.error", function(pageError){
this.echo("page.err: " + JSON.stringify(pageError));
});
casper.on("downloaded.file", function(targetPath){
if (debug) this.echo("dl.file: " + targetPath);
});
casper.on("resource.received", function(resource){
// don't try to download data:* URI and only use stage == "end"
if (resource.url.indexOf("data:") != 0 && resource.stage == "end") {
if (resource.contentType == "text/css") {
cssResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("image/") == 0) {
imgResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("application/x-font-") == 0) {
fontResources.push({obj: resource, file: false});
}
}
});
// based on http://docs.casperjs.org/en/latest/modules/casper.html#download
casper.loadResource = function loadResource(url, method, data) {
"use strict";
this.checkStarted();
var cu = require('clientutils').create(utils.mergeObjects({}, this.options));
return cu.decode(this.base64encode(url, method, data));
};
function escapeRegExp(string) {
// from https://stackoverflow.com/a/1144788/1816580
return string.replace(/([.*+?^=!:${}()|\[\]\/\\])/g, "\\$1");
}
function replaceAll(find, replace, str) {
// from https://stackoverflow.com/a/1144788/1816580
return str.replace(find, replace);
}
var wrapFunctions = [
function wrapQuot1(s){
return '"' + s + '"';
},
function wrapQuot2(s){
return "'" + s + "'";
},
function csswrap(s){
return '(' + s + ')';
}
];
function findAndReplace(doc, resources, resourcesReplacer) {
// change page on the fly
resources.forEach(function(resource){
var url = resource.obj.url;
// don't download again
if (!resource.file) {
// set a random filename and download it **or** call the further processing which will in turn load and write to disk
resource.file = resourceDirectory+"/"+Math.random().toString(36).slice(2)+"."+mimetype.ext[resource.obj.contentType];
if (typeof resourcesReplacer != "function") {
if (debug) casper.echo("download resource (" + resource.obj.contentType + "): " + url + " to " + resource.file);
casper.download(url, resource.file, "GET");
} else {
resourcesReplacer(resource);
}
}
wrapFunctions.forEach(function(wrap){
// test the resource url (growing from the back) with a string in the document
var lastURL;
var lastRegExp;
var subURL;
// min length is 4 characters
for(var i = 0; i < url.length-5; i++) {
subURL = url.substring(i);
lastRegExp = new RegExp(escapeRegExp(wrap(subURL)), "g");
if (doc.match(lastRegExp)) {
lastURL = subURL;
break;
}
}
if (lastURL) {
if (debug) casper.echo("replace " + lastURL + " with " + resource.file);
doc = replaceAll(lastRegExp, wrap(resource.file), doc);
}
});
});
return doc;
}
function capturePage(){
// remove all <script> and <base> tags
this.evaluate(function(){
Array.prototype.forEach.call(document.querySelectorAll("script"), function(scr){
scr.parentNode.removeChild(scr);
});
Array.prototype.forEach.call(document.querySelectorAll("base"), function(scr){
scr.parentNode.removeChild(scr);
});
});
// TODO: remove all event handlers in html
var page = this.getHTML();
page = findAndReplace(page, imgResources);
page = findAndReplace(page, cssResources, function(cssResource){
var css = casper.loadResource(cssResource.obj.url, "GET");
css = findAndReplace(css, imgResources);
css = findAndReplace(css, fontResources);
fs.write(cssResource.file, css, "wb");
});
fs.write("page.html", page, "wb");
}
casper.start("http://www.themarysue.com/").wait(3000).then(capturePage).run(function(){
this.echo("DONE");
this.exit();
});
The magic happens in findAndReplace. capturePage is completely synchronous, so it can be dropped in anywhere without much headache.
URL for mimetype.js
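The gist link did not survive here; for reference, a hypothetical minimal stand-in that shows the interface the script expects (mimetype.ext maps a content type to a file extension); the real gist covers far more types:
// mimetype.js: hypothetical minimal stand-in for the gist-based module.
// The capture script only uses mimetype.ext[contentType] to pick a file extension.
exports.ext = {
    "text/css": "css",
    "text/html": "html",
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/gif": "gif",
    "image/svg+xml": "svg",
    "application/x-font-ttf": "ttf",
    "application/x-font-woff": "woff"
};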

No, I don't think there is an easy way to do this, as PhantomJS doesn't support rendering pages in MHT format (Render as a .mht file #10117). I believe that's what you wanted.
So it needs some work to accomplish this. I did something similar, but I was doing it the other way around: I had rendered HTML code that I was turning into an image/PDF through PhantomJS. I had to clean the file first, and it worked fine for me.
So, what I think you need to do is:
strip all JS calls, like script tags or onload attributes, etc.;
if you have access to the resources (CSS, images and so on) from your local machine, and you don't need authentication to the domain where you grab the page, then you need to change the relative paths in src attributes to absolute ones so that images etc. still load (see the sketch below);
if you don't have access to the resources when you open the page, then I think you need to implement a similar script to download those resources at the time PhantomJS loads the page and then point the src attributes to that folder, or maybe use data URIs.
You might need to change links in CSS files as well.
This will bring back the images/fonts and styling you are currently missing.
I'm sure there are more points. I'll update the answer if you need more info, once I see my code.
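For the second point, a minimal sketch (not part of the original answer) of rewriting relative paths to absolute ones from the page context with casper.evaluate; it only touches img and link tags and ignores url() references inside CSS:
casper.then(function(){
    this.evaluate(function(){
        // rewrite relative src/href attributes to absolute URLs so the saved
        // markup still points at the original, remote resources
        Array.prototype.forEach.call(document.querySelectorAll("img[src]"), function(img){
            img.setAttribute("src", img.src);   // the src property is already resolved to an absolute URL
        });
        Array.prototype.forEach.call(document.querySelectorAll("link[href]"), function(link){
            link.setAttribute("href", link.href); // same for stylesheet hrefs
        });
    });
});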

Related

upload file to google drive folder instead of user browser javascript/jsquery

My current script uses the code below to download a CSV file to local drive,
function table2CSV() {
var dataURL = '',
fieldSeparator = ',',
textField = '"',
lineSeparator = '\n',
regExpTesto = /(")/g,
regExp = /[";]/;
$('table tr').each(function() {
var dataRow = '';
if ($('input:checkbox', this).is(':checked') || $(this).is(':first-child'))
{
$('td', this).not(':last').each(function() {
var value = $(this).text();
if (dataRow !== '') dataRow += fieldSeparator;
if (regExp.test(value)) {
value = textField + value.replace(regExpTesto, '$1$1') + textField;
}
dataRow += value;
});
if (dataURL !== '') dataURL += lineSeparator;
dataURL += dataRow;
}
});
window.location.href = 'data:text/csv;charset=utf-8;base64,' + btoa(dataURL);
}
The download is done by this line, as far as I can tell:
window.location.href = 'data:text/csv;charset=utf-8;base64,' + btoa(dataURL);
I would like to have it uploaded/saved to a shared Google Drive folder instead.
I have seen similar questions on the forum but can't seem to see how to point the download to a shared Google Drive folder.
What code must be added/changed in order to achieve this?
Thank you
EDIT: Never mind, I just realised they say it does not support data URIs:
Data URIs and file:// URLs are not supported
I will keep this answer in case anyone else searches for non-data-URI uploading.
Having a quick look, I came upon Save to Drive.
While this might not be exactly what you want, it looks like an easy way to add the ability to save the contents of any URI (hopefully a data URI too) to Google Drive, like so:
<script src="https://apis.google.com/js/platform.js" async defer></script>
<div class="g-savetodrive"
data-src="//example.com/path/to/myfile.pdf"
data-filename="My Statement.pdf"
data-sitename="My Company Name">
</div>
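Since the Save to Drive button rules out data URIs, an alternative (not from the answer above, and assuming you already have an OAuth 2.0 access token with Drive scope plus the ID of the shared folder) is to POST the generated CSV text to the Drive v3 multipart upload endpoint; a hedged sketch:
// Hypothetical sketch: upload the generated CSV text straight into a shared Drive folder.
// accessToken and folderId are placeholders you would have to obtain yourself.
function uploadCsvToDrive(csvText, accessToken, folderId) {
    var metadata = { name: 'table.csv', mimeType: 'text/csv', parents: [folderId] };
    var boundary = 'csv_upload_boundary';
    var body =
        '--' + boundary + '\r\nContent-Type: application/json; charset=UTF-8\r\n\r\n' +
        JSON.stringify(metadata) + '\r\n' +
        '--' + boundary + '\r\nContent-Type: text/csv\r\n\r\n' +
        csvText + '\r\n--' + boundary + '--';
    return $.ajax({
        url: 'https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart',
        method: 'POST',
        headers: { 'Authorization': 'Bearer ' + accessToken },
        contentType: 'multipart/related; boundary=' + boundary,
        data: body,
        processData: false
    });
}
You would call it with the accumulated dataURL string from table2CSV (before the btoa step) instead of setting window.location.href.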

How to download stock price data only when it is not erroneous (404)?

The script downloads historic stock prices from finance.yahoo.com. An array of tickers is used to loop through the script, create links based on the ticker array, and download the data associated with each ticker. However, some of the ticker symbols are no longer in use, and as a result Yahoo delivers a 404 page instead of a CSV containing price information. The error page is then stored in a CSV and saved to my computer as well. To avoid downloading these files I am looking for the string 'Sorry, the page you requested was not found.', which is contained in each of Yahoo's error pages, as an indicator of a 404 page.
Behaviour of the code (output, see below the code):
The code runs through all tickers and downloads all stock price .csv files. This works fine for all tickers, but some ticker symbols are no longer used by Yahoo. For a ticker symbol that is no longer in use, the program downloads a .csv containing Yahoo's 404 page. All files (also the good ones containing actual data) are downloaded into the directory c:\Users\W7ADM\stock-price-leecher\data2.
Problem:
I would like the code to not download the 404 page into a CSV file, but simply do nothing in this case and move on to the next ticker symbol in the loop. I am trying to achieve this with the if-condition that looks for the string "Sorry, the page you requested was not found." displayed on Yahoo's 404 pages. In the end I hope to download the CSVs for all tickers that actually exist and save them to my HDD.
var url_begin = 'http://real-chart.finance.yahoo.com/table.csv?s=';
var url_end = '&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv';
var tickers = [];
var link_created = '';
var casper = require('casper').create({
pageSettings: {
webSecurityEnabled: false
}
});
casper.start('http://www.google.de', function() {
tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is retrievable, 0AM.DE is not
//loop through all ticker symbols
for (var i in tickers){
//create a link with the current ticker
link_created=url_begin + tickers[i] + url_end;
//check to see, if the created link returns a 404 page
this.open(link_created);
var content = this.getHTML();
//If is is a 404 page, jump to the next iteration of the for loop
if (content.indexOf('Sorry, the page you requested was not found.')>-1){
console.log('No Page found.');
continue; //At this point I want to jump to the next iteration of the loop.
}
//Otherwise download file to local hdd
else {
console.log(link_created);
this.download(link_created, 'stock-price-leecher\\data2\\'+tickers[i]+'.csv');
}
}
});
casper.run(function() {
this.echo('Ende...').exit();
});
The Output:
C:\Users\Win7ADM>casperjs spl_old.js
ADS.DE,0AM.DE
http://real-chart.finance.yahoo.com/table.csv?s=ADS.DE&a=00&b=1&c=1950&d=11&e=31
&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0AM.DE&a=00&b=1&c=1950&d=11&e=31
&f=2050&g=d&ignore=.csv
Ende...
C:\Users\Win7ADM>
casper.open is asynchronous (non-blocking), but you use it in a blocking fashion. You should use casper.thenOpen, which takes a callback that is invoked when the page is loaded, so you can work with the page there.
casper.start("http://example.com");
tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not
tickers.forEach(function(ticker){
var link_created = url_begin + ticker + url_end;
casper.thenOpen(link_created, function(){
console.log("open", link_created);
var content = this.getHTML();
if (content.indexOf('Sorry, the page you requested was not found.') > -1) {
console.log('No Page found.');
} else {
console.log("downloading...");
this.download(link_created, 'test14_'+ticker+'.csv');
}
});
});
casper.run();
Instead of using the thenOpen callback, you can also register for the page.resource.received event and download the resource there after checking the status. But then you wouldn't have access to ticker, so you either have to store it in a global variable or parse it from resource.url (a sketch of the parsing follows the snippet below).
var i = 0;
casper.on("page.resource.received", function(resource){
if (resource.stage === "end" && resource.status === 200) {
this.download(resource.url, 'test14_'+(i++)+'.csv');
}
});
casper.start("http://example.com");
tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not
tickers.forEach(function(ticker){
var link_created = url_begin + ticker + url_end;
casper.thenOpen(link_created);
});
casper.run();
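If you prefer file names based on the ticker rather than a counter, a hedged sketch of parsing it back out of resource.url, assuming the Yahoo URL format above where the ticker sits in the s= query parameter:
casper.on("page.resource.received", function(resource){
    if (resource.stage === "end" && resource.status === 200) {
        // pull the ticker back out of the "s=" query parameter of the request URL
        var match = /[?&]s=([^&]+)/.exec(resource.url);
        var ticker = match ? decodeURIComponent(match[1]) : "unknown";
        this.download(resource.url, 'test14_' + ticker + '.csv');
    }
});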
I don't think you should do this with open or thenOpen. It may work in PhantomJS, but probably not in SlimerJS.
I actually tried it, and your page is strange in that the download doesn't succeed. You can load some dummy page like example.com, download the CSV files yourself using __utils__.sendAJAX (it is only accessible from the page context) and write them using the fs module. You should only write a file when it doesn't contain the specific 404 error page text that you identified:
casper.start("http://example.com");
casper.then(function(){
tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not
tickers.forEach(function(ticker){
var link_created = url_begin + ticker + url_end;
var content = casper.evaluate(function(url){
return __utils__.sendAJAX(url, "GET");
}, link_created);
console.log("len: ", content.length);
if (content.indexOf('Sorry, the page you requested was not found.') > -1) {
console.log('No Page found.');
} else {
console.log("writing...");
fs.write('test14_'+ticker+'.csv', content);
}
});
});
casper.run();

Use Javascript to scan a local directory and then update an Anchor tag with the "latest" file

I'm trying to create a simple HTML page that will basically load up some JavaScript and check my directory (../Files).
The Files folder contains:
file_001.txt
file_002.txt
file_003.txt
I then want the JavaScript to use the latest one (file_003.txt) and update an anchor tag with the id of "file_download".
Would anyone by any chance have an idea how to do this?
The reason is, let's say I have a terms and conditions PDF file named T&C_001.pdf, and down the line a new terms and conditions is released. We keep T&C_001.pdf for any old records and upload T&C_002.pdf. This way it won't need any HTML knowledge or even JavaScript: the owner of the site would just need to add a new file with 002 on the end.
Though not fully accurate, you could try:
<script type="text/javascript">
function getLatest(location, filename, ext, readyCallback, index) {
var tgIndex = typeof index === 'undefined' ? 1 : index;
$.ajax({
type: 'HEAD',
url: location + '/' + filename + '-' + tgIndex + '.' + ext,
success: function() {
getLatest(location, filename, ext, readyCallback, tgIndex + 1);
},
error: function() {
if (tgIndex > 1) {
readyCallback(location + '/' + filename + '-' + (tgIndex-1) + '.' + ext);
return;
}
readyCallback();
}
});
}
getLatest('linkToSite', 'file', 'txt', function(filename) {
if (typeof filename === 'undefined') {
alert('No file found on the location');
}
// do something with returned filename
});
</script>
which tries to check whether a file with the next index exists; when it doesn't, the previous index was the latest one. If the first one fails, there is no file at the specified location.
This answer doesn't handle zero-padded numbering and instead expects the files to be named:
- file-1.txt, file-2.txt, ...
This example assumes jQuery :)
You can find a fiddle of it here:
http://jsfiddle.net/Icepickle/JL5Su/

Download file: ACCESS DENIED

I keep on getting "ACCESS DENIED" after hitting my download button.
I already have full control on the specified folder.
I use this in jquery.
function DownloadFile(ProductNumber, File)
{
var windowSizeArray = ["width=400,height=400",
"width=500,height=600,scrollbars=yes"];
File = "C:/Documents and Settings/My PC/My Documents/" + File;
if (File != "")
{
var windowName = "popUp";
var windowSize = windowSizeArray[$(this).attr("rel")];
var exist = isExists(File);
if (exist)
{
window.open(File, windowName, windowSize);
}
else
{
ShowAlertMessage("The file for Product no. <a href='" + File + "' target='blank'>" + ProductNumber+ "</a> does not exist.");
}
}
else
{
ShowAlertMessage("No PDF file for Product no: " + ProductNumber+ ".");
}
}
You can't access local files like you do in your snippet.
You have to upload the file to the server and use PHP or another server-side language to serve it. jQuery (or JavaScript) only runs in the browser and does not have access to files outside it. Server-side web languages only have access to files located on the server (or on other servers, using file_get_contents or cURL).
Your code looks like it came from C#/Java source; those can access such local files.
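The answer mentions PHP; purely as an illustration (and not part of the original answer), a hedged Node/Express sketch of the same idea, with a hypothetical documents folder and /download route on the server:
// Hypothetical server-side sketch: the browser requests /download/<file>
// and the server streams that file back as an attachment.
var express = require('express');
var path = require('path');
var app = express();

app.get('/download/:file', function (req, res) {
    // resolve against a fixed folder on the server; basename() blocks path traversal
    var filePath = path.join(__dirname, 'documents', path.basename(req.params.file));
    res.download(filePath, function (err) {
        if (err && !res.headersSent) {
            res.status(404).send('The file does not exist.');
        }
    });
});

app.listen(3000);
On the browser side you would then simply open '/download/' + encodeURIComponent(File) instead of pointing at a local path.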

Temporarily unzip a file to view contents within a browser

I want to unzip a file that contains an HTML page plus CSS and JS directories. I want to unzip it temporarily and view the HTML, preferably in an iframe. I am using JSZip, which is working. I got the HTML to load, but how do I add the image, JS, and CSS folders into the iframe?
Here is what I have so far...
<div id="jszip_utils"></div>
<iframe id="iframe"></iframe>
<script type="text/javascript">
function showError(elt, err) {
elt.innerHTML = "<p class='alert alert-danger'>" + err + "</p>";
}
function showContent(elt, content) {
elt.innerHTML = "<p class='alert alert-success'>loaded !<br/>" +
"Content = " + content + "</p>";
}
var htmltext = JSZipUtils.getBinaryContent("/zip/myWebsite.zip", function (err, data) {
var elt = document.getElementById('jszip_utils');
if (err) {
showError(elt, err);
return;
}
try {
JSZip.loadAsync(data)
.then(function (zip) {
for(var name in zip.files) {
if (name.substring(name.lastIndexOf('.') + 1) === "html") {
return zip.file(name).async("string");
}
}
return zip.file("").async("string");
})
.then(function success(text) {
$('#iframe').contents().find('html').html(text);
showContent(elt, text);
}, function error(e) {
showError(elt, e);
});
} catch(e) {
showError(elt, e);
}
});
</script>
This gets the HTML, but the JS, CSS and image files are not showing up. I believe I need to do some sort of fake routing, but I'm not sure how to do that. Thanks for your help.
If the html/js in the zip is not too complicated, for instance an AngularJS app that has routes for partials, this is possible.
The trick is to replace the css/js/img src and href URLs that point to a file in the zip with either:
Object URL: URL.createObjectURL(Blob or File object) (see the sketch after step 3)
Data URL: data:[<mediatype>][;base64],<data>
Or, in the case of JS and CSS, inject the content directly into the appropriate element.
After replacing the src/href references, just inject the new HTML into the iframe.
Step 1: Parse the html so you can manipulate it
//html from a call like zip.file("index.html").async("string")
let parser = new DOMParser;
let doc = parser.parseFromString(html,"text/html");
Step 2: Find all elements with a relative path (e.g. /imgs/img.jpg); they are easier to deal with because you can use that path directly with zip.file
//Find all resource elements, then keep only those whose href/src attribute starts with '/'
var elements = jQuery("link[href],script[src],img[src]",doc).filter(function(){
return /^\//.test(this.getAttribute("href") || this.getAttribute("src"));
});
Step 3: Replace src,href with object url, data url, or direct content
//assume element is the html element: <script src="/js/main.js"></script>
zip.file(element.getAttribute("src").replace(/^\//, "")).async("string").then(jsText=>{
element.src = "data:text/javascript,"+encodeURIComponent(jsText);
});
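For binary assets such as images, an object URL (the first option above) is usually a better fit than a data URL. A hedged sketch along the same lines, assuming the zip entry paths have no leading slash:
//assume element is the html element: <img src="/imgs/logo.png">
zip.file(element.getAttribute("src").replace(/^\//, "")).async("blob").then(blob=>{
    // createObjectURL yields a short blob: URL backed by the in-memory file
    element.src = URL.createObjectURL(blob);
});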
Step 4: Get the new html text and inject it into the iframe
let newHTML = doc.documentElement.outerHTML;
var viewer = document.querySelector('#iframeID');
viewer = viewer.contentWindow || viewer.contentDocument.document || viewer.contentDocument;
viewer.document.open();
viewer.document.write(newHTML);
viewer.document.close();
JSFiddle Demo - Demonstrates replacing the src/href urls
As a security note, if you are using zip files whose contents you do not know, you should run the whole app in a sandboxed iframe.
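A minimal sketch of that, assuming the frame only needs to run its own scripts: with the sandbox attribute set (and without allow-same-origin) the parent can no longer reach contentDocument, so srcdoc replaces the document.write approach:
var viewer = document.querySelector('#iframeID');
// scripts may run, but the frame gets a unique origin and cannot touch the parent page
viewer.setAttribute('sandbox', 'allow-scripts');
viewer.srcdoc = newHTML;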
