I have written code for a small Chrome Extension that scrapes content based on selectors. Once I have the content I need, I push it to my PHP server in order to store it in a database. The code works well except when content is loaded dynamically, I can only see what is visible the source code from the browser.
I thought I could solve the issue by waiting until the page was fully loaded (setInterval) but even when I see that the content is loaded, the source code I'm getting is still the initial one.
I use the following code to create a tab and retrieve the source code:
function CreateTab (createProperties)
{
chrome.tabs.create(createProperties, tab =>
{
if (chrome.runtime.lastError)
{
console.log(chrome.runtime.lastError);
}
else
{
container = "h2";
var i = 0;
var checkClass = setInterval (function()
{
if (i<=3)
{
console.log('Iteration :'+i);
chrome.tabs.executeScript(tab.id,
{
file: 'GetSource.js',
}, async function(results)
{
let e = chrome.runtime.lastError;
if(e !== undefined)
{
console.log(tab.id, e);
}
// GETTING HTML
parser = new DOMParser();
content = parser.parseFromString(results, "text/html");
console.log(content);
// CAPTURE TITLES
try
{
datatable = content.querySelectorAll(container);
}
catch(err)
{
console.log('Iteration '+i+' table not found');
datatable = "";
}
if (datatable.length>0)
{
clearInterval(checkClass);
removeTab(tab.id,3)
// DATA FOUND!
i=10;
}
});
i++;
}
else
{
clearInterval(checkClass);
console.log('Container not found');
removeTab(tab.id,5)
}
},10000);
}
});
}
The script I'm calling to get the source is the following (GetSource.js)
function DOMtoString(document_root) {
var html = '',
node = document_root.firstChild;
while (node) {
switch (node.nodeType) {
case Node.ELEMENT_NODE:
html += node.outerHTML;
break;
case Node.TEXT_NODE:
html += node.nodeValue;
break;
case Node.CDATA_SECTION_NODE:
html += '<![CDATA[' + node.nodeValue + ']]>';
break;
case Node.COMMENT_NODE:
html += '<!--' + node.nodeValue + '-->';
break;
case Node.DOCUMENT_TYPE_NODE:
// (X)HTML documents are identified by public identifiers
html += "<!DOCTYPE " + node.name + (node.publicId ? ' PUBLIC "' + node.publicId + '"' : '') + (!node.publicId && node.systemId ? ' SYSTEM' : '') + (node.systemId ? ' "' + node.systemId + '"' : '') + '>\n';
break;
}
node = node.nextSibling;
}
return html;
}
sourcecode = DOMtoString (document);
sourcecode
How can I make sure that the GetSource.js script is getting the final rendered source code? There are more and more sites using dynamic content loaded after the initial load and with this script I'm stuck on those sites.
There are ready to use scrapers in the Chrome store that can get dynamic content so there must be a solution for my script too. I don't want to use to solutions as I need to retrieve the selectors I want to catch from my server and once the scraping is done pushing them back to the server.
Do you have any idea of how I could achieve this?
Thanks
Related
I am using EOWEBROWSER to send some data and get back the data. This is the C# code I'm using on back-end:
void WebView_JSExtInvoke(object sender, JSExtInvokeArgs e)
{
switch (e.FunctionName)
{
case "applicationHeartbeat":
heartBeats.Enqueue("Heart beat received at: " + DateTime.Now);
heartBeatCount++;
string js = #"var span = document.getElementById('beatSpan');
if(span !== null){
span.parentNode.removeChild(span);
}
var counter = 1;
span = document.createElement('span');
span.setAttribute('id', 'beatSpan');
span.innerHTML = '" + heartBeatCount + #" Heart Beats.<br /> Last heart beat receieved ';
var childSpan = document.createElement('span');
childSpan.setAttribute('id', 'count');
span.appendChild(childSpan);
if(document.getElementsByTagName('div').length > 0){
document.getElementsByTagName('div')[0].appendChild(span)
}
else{
document.body.appendChild(span);
}
if(timer !== null) clearInterval(timer);
var timer = setInterval(function(){document.getElementById('count').innerHTML = ++counter + ' sec ago'}, 1000);";
m_CurPage.WebView.EvalScript(js);
break;
case "notifyErrorInfo":
m_CurPage.WebView.EvalScript("alert('"+ e.Arguments[0] + "')");
break;
case "notifyFlashingCompleted":
m_CurPage.WebView.EvalScript("alert('Flashing Completed notification received')");
break;
}
}
On the front-end I am using javascript. This is how I invoke eoWebBrowser:
heartbeatInterval = setInterval(function() {
window.eval(eoWebBrowser.extInvoke("applicationHeartbeat", {}));
},5000)
This javascript function will help me call that C# function. I can also see the results on the frontend webpage. Now I am trying to create a log file on javascript side, the function must have the following details:
look for the textfile
if file exists append whatever I am sending from c# to that text file
if file doesnt exist create a file and then append what ever the data I am sending from c# side
When the pageload is done, the text has to be created and the results from c# eobrowser must append to the JavaScript created text file. I'm using pure javascript only, no jquery or angular frameworks.
I am using qzprint API for printing labels in my open cart extension. Everything was working fine but suddenly it stopped working on FF. In Internet explorer it works fine. If i add alerts in my functions of applet it works fine on firefox as well but not sure why not with out alerts. here is my code.
calling applet functions in my header.tpl
<script type="text/javascript">
deployQZ('<?php echo HTTP_CATALOG ?>');
useDefaultPrinter();
<script>
Applet file containing functions
function deployQZ(path) {
//alert("alert for printing label");
pathApplet = path + 'java/qz-print.jar';
pathJnlp = path + 'java/qz-print_jnlp.jnlp';
var attributes = {id: "qz", code:'qz.PrintApplet.class',
archive: pathApplet, width:1, height:1};
var parameters = {jnlp_href: pathJnlp,
cache_option:'plugin', disable_logging:'false',
initial_focus:'false'};
if (deployJava.versionCheck("1.7+") == true) {}
else if (deployJava.versionCheck("1.6+") == true) {
attributes['archive'] = 'java/jre6/qz-print.jar';
parameters['jnlp_href'] = 'java/jre6/qz-print_jnlp.jnlp';
}
deployJava.runApplet(attributes, parameters, '1.5');
}
/**
* Automatically gets called when applet has loaded.
*/
function qzReady() {
// Setup our global qz object
window["qz"] = document.getElementById('qz');
//var title = document.getElementById("title");
if (qz) {
try {
//title.innerHTML = title.innerHTML + " " + qz.getVersion();
//document.getElementById("content").style.background = "#F0F0F0";
} catch(err) { // LiveConnect error, display a detailed meesage
document.getElementById("content").style.background = "#F5A9A9";
alert("ERROR: \nThe applet did not load correctly. Communication to the " +
"applet has failed, likely caused by Java Security Settings. \n\n" +
"CAUSE: \nJava 7 update 25 and higher block LiveConnect calls " +
"once Oracle has marked that version as outdated, which " +
"is likely the cause. \n\nSOLUTION: \n 1. Update Java to the latest " +
"Java version \n (or)\n 2. Lower the security " +
"settings from the Java Control Panel.");
}
}
}
/**
* Returns is the applet is not loaded properly
*/
function isLoaded() {
if (!qz) {
alert('Error:\n\n\tPrint plugin is NOT loaded!');
return false;
} else {
try {
if (!qz.isActive()) {
alert('Error:\n\n\tPrint plugin is loaded but NOT active!');
return false;
}
} catch (err) {
alert('Error:\n\n\tPrint plugin is NOT loaded properly!');
return false;
}
}
return true;
}
function useDefaultPrinter() {
//alert("alert for printing label");
if (isLoaded()) {
// Searches for default printer
qz.findPrinter();
// Automatically gets called when "qz.findPrinter()" is finished.
window['qzDoneFinding'] = function() {
// Alert the printer name to user
var printer = qz.getPrinter();
//alert(printer !== null ? 'Default printer found: "' + printer + '"':
//'Default printer ' + 'not found');
document.getElementById("name_printer").innerHTML = 'Default printer found: "' + printer + '"';
// Remove reference to this function
window['qzDoneFinding'] = null;
defaultFound = true;
};
}
}
As u can see in my deployqz() and usedefaultprinter() functions i have alert on first line which is in comments if its commented it doesn't work in fire fox and if not commented than it works fine. With comments i get alert message from isLoaded() function "Print plugin is NOT loaded properly!".
Also in my console i get this
An unbalanced tree was written using document.write() causing data from the network to be reparsed. For more information https://developer.mozilla.org/en/Optimizing_Your_Pages_for_Speculative_Parsing
Try this:
If the qzReady is called by the applet when ready, put useDefaultPrinter inside that function.
if isLoaded takes some time, call useDefaultPrinter in there too using setTimeout
Like this
<script type="text/javascript">
deployQZ('<?php echo HTTP_CATALOG ?>');
<script>
Applet file containing functions
var qz;
function deployQZ(path) {
pathApplet = path + 'java/qz-print.jar';
pathJnlp = path + 'java/qz-print_jnlp.jnlp';
var attributes = {id: "qz", code:'qz.PrintApplet.class',
archive: pathApplet, width:1, height:1};
var parameters = {jnlp_href: pathJnlp,
cache_option:'plugin', disable_logging:'false',
initial_focus:'false'};
if (deployJava.versionCheck("1.7+") == true) {}
else if (deployJava.versionCheck("1.6+") == true) {
attributes['archive'] = 'java/jre6/qz-print.jar';
parameters['jnlp_href'] = 'java/jre6/qz-print_jnlp.jnlp';
}
deployJava.runApplet(attributes, parameters, '1.5');
}
/**
* Automatically gets called when applet has loaded.
*/
function qzReady() {
// Setup our global qz object
qz = document.getElementById('qz');
if (qz) {
try {
useDefaultPrinter();
} catch(err) { // LiveConnect error, display a detailed meesage
document.getElementById("content").style.background = "#F5A9A9";
alert("ERROR: \nThe applet did not load correctly. Communication to the " +
"applet has failed, likely caused by Java Security Settings. \n\n" +
"CAUSE: \nJava 7 update 25 and higher block LiveConnect calls " +
"once Oracle has marked that version as outdated, which " +
"is likely the cause. \n\nSOLUTION: \n 1. Update Java to the latest " +
"Java version \n (or)\n 2. Lower the security " +
"settings from the Java Control Panel.");
}
}
else { setTimeout(useDefaultPrinter,300); }
}
/**
* Returns is the applet is not loaded properly
*/
function isLoaded() {
if (!qz) {
alert('Error:\n\n\tPrint plugin is NOT loaded!');
return false;
} else {
try {
if (!qz.isActive()) {
alert('Error:\n\n\tPrint plugin is loaded but NOT active!');
return false;
}
} catch (err) {
alert('Error:\n\n\tPrint plugin is NOT loaded properly!');
return false;
}
}
return true;
}
function useDefaultPrinter() {
//alert("alert for printing label");
if (isLoaded()) {
// Searches for default printer
qz.findPrinter();
// Automatically gets called when "qz.findPrinter()" is finished.
window['qzDoneFinding'] = function() {
// Alert the printer name to user
var printer = qz.getPrinter();
//alert(printer !== null ? 'Default printer found: "' + printer + '"':
//'Default printer ' + 'not found');
document.getElementById("name_printer").innerHTML = 'Default printer found: "' + printer + '"';
// Remove reference to this function
window['qzDoneFinding'] = null;
defaultFound = true;
};
}
else { setTimeout(useDefaultPrinter,300); }
}
I am retrieving data from Parse in my Wordpress page fine. I can append the names of my objects into a page in Wordpress, but as soon as a try to append a HTML tag like <h1> or <p>, I get an error within my Chrome console:
Uncaught SyntaxError: Unexpected token ILLEGAL
Below is the code within my wordpress page that works without errors:
<div id="list1"><h1>Beer List</h1></div>
<div id="list2"><h1>Tap List</h1></div>
<script type="text/javascript">
if (typeof jQuery != 'undefined') {
Parse.initialize("", "");
var Objs = Parse.Object.extend("Obj");
var query = new Parse.Query(Objs);
query.ascending("name");
query.find({
success: function(results) {
var obj1String = '';
var obj2String = '';
for(var i=0;i<results.length;i++)
{
var object = results[i];
obj1String= obj1String +' '+object.get('name')+'</br>';
if(object.get('isObj2') == true){
obj2String = obj2String +' '+object.get('name')+'</br>';
}
}
jQuery( "#list1" ).append( obj1String );
jQuery( "#list2" ).append( obj2String );
},
error: function(error) {
alert("Error: " + error.code + " " + error.message);
}
});
}
</script>
But when I add, for example a tag to one of my objStrings, I get the error. i.e.:
obj1String= obj1String +'<h1>'+object.get('name')+'</h1></br>';
Here is how the page is rendering, any why the error is happening. It seems to be adding a line break when it sees those tags:
for(var i=0;i<results.length;i++)
{
var object = results[i];
obj1String= obj1String +' //line break added here
<h1>'+object.get('name')+'</h1>
<p></br>';
if(object.get('isObj2') == true){
objString = obj2String +' '+object.get('name')+'</br>';
}
I have see other threads for this error (i.e. here). But I could not get any suggestions there to work.
Any help would be greatly appreciated, thanks!
I've had a similar problem to this before and I fixed it by making sure all the HTML tags in the JavaScript are on the same line.
As for wordpress template files you can read more about it here http://codex.wordpress.org/Page_Templates. Simply it will allow you to use a PHP file as a template to display that page.
Is there a way to save the current webpage by using casperjs or phantomjs?
I tried to get the html and save it into a file. But the resulting file was a lot different from the screenshot of that time (with casper.capture). Is there a way to save the current webpage?
Andrey Borisko suggested to use the disk cache to retrieve the resources. My solution is not that efficient, but you don't need to decompress text files.
I use XMLHttpRequest to retrieve all resources after I registered them with the resource.received event handler. I then filter the resources into images, css and fonts. The current limitation is that remote resource paths that contain something like ../ or ./ are not handled correctly.
I retrieve the current page content with getHTML and iterate over all captured resources to replace the path used in the markup, that is identified by a portion of the complete resource URL, with a randomly generated file name. The file extension is created from the content type of the resource. It is converted using mimeType from this gist.
Since CSS files may contain background images or fonts, they have to be processed before saving to disk. The provided loadResource function loads the resource, but does not save it.
Since XMLHttpRequest to download the resources the script has to be invoked with the --web-security=false flag:
casperjs script.js --web-security=false
script.js
var casper = require("casper").create();
var utils = require('utils');
var fs = require('fs');
var mimetype = require('./mimetype'); // URL provided below
var cssResources = [];
var imgResources = [];
var fontResources = [];
var resourceDirectory = "resources";
var debug = false;
fs.removeTree(resourceDirectory);
casper.on("remote.message", function(msg){
this.echo("remote.msg: " + msg);
});
casper.on("resource.error", function(resourceError){
this.echo("res.err: " + JSON.stringify(resourceError));
});
casper.on("page.error", function(pageError){
this.echo("page.err: " + JSON.stringify(pageError));
});
casper.on("downloaded.file", function(targetPath){
if (debug) this.echo("dl.file: " + targetPath);
});
casper.on("resource.received", function(resource){
// don't try to download data:* URI and only use stage == "end"
if (resource.url.indexOf("data:") != 0 && resource.stage == "end") {
if (resource.contentType == "text/css") {
cssResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("image/") == 0) {
imgResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("application/x-font-") == 0) {
fontResources.push({obj: resource, file: false});
}
}
});
// based on http://docs.casperjs.org/en/latest/modules/casper.html#download
casper.loadResource = function loadResource(url, method, data) {
"use strict";
this.checkStarted();
var cu = require('clientutils').create(utils.mergeObjects({}, this.options));
return cu.decode(this.base64encode(url, method, data));
};
function escapeRegExp(string) {
// from https://stackoverflow.com/a/1144788/1816580
return string.replace(/([.*+?^=!:${}()|\[\]\/\\])/g, "\\$1");
}
function replaceAll(find, replace, str) {
// from https://stackoverflow.com/a/1144788/1816580
return str.replace(find, replace);
}
var wrapFunctions = [
function wrapQuot1(s){
return '"' + s + '"';
},
function wrapQuot2(s){
return "'" + s + "'";
},
function csswrap(s){
return '(' + s + ')';
}
];
function findAndReplace(doc, resources, resourcesReplacer) {
// change page on the fly
resources.forEach(function(resource){
var url = resource.obj.url;
// don't download again
if (!resource.file) {
// set random filename and download it **or** call further processing which in turn will load ans write to disk
resource.file = resourceDirectory+"/"+Math.random().toString(36).slice(2)+"."+mimetype.ext[resource.obj.contentType];
if (typeof resourcesReplacer != "function") {
if (debug) casper.echo("download resource (" + resource.obj.contentType + "): " + url + " to " + resource.file);
casper.download(url, resource.file, "GET");
} else {
resourcesReplacer(resource);
}
}
wrapFunctions.forEach(function(wrap){
// test the resource url (growing from the back) with a string in the document
var lastURL;
var lastRegExp;
var subURL;
// min length is 4 characters
for(var i = 0; i < url.length-5; i++) {
subURL = url.substring(i);
lastRegExp = new RegExp(escapeRegExp(wrap(subURL)), "g");
if (doc.match(lastRegExp)) {
lastURL = subURL;
break;
}
}
if (lastURL) {
if (debug) casper.echo("replace " + lastURL + " with " + resource.file);
doc = replaceAll(lastRegExp, wrap(resource.file), doc);
}
});
});
return doc;
}
function capturePage(){
// remove all <script> and <base> tags
this.evaluate(function(){
Array.prototype.forEach.call(document.querySelectorAll("script"), function(scr){
scr.parentNode.removeChild(scr);
});
Array.prototype.forEach.call(document.querySelectorAll("base"), function(scr){
scr.parentNode.removeChild(scr);
});
});
// TODO: remove all event handlers in html
var page = this.getHTML();
page = findAndReplace(page, imgResources);
page = findAndReplace(page, cssResources, function(cssResource){
var css = casper.loadResource(cssResource.obj.url, "GET");
css = findAndReplace(css, imgResources);
css = findAndReplace(css, fontResources);
fs.write(cssResource.file, css, "wb");
});
fs.write("page.html", page, "wb");
}
casper.start("http://www.themarysue.com/").wait(3000).then(capturePage).run(function(){
this.echo("DONE");
this.exit();
});
The magic happens in findAndReplace. capturePage is completely synchronous so it can be dropped anywhere without much head ache.
URL for mimetype.js
No, I don't think there is an easy way to do this as phantomjs doesn't support rendering pages in mht format (Render as a .mht file #10117). I believe that's what you wanted.
So, it needs some work to accomplish this. I did something similar, but i was doing it the other way around I had a rendered html code that I was rendering into image/pdf through phantomjs. I had to clean the file first and it worked fine for me.
So, what I think you need to do is:
strip all js calls, like script tags or onload attributes, etc..
if you have access from local to the resources like css, images and so on (and you don't need authentication to that domain where you grab the page) than you need to change relative paths of src attributes to absolute to load images/etc.
if you don't have access to the resources when you open the page then I think you need to implement similar script to download those resources at the time phantomjs loads the page and then redirect src attributes to that folder or maybe use data uri.
You might need to change links in css files as well.
This will bring up the images\fonts and styling you are missing currently.
I'm sure there are more points. I'll update the answer if you need more info, once I see my code.
I want to unzip a file that contains an html page, css, and js directories. I want to unzip this temporarily and view the html in an iFrame, preferrably. I am using jszip which is working. I got the html to load, but how do I add the image, js, and css folders into the iFrame?
Here is what I have so far...
<div id="jszip_utils"></div>
<iframe id="iframe"></iframe>
<script type="text/javascript">
function showError(elt, err) {
elt.innerHTML = "<p class='alert alert-danger'>" + err + "</p>";
}
function showContent(elt, content) {
elt.innerHTML = "<p class='alert alert-success'>loaded !<br/>" +
"Content = " + content + "</p>";
}
var htmltext = JSZipUtils.getBinaryContent("/zip/myWebsite.zip", function (err, data) {
var elt = document.getElementById('jszip_utils');
if (err) {
showError(elt, err);
return;
}
try {
JSZip.loadAsync(data)
.then(function (zip) {
for(var name in zip.files) {
if (name.substring(name.lastIndexOf('.') + 1) === "html") {
return zip.file(name).async("string");
}
}
return zip.file("").async("string");
})
.then(function success(text) {
$('#iframe').contents().find('html').html(text);
showContent(elt, text);
}, function error(e) {
showError(elt, e);
});
} catch(e) {
showError(elt, e);
}
});
</script>
This gets the html, but the js css and image files are not showing up. I believe I need to do some sort of fake routing, but I'm not sure how I would be able to do that. Thanks for your help.
If the html/js in the zip is not too complicated, for instance an AngularJS app that has routes for partials, this is possible.
The trick is to replace css,js,img src/href urls that point to a file in the zip with either:
Object Url: URL.createObjectURL(Blob or File object);
Data Url: data:[<mediatype>][;base64],<data>
Or in the case of js and css inject the content directly into the appropriate element
After replacing the src/href references than just inject the new html into the iframe.
Step 1: Parse the html so you can manipulate it
//html from a call like zip.file("index.html").async("string")
let parser = new DOMParser;
let doc = parser.parseFromString(html,"text/html");
Step 2: Find all elements with a relative path (e.g. /imgs/img.jpg) as they are easier to deal with as you can then use that path for zip.file
//Simply finds all resource elements, then filters all that dont start with '/'
var elements = jQuery("link[href],script[src],img[src]",doc).filter(function(){
return /^\//.test(this.href || this.src);
});
Step 3: Replace src,href with object url, data url, or direct content
//assume element is the html element: <script src="/js/main.js"></script>
zip.file(element.src).async("string").then(jsText=>{
element.src = "data:text/javascript,"+encodeURIComponent(jsText);
});
Step 4: Get the new html text and inject it into the iframe
let newHTML = doc.documentElement.outerHTML;
var viewer = document.querySelector('#iframeID');
viewer = viewer.contentWindow || viewer.contentDocument.document || viewer.contentDocument;
viewer.document.open();
viewer.document.write(html);
viewer.document.close();
JSFiddle Demo - Demonstrates replacing the src/href urls
As a security note, if you are using zip files that you do not know the contents of, you should run the whole app in a protected iframe