Block existing scripts - javascript

I am making an addon for firefox. I want to extract a video from a HTML page and display it on a black background. Here is what i've got.
//main.js
var pageMod = require("page-mod");
pageMod.add(new pageMod.PageMod({
include: "http://myserver.fr/*",
contentStyleFile: data.url("modify.css"),
contentScriptFile: data.url('hack.js'),
contentScriptWhen: 'start'
}));
//hack.js
video = document.body.innerHTML;
document.body.innerHTML = '';
video = video.substring(video.lastIndexOf("<object"),video.lastIndexOf("</object>"));
video = "<div id='fond'></div><div id='mavideo'>"+video+"</div>"
document.body.innerHTML = video;
document.body.style.backgroundColor = "black";
document.body.style.margin = "0";
My code works but the probleme is that I have to wait "for hours" while the other javascript is beeing executed. I've tried to use contentScriptWhen: 'start' but i dosent change a thing.
Any idea to block the other script of the page ?

Blocking scripts from loading is going to be a little difficult, we can quickly remove them instead.
// remove all scripts
var scripts = document.getElementsByTagName("script");
var parent = null;
for (var i = 0; i < scripts.length; i += 1) {
parent = scripts.item(i).parentNode;
parent.removeChild(scripts.item(i));
}
This code tries to remove all the script tags from your page which would stop the loading of any scripts and allows you to run the rest of your code. Put this at the top of your hack.js script.
I don't know the page you're looking at so it's hard to know what else is going on but your use of substring, and lastIndexOf aren't going to be very fast at all either. We can get rid of those and see if you get any noticible speed increase.
Try using query selectors and Document Fragments instead. Here's an example:
//hack.js
var fragment = document.createDocumentFragment();
var objects = document.getElementsByTagName("object");
// create your div objects and append to the fragment
var fond = document.createElement("div");
fond.setAttribute("id", "fond");
fragment.appendChild(fond);
var mavideo = document.createElement("div");
mavideo.setAttribute("id", "mavideo");
fragment.appendChild(mavideo);
// append all <object> tags to your video div
for (var i = 0; i < objects.length; i += 1) {
mavideo.appendChild(objects.item(i));
}
// clear and append it all in
document.body.innerHTML = '';
document.body.appendChild(fragment);
That code throws all the objects into a document fragment so you can wipe the whole page away and then append it all back in; no libraries required. Your divs fond & mavideo are in there as well.
I didn't really test all this code out so hopefully it works as expected.

Related

How do I load an HTML file (Which has another js file linked) using an XMLHttpRequest so that the javascript from the HTML page loaded still works? [duplicate]

I tried to load some scripts into a page using innerHTML on a <div>. It appears that the script loads into the DOM, but it is never executed (at least in Firefox and Chrome). Is there a way to have scripts execute when inserting them with innerHTML?
Sample code:
<!DOCTYPE html>
<html>
<body onload="document.getElementById('loader').innerHTML = '<script>alert(\'hi\')<\/script>'">
Shouldn't an alert saying 'hi' appear?
<div id="loader"></div>
</body>
</html>
Here is a method that recursively replaces all scripts with executable ones:
function nodeScriptReplace(node) {
if ( nodeScriptIs(node) === true ) {
node.parentNode.replaceChild( nodeScriptClone(node) , node );
}
else {
var i = -1, children = node.childNodes;
while ( ++i < children.length ) {
nodeScriptReplace( children[i] );
}
}
return node;
}
function nodeScriptClone(node){
var script = document.createElement("script");
script.text = node.innerHTML;
var i = -1, attrs = node.attributes, attr;
while ( ++i < attrs.length ) {
script.setAttribute( (attr = attrs[i]).name, attr.value );
}
return script;
}
function nodeScriptIs(node) {
return node.tagName === 'SCRIPT';
}
Example call:
nodeScriptReplace(document.getElementsByTagName("body")[0]);
You have to use eval() to execute any script code that you've inserted as DOM text.
MooTools will do this for you automatically, and I'm sure jQuery would as well (depending on the version. jQuery version 1.6+ uses eval). This saves a lot of hassle of parsing out <script> tags and escaping your content, as well as a bunch of other "gotchas".
Generally if you're going to eval() it yourself, you want to create/send the script code without any HTML markup such as <script>, as these will not eval() properly.
Here is a very interesting solution to your problem:
http://24ways.org/2005/have-your-dom-and-script-it-too
So use this instead of script tags:
<img src="empty.gif" onload="alert('test');this.parentNode.removeChild(this);" />
You can create script and then inject the content.
var g = document.createElement('script');
var s = document.getElementsByTagName('script')[0];
g.text = "alert(\"hi\");"
s.parentNode.insertBefore(g, s);
This works in all browsers :)
I do this every time I wanna insert a script tag dynamically !
const html =
`<script>
alert('👋 there ! Wanna grab a 🍺');
</script>`;
const scriptEl = document.createRange().createContextualFragment(html);
parent.append(scriptEl);
NOTE: ES6 used
EDIT 1:
Clarification for you guys - I've seen a lot of answers use appendChild and wanted to let you guys know that it works exactly as append
I used this code, it is working fine
var arr = MyDiv.getElementsByTagName('script')
for (var n = 0; n < arr.length; n++)
eval(arr[n].innerHTML)//run script inside div
I had this problem with innerHTML, I had to append a Hotjar script to the "head" tag of my Reactjs application and it would have to execute right after appending.
One of the good solutions for dynamic Node import into the "head" tag is React-helment module.
Also, there is a useful solution for the proposed issue:
No script tags in innerHTML!
It turns out that HTML5 does not allow script tags to be dynamically added using the innerHTML property. So the following will not execute and there will be no alert saying Hello World!
element.innerHTML = "<script>alert('Hello World!')</script>";
This is documented in the HTML5 spec:
Note: script elements inserted using innerHTML do not execute when
they are inserted.
But beware, this doesn't mean innerHTML is safe from cross-site scripting. It is possible to execute JavaScript via innerHTML without using tags as illustrated on MDN's innerHTML page.
Solution: Dynamically adding scripts
To dynamically add a script tag, you need to create a new script element and append it to the target element.
You can do this for external scripts:
var newScript = document.createElement("script");
newScript.src = "http://www.example.com/my-script.js";
target.appendChild(newScript);
And inline scripts:
var newScript = document.createElement("script");
var inlineScript = document.createTextNode("alert('Hello World!');");
newScript.appendChild(inlineScript);
target.appendChild(newScript);
Here's a more modern (and concise) version of mmm's awesome solution:
function executeScriptElements(containerElement) {
const scriptElements = containerElement.querySelectorAll("script");
Array.from(scriptElements).forEach((scriptElement) => {
const clonedElement = document.createElement("script");
Array.from(scriptElement.attributes).forEach((attribute) => {
clonedElement.setAttribute(attribute.name, attribute.value);
});
clonedElement.text = scriptElement.text;
scriptElement.parentNode.replaceChild(clonedElement, scriptElement);
});
}
Note: I've also tried alternative solutions using cloneNode() or outerHTML but that didn't work.
You could do it like this:
var mydiv = document.getElementById("mydiv");
var content = "<script>alert(\"hi\");<\/script>";
mydiv.innerHTML = content;
var scripts = mydiv.getElementsByTagName("script");
for (var i = 0; i < scripts.length; i++) {
eval(scripts[i].innerText);
}
Here a solution that does not use eval, and works with scripts, linked scripts , as well as with modules.
The function accepts 3 parameters :
html : String with the html code to insert
dest : reference to the target element
append : boolean flag to enable appending at the end of the target element html
function insertHTML(html, dest, append=false){
// if no append is requested, clear the target element
if(!append) dest.innerHTML = '';
// create a temporary container and insert provided HTML code
let container = document.createElement('div');
container.innerHTML = html;
// cache a reference to all the scripts in the container
let scripts = container.querySelectorAll('script');
// get all child elements and clone them in the target element
let nodes = container.childNodes;
for( let i=0; i< nodes.length; i++) dest.appendChild( nodes[i].cloneNode(true) );
// force the found scripts to execute...
for( let i=0; i< scripts.length; i++){
let script = document.createElement('script');
script.type = scripts[i].type || 'text/javascript';
if( scripts[i].hasAttribute('src') ) script.src = scripts[i].src;
script.innerHTML = scripts[i].innerHTML;
document.head.appendChild(script);
document.head.removeChild(script);
}
// done!
return true;
}
For anyone still trying to do this, no, you can't inject a script using innerHTML, but it is possible to load a string into a script tag using a Blob and URL.createObjectURL.
I've created an example that lets you run a string as a script and get the 'exports' of the script returned through a promise:
function loadScript(scriptContent, moduleId) {
// create the script tag
var scriptElement = document.createElement('SCRIPT');
// create a promise which will resolve to the script's 'exports'
// (i.e., the value returned by the script)
var promise = new Promise(function(resolve) {
scriptElement.onload = function() {
var exports = window["__loadScript_exports_" + moduleId];
delete window["__loadScript_exports_" + moduleId];
resolve(exports);
}
});
// wrap the script contents to expose exports through a special property
// the promise will access the exports this way
var wrappedScriptContent =
"(function() { window['__loadScript_exports_" + moduleId + "'] = " +
scriptContent + "})()";
// create a blob from the wrapped script content
var scriptBlob = new Blob([wrappedScriptContent], {type: 'text/javascript'});
// set the id attribute
scriptElement.id = "__loadScript_module_" + moduleId;
// set the src attribute to the blob's object url
// (this is the part that makes it work)
scriptElement.src = URL.createObjectURL(scriptBlob);
// append the script element
document.body.appendChild(scriptElement);
// return the promise, which will resolve to the script's exports
return promise;
}
...
function doTheThing() {
// no evals
loadScript('5 + 5').then(function(exports) {
// should log 10
console.log(exports)
});
}
I've simplified this from my actual implementation, so no promises that there aren't any bugs in it. But the principle works.
If you don't care about getting any value back after the script runs, it's even easier; just leave out the Promise and onload bits. You don't even need to wrap the script or create the global window.__load_script_exports_ property.
Here is a recursive function to set the innerHTML of an element that I use in our ad server:
// o: container to set the innerHTML
// html: html text to set.
// clear: if true, the container is cleared first (children removed)
function setHTML(o, html, clear) {
if (clear) o.innerHTML = "";
// Generate a parseable object with the html:
var dv = document.createElement("div");
dv.innerHTML = html;
// Handle edge case where innerHTML contains no tags, just text:
if (dv.children.length===0){ o.innerHTML = html; return; }
for (var i = 0; i < dv.children.length; i++) {
var c = dv.children[i];
// n: new node with the same type as c
var n = document.createElement(c.nodeName);
// copy all attributes from c to n
for (var j = 0; j < c.attributes.length; j++)
n.setAttribute(c.attributes[j].nodeName, c.attributes[j].nodeValue);
// If current node is a leaf, just copy the appropriate property (text or innerHTML)
if (c.children.length == 0)
{
switch (c.nodeName)
{
case "SCRIPT":
if (c.text) n.text = c.text;
break;
default:
if (c.innerHTML) n.innerHTML = c.innerHTML;
break;
}
}
// If current node has sub nodes, call itself recursively:
else setHTML(n, c.innerHTML, false);
o.appendChild(n);
}
}
You can see the demo here.
Filter your script tags and run each of them with eval
var tmp= document.createElement('div');
tmp.innerHTML = '<script>alert("hello")></script>';
[...tmp.children].filter(x => x.nodeName === 'SCRIPT').forEach(x => eval(x.innerText));
Yes you can, but you have to do it outside of the DOM and the order has to be right.
var scr = '<scr'+'ipt>alert("foo")</scr'+'ipt>';
window.onload = function(){
var n = document.createElement("div");
n.innerHTML = scr;
document.body.appendChild(n);
}
...will alert 'foo'. This won't work:
document.getElementById("myDiv").innerHTML = scr;
And even this won't work, because the node is inserted first:
var scr = '<scr'+'ipt>alert("foo")</scr'+'ipt>';
window.onload = function(){
var n = document.createElement("div");
document.body.appendChild(n);
n.innerHTML = scr;
}
Krasimir Tsonev has a great solution that overcome all problems.
His method doesn't need using eval, so no performance nor security problems exist.
It allows you to set innerHTML string contains html with js and translate it immediately to an DOM element while also executes the js parts exist along the code. short ,simple, and works exactly as you want.
Enjoy his solution:
http://krasimirtsonev.com/blog/article/Convert-HTML-string-to-DOM-element
Important notes:
You need to wrap the target element with div tag
You need to wrap the src string with div tag.
If you write the src string directly and it includes js parts, please take attention to write the closing script tags correctly (with \ before /) as this is a string.
Use $(parent).html(code) instead of parent.innerHTML = code.
The following also fixes scripts that use document.write and scripts loaded via src attribute. Unfortunately even this doesn't work with Google AdSense scripts.
var oldDocumentWrite = document.write;
var oldDocumentWriteln = document.writeln;
try {
document.write = function(code) {
$(parent).append(code);
}
document.writeln = function(code) {
document.write(code + "<br/>");
}
$(parent).html(html);
} finally {
$(window).load(function() {
document.write = oldDocumentWrite
document.writeln = oldDocumentWriteln
})
}
Source
Try using template and document.importNode. Here is an example:
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Sample</title>
</head>
<body>
<h1 id="hello_world">Sample</h1>
<script type="text/javascript">
var div = document.createElement("div");
var t = document.createElement('template');
t.innerHTML = "Check Console tab for javascript output: Hello world!!!<br/><script type='text/javascript' >console.log('Hello world!!!');<\/script>";
for (var i=0; i < t.content.childNodes.length; i++){
var node = document.importNode(t.content.childNodes[i], true);
div.appendChild(node);
}
document.body.appendChild(div);
</script>
</body>
</html>
You can also wrap your <script> like this and it will get executed:
<your target node>.innerHTML = '<iframe srcdoc="<script>alert(top.document.title);</script>"></iframe>';
Please note: The scope inside srcdoc refers to the iframe, so you have to use top like in the example above to access the parent document.
Single line solution below:
document.getElementsByTagName("head")[0].append(document.createRange().createContextualFragment('<script src="https://google.com/file.js"></script>'));
My solution for this problem is to set a Mutation Observer to detect <script></script> nodes and then replace it with a new <script></script> node with the same src. For example:
let parentNode = /* node to observe */ void 0
let observer = new MutationObserver(mutations=>{
mutations.map(mutation=>{
Array.from(mutation.addedNodes).map(node=>{
if ( node.parentNode == parentNode ) {
let scripts = node.getElementsByTagName('script')
Array.from(scripts).map(script=>{
let src = script.src
script = document.createElement('script')
script.src = src
return script
})
}
})
})
})
observer.observe(document.body, {childList: true, subtree: true});
Gabriel Garcia's mention of MutationObservers is on the right track, but didn't quite work for me. I am not sure if that was because of a browser quirk or due to a mistake on my end, but the version that ended up working for me was the following:
document.addEventListener("DOMContentLoaded", function(event) {
var observer = new MutationObserver(mutations=>{
mutations.map(mutation=>{
Array.from(mutation.addedNodes).map(node=>{
if (node.tagName === "SCRIPT") {
var s = document.createElement("script");
s.text=node.text;
if (typeof(node.parentElement.added) === 'undefined')
node.parentElement.added = [];
node.parentElement.added[node.parentElement.added.length] = s;
node.parentElement.removeChild(node);
document.head.appendChild(s);
}
})
})
})
observer.observe(document.getElementById("element_to_watch"), {childList: true, subtree: true,attributes: false});
};
Of course, you should replace element_to_watch with the name of the element that is being modified.
node.parentElement.added is used to store the script tags that are added to document.head. In the function used to load the external page, you can use something like the following to remove no longer relevant script tags:
function freeScripts(node){
if (node === null)
return;
if (typeof(node.added) === 'object') {
for (var script in node.added) {
document.head.removeChild(node.added[script]);
}
node.added = {};
}
for (var child in node.children) {
freeScripts(node.children[child]);
}
}
And an example of the beginning of a load function:
function load(url, id, replace) {
if (document.getElementById(id) === null) {
console.error("Element of ID "+id + " does not exist!");
return;
}
freeScripts(document.getElementById(id));
var xhttp = new XMLHttpRequest();
// proceed to load in the page and modify innerHTML
}
Building up on Danny '365CSI' Engelman's comment, here is an universal solution:
<script>
alert("This script always runs.");
script01 = true;
</script>
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
onload="if(typeof script01==='undefined') eval(this.previousElementSibling.innerHTML)">
Use this as innerHTML (i.e. loaded by XMLHttpRequest) or directly (i.e. inserted by PHP backend), the script always loads once.
Explanation: script loaded as innerHTML is not executed, but onload content atribute is. If the script was not executed (added as innerHTML) then the script is executed in image onload event. If the script was loaded (added by backend) then script01 variable is defined and onload will not run the script for the second time.
Execute (Java Script) tag from innerHTML
Replace your script element with div having a class attribute class="javascript" and close it with </div>
Don't change the content that you want to execute (previously it was in script tag and now it is in div tag)
Add a style in your page...
<style type="text/css"> .javascript { display: none; } </style>
Now run eval using jquery(Jquery js should be already included)
$('.javascript').each(function() {
eval($(this).text());
});`
You can explore more here, at my blog.
For me the best way is to insert the new HTML content through innerHtml and then use
setTimeout(() => {
var script_el = document.createElement("script")
script_el.src = 'script-to-add.js'
document.body.appendChild(script_el)
}, 500)
The setTimeout is not required but it works better. This worked for me.
My own twist, using modern JS and typescript. Not sure why people are filtering on tagName etc when querySelector is right there.
Works a charm for me:
function makeScriptsExecutable(el: Element) {
el.querySelectorAll("script").forEach(script => {
const clone = document.createElement("script")
for (const attr of script.attributes) {
clone.setAttribute(attr.name, attr.value)
}
clone.text = script.innerHTML
script.parentNode?.replaceChild(clone, script)
})
}
simple, no eval, no functions:
fetch('/somepage')
.then(x=>x.text())
.then(x=>{
divDestination.innerHTML=x;
divDestination.querySelectorAll("script")
.forEach(x=>{
var sc=document.createElement("script");
sc.appendChild(document.createTextNode(x.innerText));
divDestination.appendChild(sc)
})
})

Userscript getElementsByTagName not returning all links of Jira page

I need to make my Jira page on internal network show my assignee name in red color using a userscript without any external code use - no jQuery etc. And be as simple as possible, as I'm just learning.
Example page is here (List view): https://jira.atlassian.com/issues/?filter=-5
My internal page has following tag with my name in it, like this:
<a class="user-hover" rel="myusername" id="assignee_myusername" href="https://jira.mydomain.org/secure/ViewProfile.jspa?name=myusername" target="_parent">Myfirstname Mylastname</a>
My userscript is only making color changes on top and bottom lines of the page (I am logged in; it changes most links for userspecific ones). But the middle, where list of cases and assignees is listed, doesn't get affected. Here is my script:
// ==UserScript==
// #name Jira
// #namespace https://jira.mydomain.org/secure/Dashboard.jspa
// #include https://jira.mydomain.org/*
// ==/UserScript==
var links = document.getElementsByTagName( 'a' );
var element;
for ( var i = 0; i < links.length; i++ ) {
element = links[ i ];
if ( element.id = "assignee_myusername" ) {
element.style.color = "red";
element.style.backgroundColor = "black";
}
}
How do I make it highlight my name in Assignee column?
P.S. This is not same as the Ajax question, as that uses external code, which I specified in the beginning - no external code. No external code.
My guess is that the parts of the Jira page you want to script have not been loaded when your user script executes. Jira uses Ajax to load parts of the page asynchronously. Try adding a setTimeout around your code:
window.setTimeout(function () {
var links = document.getElementsByTagName( 'a' );
var element;
for ( var i = 0; i < links.length; i++ ) {
element = links[ i ];
if ( element.id = "assignee_myusername" ) {
element.style.color = "red";
element.style.backgroundColor = "black";
}
}
}, 1000);
This will highlight the link after one second. It's a hacky solution, but if it works, then that at least will show you that the async loading is, in fact, the problem.
If you just want to change the style of an element with ID “assignee_myusername”, you don't need to use getElementsByTagName, you can use getElementById:
window.setTimeout(function () {
var element = document.getElementById( 'assignee_myusername' );
element.style.color = "red";
element.style.backgroundColor = "black";
}, 1000);

Javascript pulling content from commented html

Bit of a JS newbie, I have a tracking script that reads the meta data of the page and places the right scripts on that page using this:
var element = document.querySelector('meta[name="tracking-title"]');
var content = element && element.getAttribute("content");
console.log(content)
This obviously posts the correct tag to console so I can make sure it's working .. and it does in a test situation. However, on the actual website the meta data i'm targeting is produced on the page by a Java application and beyond my control, the problem is it is in a commented out area. This script cannot read within a commented out area. ie
<!-- your tracking meta is here
<meta name="tracking-title" content="this-is-the-first-page">
Tracking finished -->
Any ideas appreciated.
You can use this code:
var html = document.querySelector('html');
var content;
function traverse(node) {
if (node.nodeType == 8) { // comment
var text = node.textContent.replace(/<!--|-->/g, '');
var frag = document.createDocumentFragment();
var div = document.createElement('div');
frag.appendChild(div);
div.innerHTML = text;
var element = div.querySelector('meta[name="tracking-title"]');
if (element) {
content = element.getAttribute("content");
}
}
var children = node.childNodes;
if (children.length) {
for (var i = 0; i < children.length; i++) {
traverse(children[i]);
}
}
}
traverse(html);
One way is to use a NodeIterator and get comment nodes. Quick example below. You will still need to parse the returned value for the data you want but I am sure you can extend this here to do what you want.
Fiddle: http://jsfiddle.net/AtheistP3ace/gfu791c5/
var commentedOutHTml = [];
var iterator = document.createNodeIterator(document.body, NodeFilter.SHOW_COMMENT, NodeFilter.FILTER_ACCEPT, false);
var currentNode;
while (currentNode = iterator.nextNode()) {
commentedOutHTml.push(currentNode.nodeValue);
}
alert(commentedOutHTml.toString());
You can try this. This will require you to use jQuery however.
$(function() {
$("*").contents().filter(function(){
return this.nodeType == 8;
}).each(function(i, e){
alert(e.nodeValue);
});
});

HTML JavaScript delay downloading img src until node in DOM

Hi I have markup sent to me from a server and I set it as the innerHTML of a div element for the purpose of traversing the tree, finding image nodes, and changing their src values. Is there a way to prevent the original src value from being downloaded?
Here is what I am doing
function replaceImageSrcsInMarkup(markup) {
var div = document.createElement('div');
div.innerHTML = markup;
var images = div.getElementsByTagName('img');
images.forEach(replaceSrc);
return div.innerHTML;
}
The problem is that in browsers as soon as you do:
var img = document.createElement('img'); img.src = 'someurl.com' the browser fires off a request to someurl.com. Is there a way to prevent this without resorting to parsing the markup myself? If there is in no other way does anyone know a good way of parsing the markup with as little code as possible to accomplish my goal?
I know you are already happy with your solution, but I think it would be worth sharing a safe method for future users.
You can now simply use the DOMParser object to generate an external document from your HTML string, instead of using a div created by your current document as container.
DOMParser specifically avoids the pitfalls mentioned in the question and other threats: no img src download, no JavaScript execution, even in elements attributes.
So in your case you can safely do:
function replaceImageSrcsInMarkup(markup) {
var parser = new DOMParser(),
doc = parser.parseFromString(markup, "text/html");
// Manipulate `doc` as a regular document
var images = doc.getElementsByTagName('img');
for (var i = 0; i < images.length; i += 1) {
replaceSrc(images[i]);
}
return doc.body.innerHTML;
}
Demo: http://jsfiddle.net/94b7gyg9/1/
Note: with your current code, browsers will still try downloading the resource initially specified in your img nodes src attribute, even if you change it before the end of JS execution. Trace network transactions in this demo: http://jsfiddle.net/94b7gyg9/
Rather than append the new markup to the DOM before you change the img sources, create an element, set it's inner HTML, change the source of the images and then finally, append the changed markup to the page.
Here's a fully-worked sample.
<!DOCTYPE html>
<html>
<head>
<script>
"use strict";
function byId(id,parent){return (parent == undefined ? document : parent).getElementById(id);}
//function allByClass(className,parent){return (parent == undefined ? document : parent).getElementsByClassName(className);}
function allByTag(tagName,parent){return (parent == undefined ? document : parent).getElementsByTagName(tagName);}
function newEl(tag){return document.createElement(tag);}
//function newTxt(txt){return document.createTextNode(txt);}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
window.addEventListener('load', onDocLoaded, false);
function onDocLoaded()
{
byId('goBtn').addEventListener('click', onGoBtnClick, false);
}
var dummyString = "<img src='img/girl.png'/><img src='img/gfx07.jpg'/>";
function onGoBtnClick(evt)
{
var div = newEl('div');
div.innerHTML = dummyString;
var mImgs = allByTag('img', div);
for (var i=0, n=mImgs.length; i<n; i++)
{
mImgs[i].src = "img/murderface.jpg";
}
document.body.appendChild(div);
}
</script>
<style>
</style>
</head>
<body>
<button id='goBtn'>GO!</button>
</body>
</html>
You could directly parse the markup string using a regex to replace the img src. Searching for all the img src urls in the string and then replacing them with the new url.
var regex = /<img[^>]+src="?([^"\s]+)"?\s*\/>/g;
var imgUrls = [];
while ( m = regex.exec( markup ) ) {
imgUrls.push( m[1] );
}
imgUrls.forEach(function(url) {
markup = markup.replace(url,'new-url');
});
Another solution might be, if you have access to it, to set the all the img src to an empty string, and put the url in in a data-src attribute. Having your markup string look like something like this
markup = '
';
Then setting this markup to your div.innerHTML won't trigger any download from the browser. And you can still parse it using regular DOM selector.
div.innerHTML = markup;
var images = div.getElementsByTagName('img');
images.forEach(function(img){
var oldSrc = img.getAttribute('data-src');
img.setAttribute('src', 'new-url');
});

Don't load scripts appended with innerHTML?

I'm appending a whole HTML page to a div (to scrape). How do I stop it from requesting script, and css files ? I tried immediately removing those nodes but they still get requested.
It's for a browser addon, I'm scraping with JS
As #adeneo wrote you don't have to add the html to a page in order to scrape information from it, you can turn it into DOM tree that is disconnected from the page DOM and process it there.
In jQuery it is simple $("html text here"). Then you can scrape it using the API,
eg.
function scrape_html(html_string) {
var $dom = $(html_string);
var name = $dom.find('.name').text();
return name;
}
without jQuery:
function scrape_html(html_string) {
var container = document.createElement('div');
container.innerHTML = html_string;
var name = container.getElementsByClassName('name')[0].innerText;
return name;
}
Setting the innerHTML of a temporary HTML element that has not been added to the document, will not execute scripts, and since it does not belong to your document, the style will not be applied either.
This will give you an opportunity to strip out any unwanted elements before copying the innerHTML to your own document.
Example:
var temp = document.createElement('div');
temp.innerHTML = html; // the HTML of the 'other' page.
function removeElements(element, tagName)
{
var elements = temp.getElementsByTagName(tagName);
while(elements.length > 0)
{
elements[0].parentNode.removeChild(elements[0]);
}
}
removeElements(temp, 'script');
removeElements(temp, 'style');
removeElements(temp, 'link');
container.innerHTML = temp.innerHTML;

Categories

Resources