Get current rendered (virtual?) DOM via javascript [duplicate] - javascript

I have not seen a satisfactory answer for this question. This is basically a duplicate of this question, but it was improperly closed and the answers given are not sufficient.
I have come up with my own solution which I will post below.
This can be useful for web scraping, or in my case, running tests on a javascript library that handles custom elements. I make sure it is producing the output that I want, then I use this function to scrape the HTML for a given test output and use that copied HTML as the expected output to compare the test against in the future.

Here is a function that can do what is requested. Note that it ignores html comments and other fringe things. But it retrieves regular elements, text nodes, and custom elements with shadowRoots. It also handles slotted template content. It has not been tested exhaustively but seems to be working well for my needs.
Use it like extractHTML(document.body) or extractHTML(document.getElementById('app')).
/**
 * Serialize a node as HTML the way it is composed on screen: open shadow
 * roots are entered and slots are replaced by the nodes assigned to them.
 *
 * Comments and any other non-element, non-text nodes are ignored.
 *
 * @param {Node} node - element or text node to serialize
 * @returns {string} the markup, or '' for a missing/unsupported node
 */
function extractHTML(node) {
  // return a blank string if not a valid node
  if (!node) return '';
  // text node: trimmed text, escaped again because the caller assigns the
  // result through innerHTML, where a raw "<" or "&" would be re-parsed as markup
  if (node.nodeType === 3) {
    return node.textContent
      .trim()
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }
  // beyond here, only deal with element nodes
  if (node.nodeType !== 1) return '';
  // shallow clone: the element's own tag and attributes without its content
  const outer = node.cloneNode();
  // if the node has a shadow root, its rendered content lives in there
  const root = node.shadowRoot || node;
  let html = '';
  if (root.children.length) {
    // childNodes (not children) so that text between elements is kept
    for (const child of root.childNodes) {
      if (child.assignedNodes) {
        // a slot: emit every assigned node, not only the first one
        const assigned = child.assignedNodes();
        if (assigned.length) {
          for (const item of assigned) html += extractHTML(item);
        } else {
          // unassigned slot: fall back to the slot's default content
          html += child.innerHTML;
        }
      } else {
        html += extractHTML(child);
      }
    }
  } else {
    // no element children: the (already escaped) innerHTML is enough
    html = root.innerHTML;
  }
  // put the collected content into the cloned shell and return the package
  outer.innerHTML = html;
  return outer.outerHTML;
}

Only if shadowRoots are created with the mode:"open" setting can you access shadowRoots from the outside.
You can then dive into elements and shadowRoots with something like:
/**
 * Recursively visit an element tree, stepping into shadow roots that are
 * reachable through `shadowRoot`, and report the first element matching
 * `selector` inside each visited root.
 *
 * @param {Element} el - element to start from
 * @param {string} selector - CSS selector passed to querySelector
 * @param {function} match - called as match(foundElement, root) per hit;
 *   logs a console warning by default
 * @returns {undefined}
 */
const shadowDive = (
  el,
  selector,
  match = (m, r) => console.warn('match', m, r)
) => {
  const root = el.shadowRoot || el;
  // query once and reuse the result instead of running the selector twice
  const found = root.querySelector(selector);
  if (found) match(found, root);
  // snapshot the children so the callback may mutate the tree safely
  for (const child of [...root.children]) {
    shadowDive(child, selector, match);
  }
};
Note: extracting raw HTML is pointless if Web Component styling is based on shadowDOM behaviour; you will lose all correct styling.

Related

How can I get all the HTML in a document or node containing shadowRoot elements

I have not seen a satisfactory answer for this question. This is basically a duplicate of this question, but it was improperly closed and the answers given are not sufficient.
I have come up with my own solution which I will post below.
This can be useful for web scraping, or in my case, running tests on a javascript library that handles custom elements. I make sure it is producing the output that I want, then I use this function to scrape the HTML for a given test output and use that copied HTML as the expected output to compare the test against in the future.
Here is a function that can do what is requested. Note that it ignores html comments and other fringe things. But it retrieves regular elements, text nodes, and custom elements with shadowRoots. It also handles slotted template content. It has not been tested exhaustively but seems to be working well for my needs.
Use it like extractHTML(document.body) or extractHTML(document.getElementById('app')).
/**
 * Serialize a node as HTML the way it is composed on screen: open shadow
 * roots are entered and slots are replaced by the nodes assigned to them.
 *
 * Comments and any other non-element, non-text nodes are ignored.
 *
 * @param {Node} node - element or text node to serialize
 * @returns {string} the markup, or '' for a missing/unsupported node
 */
function extractHTML(node) {
  // return a blank string if not a valid node
  if (!node) return '';
  // text node: trimmed text, escaped again because the caller assigns the
  // result through innerHTML, where a raw "<" or "&" would be re-parsed as markup
  if (node.nodeType === 3) {
    return node.textContent
      .trim()
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }
  // beyond here, only deal with element nodes
  if (node.nodeType !== 1) return '';
  // shallow clone: the element's own tag and attributes without its content
  const outer = node.cloneNode();
  // if the node has a shadow root, its rendered content lives in there
  const root = node.shadowRoot || node;
  let html = '';
  if (root.children.length) {
    // childNodes (not children) so that text between elements is kept
    for (const child of root.childNodes) {
      if (child.assignedNodes) {
        // a slot: emit every assigned node, not only the first one
        const assigned = child.assignedNodes();
        if (assigned.length) {
          for (const item of assigned) html += extractHTML(item);
        } else {
          // unassigned slot: fall back to the slot's default content
          html += child.innerHTML;
        }
      } else {
        html += extractHTML(child);
      }
    }
  } else {
    // no element children: the (already escaped) innerHTML is enough
    html = root.innerHTML;
  }
  // put the collected content into the cloned shell and return the package
  outer.innerHTML = html;
  return outer.outerHTML;
}
Only if shadowRoots are created with the mode:"open" setting can you access shadowRoots from the outside.
You can then dive into elements and shadowRoots with something like:
/**
 * Recursively visit an element tree, stepping into shadow roots that are
 * reachable through `shadowRoot`, and report the first element matching
 * `selector` inside each visited root.
 *
 * @param {Element} el - element to start from
 * @param {string} selector - CSS selector passed to querySelector
 * @param {function} match - called as match(foundElement, root) per hit;
 *   logs a console warning by default
 * @returns {undefined}
 */
const shadowDive = (
  el,
  selector,
  match = (m, r) => console.warn('match', m, r)
) => {
  const root = el.shadowRoot || el;
  // query once and reuse the result instead of running the selector twice
  const found = root.querySelector(selector);
  if (found) match(found, root);
  // snapshot the children so the callback may mutate the tree safely
  for (const child of [...root.children]) {
    shadowDive(child, selector, match);
  }
};
Note: extracting raw HTML is pointless if Web Component styling is based on shadowDOM behaviour; you will lose all correct styling.

Scan the DOM upward to find element matching selector (no jQuery)

I wanted a function that could scan the DOM upward from a DOMElement and also scan the children of each parent as it's going up.
It had to keep going until it found an <element> matching the selector received as a parameter. The selector could be any valid CSS selector.
It was also needed to be done in pure JS (no jQuery)
I ended up making a modified version of this function that I found on this site. You can use this function as you wish, scale it up, claim it yours, whatever you want.
Here's the solution I found
/**
 * Scan upward from `elem` for an element matching `selector`.
 *
 * At every level the current node is tested first, then all element children
 * of its parent (its siblings). The search stops, without testing it, when
 * it reaches document.body or runs out of ancestors.
 *
 * @param {Node} elem - node to start from
 * @param {string} selector - any valid CSS selector
 * @returns {Element|null} the first match found, or null
 */
function GetClosest(elem, selector) {
  for (let current = elem; current && current !== document.body; current = current.parentNode) {
    // only elements implement matches(); text nodes, the document, etc. are skipped
    if (current.nodeType !== 1) continue;
    if (current.matches(selector)) return current;

    const parent = current.parentNode;
    // detached node: no siblings to scan; the loop ends on the next step
    if (!parent) continue;

    // element-only traversal, so comments and text never reach matches()
    for (let sibling = parent.firstElementChild; sibling; sibling = sibling.nextElementSibling) {
      if (sibling.matches(selector)) return sibling;
    }
  }
  return null;
}

How do you find the (string) length of a starting tag or ending tag?

I'm trying to write a jQuery or pure Javascript function (preferring the more readable solution) that can count the length of a starting tag or ending tag in an HTML document.
For example,
<p>Hello.</p>
would return 3 and 4 for the starting and ending tag lengths. Adding attributes,
<span class="red">Warning!</span>
would return 18 and 7 for the starting and ending tag lengths. Finally,
<img src="foobar.png"/>
would return 23 and 0 (or -1) for the starting and ending tag lengths.
I'm looking for a canonical, guaranteed-to-work-according-to-spec solution, so I'm trying to use DOM methods rather than manual text manipulations. For example, I would like the solution to work even for weird cases like
<p>spaces infiltrating the ending tag</ p >
and
<img alt="unended singleton tags" src="foobar.png">
and such. That is, my hope is that as long as we use proper DOM methods, we should be able to find the number of characters between < and > no matter how weird things get, even
<div data-tag="<div>">HTML-like strings within attributes</div>
I have looked at the jQuery API (especially the Manipulation section, including DOM Insertion and General Attributes subsections), but I don't see anything that would help.
Currently the best idea I have, given an element node is
// Asker's current heuristic for an element `node` (defined outside this snippet).
// The end tag is assumed to be "</" + tagName + ">", i.e. tagName.length + 3 characters.
// The start tag length is what remains of outerHTML after subtracting
// innerHTML and that assumed end tag.
lengthOfEndTag = node.tagName.length + 3;
lengthOfStartTag = node.outerHTML.length
- node.innerHTML.length
- lengthOfEndTag;
but of course I don't want to make such an assumption for the end tag.
(Finally, I'm familiar with regular expressions—but trying to avoid them if at all possible.)
EDIT
@Pointy and @squint helped me understand that it's not possible to see </ p >, for example, because the HTML is discarded once the DOM is created. That's fine. The objective, adjusted, is to find the length of the start and end tags as would be rendered in outerHTML.
An alternate way to do this could be to use XMLSerializer's serializeToString on a clone copy of the node (with id set) to avoid having to parse innerHTML, then split over "><"
/**
 * tags(elm) -> { open, close }
 *
 * Serializes a shallow clone of `elm` with XMLSerializer and splits the
 * result into its opening and closing tag strings. `close` stays null when
 * the serialized form contains no "><" boundary.
 *
 * Throws a TypeError when `elm` is not an element node.
 */
var tags = (function () {
  // one serializer shared by every call
  var serializer = new XMLSerializer();

  return function tags(elm) {
    if (elm.nodeType !== 1) throw new TypeError('Expected HTMLElement');

    // childless copy of the element, with its id re-applied when present
    var shell = elm.cloneNode();
    var id = elm.getAttribute('id');
    if (id !== null) shell.setAttribute('id', id);

    var pieces = serializer.serializeToString(shell).split('><');

    // a single piece means there was no separate closing tag
    if (pieces.length === 1) {
      return { open: pieces[0], close: null };
    }

    // the last piece is the closing tag; everything before it is the opening tag
    var closing = '<' + pieces[pieces.length - 1];
    var opening = pieces.slice(0, -1).join('><') + '>';
    return { open: opening, close: closing };
  };
}()); // self invoke to init
After running this, you can access .length of open and close properties
tags(document.body); // {open: "<body class="question-page">", close: "</body>"}
What if an attribute's value has >< in it? XMLSerializer escapes this to &gt;&lt; so it won't change the .split.
What about no close tag? close will be null.
This answer helped me understand what @Pointy and @squint were trying to say.
The following solution works for me:
/**
 * jQuery plugin: number of characters in the start tag of the first matched
 * element, as serialized in its outerHTML.
 *
 * The length is derived by subtraction (outerHTML minus innerHTML minus the
 * end tag) rather than by searching for innerHTML inside outerHTML, because
 * the content can also occur inside the start tag itself (e.g. <p>p</p>).
 *
 * Calls $.error when the set is empty or its first item is not an element.
 *
 * @returns {number} length of the start tag
 */
$.fn.lengthOfStartTag = function () {
  const node = this[0];
  if (!node || node.nodeType !== 1) {
    $.error("Called $.fn.lengthOfStartTag on non-element node.");
  }
  const outer = node.outerHTML;
  // tag name exactly as serialized at the start of outerHTML
  const name = /^<([^\s\/>]+)/.exec(outer);
  const endTag = name ? '</' + name[1] + '>' : '';
  // a start tag can never end with "</name>", so this suffix test is reliable;
  // void elements such as <img> simply have no end tag
  const hasEndTag = endTag !== '' && outer.endsWith(endTag);
  return outer.length - node.innerHTML.length - (hasEndTag ? endTag.length : 0);
};
/**
 * jQuery plugin: number of characters in the end tag of the first matched
 * element, as serialized in its outerHTML.
 *
 * The end tag is detected as the "</name>" suffix of outerHTML instead of
 * searching for innerHTML, which could match inside the start tag, and
 * instead of treating every empty element as having no end tag.
 *
 * Calls $.error when the set is empty or its first item is not an element.
 *
 * @returns {number} length of the end tag, or -1 when the element has none
 */
$.fn.lengthOfEndTag = function () {
  const node = this[0];
  if (!node || node.nodeType !== 1) {
    $.error("Called $.fn.lengthOfEndTag on non-element node.");
  }
  const outer = node.outerHTML;
  // tag name exactly as serialized at the start of outerHTML
  const name = /^<([^\s\/>]+)/.exec(outer);
  const endTag = name ? '</' + name[1] + '>' : '';
  // a start tag can never end with "</name>", so a matching suffix is the end tag
  if (endTag !== '' && outer.endsWith(endTag)) {
    return endTag.length;
  }
  return -1;
};
Sample jsFiddle here.

What are valid node types for the Range setEndAfter function in Safari?

I am writing a module that should allow users to select parts of an HTML document. To get the internals to work I expand the Range of the selection to a valid HTML snippet.
For the case where B is a descendant of A I find the ancestor of B which is a child of A and want to set the range to end after that node using setEndAfter. This is what I have now:
/**
 * Climb from `descendant` through parentElement links and return the node on
 * that path whose parent element is `node`; return false when the climb runs
 * out of parents without meeting `node`.
 */
var closestChild = function (node, descendant) {
  var current = descendant;
  var above = current.parentElement;

  while (above) {
    // `current` sits directly under `node`: this is the child we want
    if (above === node) {
      return current;
    }
    current = above;
    above = current.parentElement;
  }

  return false;
}
/**
 * Return a copy of `range`, extended where needed.
 *
 * When start and end share a container the plain clone is returned.
 * Otherwise closestChild() is asked for the node under the start container's
 * parent element that leads to the end container's parent element; the
 * clone's end is set after that node. Returns null when no such node exists.
 */
var legalRange = function (range) {
  var expanded = range.cloneRange();

  if (range.startContainer !== range.endContainer) {
    var anchor = closestChild(
      range.startContainer.parentElement,
      range.endContainer.parentElement
    );
    if (!anchor) {
      return null;
    }
    expanded.setEndAfter(anchor);
  }

  return expanded;
};
But this throws a INVALID_NODE_TYPE_ERR: DOM Range Exception 2 when I try to set the end point. I have also tried using parentNode instead of parentElement with the same exception thrown. This is not a problem if i use setEnd(). What types of nodes should I pass to do this.
PS: It turns out that the code works in FireFox, so my problem is now with Safari and Chrome.
I found the solution.
When I set up my test cases, I didn't add the elements to the document. It seems that Chrome and Safari treated the nodes as invalid when using setEndAfter if the nodes were not part of the document.
As answered by Eivind, the problem is that the node used to set the range position is not attached to the document. So one solution is to attach it before you set the range position.
Another solution, if you can't attach the node to the DOM for some reason, is to use setStart() and setEnd() instead.
// Offset-based replacements for the range.set*Before / set*After helpers.
// Each pair below is an independent alternative; `node` and `range` come from
// the surrounding code. The node's index within parent.childNodes is the
// boundary just before it, and index + 1 is the boundary just after it.
// DOM nodes expose their parent as `parentNode` (there is no `parent` property).
// Instead of `range.setStartBefore(node)`
var parent = node.parentNode;
range.setStart(parent, Array.from(parent.childNodes).indexOf(node))
// Instead of `range.setStartAfter(node)`
var parent = node.parentNode;
range.setStart(parent, Array.from(parent.childNodes).indexOf(node) + 1)
// Instead of `range.setEndBefore(node)`
var parent = node.parentNode;
range.setEnd(parent, Array.from(parent.childNodes).indexOf(node))
// Instead of `range.setEndAfter(node)`
var parent = node.parentNode;
range.setEnd(parent, Array.from(parent.childNodes).indexOf(node) + 1)
Note: Array.from(arrayLike) is not supported in Internet Explorer <= 11. Use Array.prototype.slice.call(arrayLike) instead if you need IE support.

Wrap <a> tags around http text

How do I find every word on a page beginning with http:// and wrap tags around it?
Can I use something like regex perhaps?
I disagree heavily that jQuery can be much use in finding a solution here. Granted you have to get down and dirty with some of the textNode element attributes but putting the DOM back together again after you split your matched node can be made a wee bit easier using the jQuery library.
The following code is documented inline to explain the action taken. I've written it as a jQuery plugin in case you just want to take this and move it around elsewhere. This way you can scope which elements you want to convert URLs for or you can simply use the $("body") selector.
(function($) {
  /**
   * jQuery plugin: turn http(s) URLs found in text nodes into <a> elements.
   *
   * Every text node that is a child of a matched element or of any of its
   * descendants is scanned; each URL is split out of its text node and
   * replaced by an anchor whose href and label are the URL.
   *
   * @returns {jQuery} the original set, so the call can be chained
   */
  $.fn.anchorTextUrls = function() {
    // Test a text node's contents for URLs and split and rebuild it with an anchor
    var testAndTag = function(el) {
      // Test for URLs along whitespace and punctuation boundaries (don't look too hard or you will be consumed)
      var m = el.nodeValue.match(/(https?:\/\/.*?)[.!?;,]?(\s+|"|$)/);
      // If we've found a valid URL, m[1] contains the URL
      if (m) {
        // Clone the text node to hold the "tail end" of the split node
        var tail = $(el).clone()[0];
        // Both nodes still hold the full text here, so one lookup serves both cuts
        var start = el.nodeValue.indexOf(m[1]);
        el.nodeValue = el.nodeValue.substring(0, start);
        tail.nodeValue = tail.nodeValue.substring(start + m[1].length);
        // Rebuild the DOM inserting the new anchor element between the split text nodes.
        // The label is set with .text(): the URL comes from page text and must
        // not be parsed as markup.
        $(el).after(tail).after($("<a></a>").attr("href", m[1]).text(m[1]));
        // Recurse on the new tail node to check for more URLs
        testAndTag(tail);
      }
      // Behave like a function
      return false;
    };
    // For each element selected by jQuery
    return this.each(function() {
      // Select all descendant nodes of the element and pick out only text nodes
      var textNodes = $(this).add("*", this).contents().filter(function() {
        return this.nodeType === 3;
      });
      // Take action on each text node
      $.each(textNodes, function(i, el) {
        testAndTag(el);
      });
    });
  };
}(jQuery));
$("body").anchorTextUrls(); //Sample call
Please keep in mind that given the way I wrote this to populate the textNodes array, the method will find ALL descendant text nodes, not just immediate children text nodes. If you want it to replace URLs only amongst the text within a specific selector, remove the .add("*", this) call that adds all the descendants of the selected element.
Here's a fiddle example.
This is one of those few things that jQuery doesn't directly help you with much. You basically have to walk through the DOM tree and examine the text nodes (nodeType === 3); if you find a text node containing the target text you want to wrap ("http://.....", whatever rules you want to apply), you then split the text node (using splitText) into three parts (the part before the string, the part that is the string, and the part following the string), then put the a element around the second of those.
That sounds a bit complicated, but it isn't really all that bad. It's just a recursive descent walker function (for working through the DOM), a regex match to find the things you want to replace, and then a couple of calls to splitText, createElement, insertBefore, appendChild.
Here's an example that searches for a fixed string; just add your regex matching for "http://":
// Sample call: process the text under <body>, looking for the fixed string "foo".
// walk is a function declaration (below), so it is hoisted and callable here.
walk(document.body, "foo");
/**
 * Depth-first walk over `node` that wraps every occurrence of `targetString`
 * found in text nodes.
 *
 * @param {Node} node - element (its subtree is walked) or text node
 * @param {string} targetString - literal text to look for
 */
function walk(node, targetString) {
  switch (node.nodeType) {
    case 1: { // Element
      let child = node.firstChild;
      while (child) {
        if (child.nodeType === 3) {
          // handleText may split or remove `child` and insert a wrapper after
          // it, so it reports where the scan has to resume
          child = handleText(child, targetString);
        } else {
          walk(child, targetString);
          child = child.nextSibling;
        }
      }
      break;
    }
    case 3: // Text node
      handleText(node, targetString);
      break;
  }
}

/**
 * Wrap the first occurrence of `targetString` inside the text node `node` in
 * a <span class="wrapper">.
 *
 * @param {Text} node - text node to inspect
 * @param {string} targetString - literal text to look for
 * @returns {Node|null} the sibling at which a scan should continue: the node
 *   right after the new wrapper, or node.nextSibling when nothing was wrapped
 */
function handleText(node, targetString) {
  // An empty target "matches" everywhere and would never make progress
  if (targetString === '') {
    return node.nextSibling;
  }
  // Does the text contain our target string?
  // (This would be a regex test in your http://... case)
  const start = node.nodeValue.indexOf(targetString);
  if (start < 0) {
    return node.nextSibling;
  }
  // Split at the beginning of the match
  const targetNode = node.splitText(start);
  // Split at the end of the match
  const followingNode = targetNode.splitText(targetString.length);
  // Wrap the target in an element; in this case, we'll
  // use a `span` with a class, but you'd use an `a`.
  // First we create the wrapper and insert it in front
  // of the target text.
  const wrapper = document.createElement('span');
  wrapper.className = "wrapper";
  targetNode.parentNode.insertBefore(wrapper, targetNode);
  // Now we move the target text inside it
  wrapper.appendChild(targetNode);
  // Clean up any empty nodes (in case the target text
  // was at the beginning or end of a text node)
  if (node.nodeValue.length === 0) {
    node.parentNode.removeChild(node);
  }
  if (followingNode.nodeValue.length === 0) {
    followingNode.parentNode.removeChild(followingNode);
  }
  // Resume after the wrapper: its content is already handled and must not be
  // wrapped a second time
  return wrapper.nextSibling;
}
Live example
Update: The above didn't handle it if there were multiple matches in the same text node (doh!). And oh what the heck, I did a regexp match — you will have to adjust the regexp, and probably do some post-processing on each match, because what's here is too simplistic. But it's a start:
// Sample call for the regexp-based version.
// The regexp should have a capture group that
// will be the href. In our case below, we just
// make it the whole thing, but that's up to you.
// THIS REGEXP IS ALMOST CERTAINLY TOO SIMPLISTIC
// AND WILL NEED ADJUSTING (for instance: what if
// the link appears at the end of a sentence and
// it shouldn't include the ending punctuation?).
walk(document.body, /(http:\/\/[^ ]+)/i);
/**
 * Depth-first walk over `node` that turns every match of `targetRe` found in
 * text nodes into a link.
 *
 * @param {Node} node - element (its subtree is walked) or text node
 * @param {RegExp} targetRe - pattern to look for; its first capture group,
 *   when present, supplies the href
 */
function walk(node, targetRe) {
  switch (node.nodeType) {
    case 1: { // Element
      let child = node.firstChild;
      while (child) {
        if (child.nodeType === 3) {
          // handleText may split or remove `child` and insert a wrapper after
          // it, so it reports where the scan has to resume
          child = handleText(child, targetRe);
        } else {
          walk(child, targetRe);
          child = child.nextSibling;
        }
      }
      break;
    }
    case 3: // Text node
      handleText(node, targetRe);
      break;
  }
}

/**
 * Wrap the first match of `targetRe` inside the text node `node` in an <a>.
 *
 * @param {Text} node - text node to inspect
 * @param {RegExp} targetRe - pattern to look for
 * @returns {Node|null} the sibling at which a scan should continue: the node
 *   right after the new link, or node.nextSibling when nothing was wrapped
 */
function handleText(node, targetRe) {
  // Global/sticky regexps remember lastIndex between calls; every text node
  // has to be searched from its beginning
  targetRe.lastIndex = 0;
  const match = targetRe.exec(node.nodeValue);
  // No match, or an empty match that would never make progress
  if (!match || match[0].length === 0) {
    return node.nextSibling;
  }
  // Split at the beginning of the match
  const targetNode = node.splitText(match.index);
  // Split at the end of the match.
  // match[0] is the full text that was matched.
  const followingNode = targetNode.splitText(match[0].length);
  // Wrap the target in an `a` element.
  // First we create the wrapper and insert it in front
  // of the target text. We use the first capture group
  // as the `href`, falling back to the whole match when
  // the regexp has no (participating) group.
  const wrapper = document.createElement('a');
  wrapper.href = match[1] !== undefined ? match[1] : match[0];
  targetNode.parentNode.insertBefore(wrapper, targetNode);
  // Now we move the target text inside it
  wrapper.appendChild(targetNode);
  // Clean up any empty nodes (in case the target text
  // was at the beginning or end of a text node)
  if (node.nodeValue.length === 0) {
    node.parentNode.removeChild(node);
  }
  if (followingNode.nodeValue.length === 0) {
    followingNode.parentNode.removeChild(followingNode);
  }
  // Resume after the link: further matches live in the text that follows it,
  // and the link's own text must not be wrapped a second time
  return wrapper.nextSibling;
}
Live example
I have not tested this in practice, but you can try it:
// Visit every existing <a> whose href attribute starts with "http://".
// An attribute filter is written directly after the tag name: a[href^="..."];
// wrapping it in parentheses is not valid selector syntax.
$('a[href^="http://"]').each(function () {
  //perform your task
});

Categories

Resources