can adjacent text nodes in the DOM be merged with Javascript? - javascript

Suppose I have a sentence in the webpage DOM that when I examine it, consists of 3 text nodes followed by perhaps some element like BOLD or ITALIC. I want to merge the text nodes into one text node, since having adjacent text nodes is meaningless - there is no reason to have them. Is there a way to merge them easily?
Thanks

It seems that Node.normalize() is doing exactly what you want.
You can refer to: Node.normalize()

Maybe this will help you:
var parentNode = document.getElementById('pelements');
var textNode = document.createElement('p');
while (parentNode.firstChild) {
textNode.textContent += parentNode.firstChild.textContent;
parentNode.removeChild(parentNode.firstChild);
}
parentNode.appendChild(textNode);
<div id="pelements">
<p>A</p>
<p>B</p>
<p>C</p>
</div>

It is possible, but you need to specify the parent element. It should be possible to traverse the whole DOM and every node, but if you can avoid that, it would be better.
nodes = document.body.childNodes;
nodesToDelete = [];
function combineTextNodes(node, prevText) {
if (node.nextSibling && node.nextSibling.nodeType == 3) {
nodesToDelete.push(node.nextSibling);
return combineTextNodes(node.nextSibling, prevText + node.nodeValue);
} else {
return prevText + node.nodeValue;
}
}
for (i = 0; i < nodes.length; i++) {
if (nodes[i].nodeType == 3) {
nodes[i].nodeValue = combineTextNodes(nodes[i], '');
}
}
for (i = 0; i < nodesToDelete.length; i++) {
console.log(nodesToDelete[i]);
nodesToDelete[i].remove();
}

Related

RegEx - Search an entire document, pick out words that match a JSON array, and auto link them

I'm trying to produce some javascript code that will traverse an HTML document and pick out words from a JSON array, if matched the javascript would wrap the text in a <a href='glossary/#[matched text]'>[matched text]</a> and render to screen.
I seem to have that part semi-down, the bit where I'm falling over is how best to tell the system to ignore certain elements (i.e text already in a, buttons, input, element attributes...etc). I've tried to resolve this with the regex and managed to fumble along and get the following:
/(?<!<(a|button|submit|pre|img|svg|path|h[0-9]|.*data-ignore.*>|input\/>|textarea|pre|code))((?<!(="|data-))\btext\b(?!"))(?!<\/(a|button|submit|pre|img|svg|path|h[0-9])>)/gi
(text is the word I'm trying to auto-link) - https://regex101.com/r/u7cLPR/1
If you follow the Regex101 link you'll see I "think" I've managed to cover all bases bar one which is when the word occurs in a class='' tag (and therefore others like style and such)
Any help here would be greatly appreciated here, as always with Regex I always seem to miss the mark or over-complicate the solution, (is Regex even the right tool for the job here?)
It would be recursive and quite fast. Check out my answer about changing size of English letters in all text nodes - It's the same idea.
var words = ["text", "one"]
var skip_elements = ["BUTTON", "TEXTAREA"]
var EnglishCharFixer = {
do_elem: function(elem) {
var nodes = this.textNodesUnder(elem);
this.process_text_nodes(nodes)
},
textNodesUnder: function(node) {
var all = [];
for (node = node.firstChild; node; node = node.nextSibling) {
if (node.nodeType == 3) {
all.push(node);
} else {
if (skip_elements.indexOf(node.tagName) == -1) {
all = all.concat(this.textNodesUnder(node));
}
}
}
return all;
},
replace_node: function(node, str) {
var replacementNode = document.createElement('span');
replacementNode.innerHTML = str
node.parentNode.insertBefore(replacementNode, node);
node.parentNode.removeChild(node);
},
do_text: function(str) {
// improve this function please
words.forEach(function(word) {
str = str.replace(word, '' + word + "");
})
return str;
},
process_text_nodes: function(nodes) {
for (var index = 0; index < nodes.length; index++) {
var node = nodes[index];
var value = node.nodeValue
var str = this.do_text(value)
if (str != value) {
this.replace_node(node, str)
}
}
}
}
EnglishCharFixer.do_elem(document.body);
<body>
<h1>some title with text</h1>
<button>text shouldn't change</button> just a text node
<div style="padding:30px">
<p>paragraph with text</p>
another one
<br>
<img src="https://picsum.photos/100" title="this text shouldn't change as well">
</div>
<textarea>hello text</textarea>
</body>

How to get all childNodes in JS including all the 'grandchildren'?

I want to scan a div for all childNodes including the ones that are nestled within other elements. Right now I have this:
var t = document.getElementById('DivId').childNodes;
for(i=0; i<t.length; i++) alert(t[i].id);
But it only gets the children of the Div and not the grandchildren. Thanks!
Edit: This question was too vague. Sorry about that. Here's a fiddle:
http://jsfiddle.net/F6L2B/
The body.onload script doesn't run at JSFiddle, but it works, except that the 'Me Second' and 'Me Third' input fields are not being assigned a tabIndex and are therefore being skipped over.
This is the fastest and simplest way, and it works on all browsers:
myDiv.getElementsByTagName("*")
If you're looking for all HTMLElement on modern browsers you can use:
myDiv.querySelectorAll("*")
What about great-grandchildren?
To go arbitrarily deep, you could use a recursive function.
var alldescendants = [];
var t = document.getElementById('DivId').childNodes;
for(let i = 0; i < t.length; i++)
if (t[i].nodeType == 1)
recurseAndAdd(t[i], alldescendants);
function recurseAndAdd(el, descendants) {
descendants.push(el.id);
var children = el.childNodes;
for(let i=0; i < children.length; i++) {
if (children[i].nodeType == 1) {
recurseAndAdd(children[i]);
}
}
}
If you really only want grandchildren, then you could take out the recursion (and probably rename the function)
function recurseAndAdd(el, descendants) {
descendants.push(el.id);
var children = el.childNodes;
for(i=0; i < children.length; i++) {
if (children[i].nodeType == 1) {
descendants.push(children[i].id);
}
}
}
If anyone else wants all nodes within that tree, and not just the elements, here's a 2022-era JS snippet:
function getDescendantNodes(node, all = []) {
all.push(...node.childNodes);
for (const child of node.childNodes)
getDescendantNodes(child, all);
return all;
}
Protip: afterwards, you can filter to your preferred nodeType with the Node global: nodes = getDescendantNodes($0); nodes.filter(n => n.nodeType === Node.TEXT_NODE)

counting text node recursively using javascript

Let say I have a mark up like this
<html id="test">
<body>
Some text node.
<div class="cool"><span class="try">This is another text node.</span></div>
Yet another test node.
</body>
</html>
my js code
function countText(node){
var counter = 0;
if(node.nodeType === 3){
counter+=node.nodeValue.length;
countText(node);
}
else{}
}
Now if I want to count the text nodes
console.log("count text : " + countText(document.getElementById("test"));
this should return me the count but its not working and moreover what should I put in else condition.
I never used nodeType so kind of having problem using it . Any help will be appreciated.
There are a couple of things wrong in your code:
Your HTML is malformed.
You are appending text to your counter instead of increasing it.
You never loop over the children of the a node, you always pass the same node to the recursive call.
You don't do anything if a node is not a text node.
This will work:
function countText(node){
var counter = 0;
if(node.nodeType === 3){
counter++;
}
else if(node.nodeType === 1) { // if it is an element node,
var children = node.childNodes; // examine the children
for(var i = children.length; i--; ) {
counter += countText(children[i]);
}
}
return counter;
}
alert(countText(document.body));
DEMO
Which number corresponds to which node type can be found here.
Update:
If you want to count the words, you have to split each text node into words first. In the following I assume that words are separated by white spaces:
if(node.nodeType === 3){
counter = node.nodeValue.split(/\s+/g).length;
}
Update 2
I know you want to use a recursive function, but if you want to count the words only, then there is a much easier and more efficient way:
function countWords(node){
// gets the text of the node and all its descendants
var text = node.innerText || node.textContent
return text.split(/\s+/g).length;
}
You want something like
function countTextNodes(node) {
var n = 0;
if(node.nodeType == 3)
n = 1;
for(var i = 0; i < node.childNodes.length; ++i)
n += countTextNodes(node.childNodes[i]);
return n;
}
This can be compressed into more compact code, but I went for legibility here.
Call this on the root in which you want to count text nodes. For example, to count text nodes throughout the entire document, you would want to call countTextNodes(document.getDocumentElement()).

What's the best way to strip out only the anchor HTML tags in javascript, given a string of html?

Let's say there's a string of HTML, with script tags, plain text, whatever.
What's the best way to strip out only the <a> tags?
I've been using some methods here, but these are for all tags. Strip HTML from Text JavaScript
Using jQuery:
var content = $('<div>' + htmlString + '</div>');
content.find('a').replaceWith(function() { return this.childNodes; });
var newHtml = content.html();
Adding a wrapping <div> tag allows us to get the desired HTML back.
I wrote a more detailed explanation on my blog.
This approach will preserve existing DOM nodes, minimizing side-effects if you have elements within the anchors that have events attached to them.
function unwrapAnchors() {
if(!('tagName' in this) || this.tagName.toLowerCase() != 'a' || !('parentNode' in this)) {
return;
}
var childNodes = this.childNodes || [], children = [], child;
// Convert childNodes collection to array
for(var i = 0, childNodes = this.childNodes || []; i < childNodes.length; i++) {
children[i] = childNodes[i];
}
// Move children outside element
for(i = 0; i < children.length; i++) {
child = children[i];
if(('tagName' in child) && child.tagName.toLowerCase() == 'a') {
child.parentNode.removeChild(child);
} else {
this.parentNode.insertBefore(child, this);
}
}
// Remove now-empty anchor
this.parentNode.removeChild(this);
}
To use (with jQuery):
$('a').each(unwrapAnchors);
To use (without jQuery):
var a = document.getElementsByTagName('a');
while(a.length) {
unwrapAnchors.call(a[a.length - 1]);
}
A <a> tag is not supposed to hold any other <a> tag, so a simple ungreedy regexp would do the trick (i.e. string.match(/<a>(.*?)<\/a>/), but this example suppose the tags have no attribute).
Here's a native (non-library) solution if performance is a concern.
function stripTag(str, tag) {
var a, parent, div = document.createElement('div');
div.innerHTML = str;
a = div.getElementsByTagName( tag );
while( a[0] ) {
parent = a[0].parentNode;
while (a[0].firstChild) {
parent.insertBefore(a[0].firstChild, a[0]);
}
parent.removeChild(a[0]);
}
return div.innerHTML;
}
Use it like this:
alert( stripTag( my_string, 'a' ) );

How to get text from all descendents of an element, disregarding scripts?

My current project involves gathering text content from an element and all of its descendants, based on a provided selector.
For example, when supplied the selector #content and run against this HTML:
<div id="content">
<p>This is some text.</p>
<script type="text/javascript">
var test = true;
</script>
<p>This is some more text.</p>
</div>
my script would return (after a little whitespace cleanup):
This is some text. var test = true; This is some more text.
However, I need to disregard text nodes that occur within <script> elements.
This is an excerpt of my current code (technically, it matches based on one or more provided selectors):
// get text content of all matching elements
for (x = 0; x < selectors.length; x++) { // 'selectors' is an array of CSS selectors from which to gather text content
matches = Sizzle(selectors[x], document);
for (y = 0; y < matches.length; y++) {
match = matches[y];
if (match.innerText) { // IE
content += match.innerText + ' ';
} else if (match.textContent) { // other browsers
content += match.textContent + ' ';
}
}
}
It's a bit simplistic in that it just returns all text nodes within the element (and its descendants) that matches the provided selector. The solution I'm looking for would return all text nodes except for those that fall within <script> elements. It doesn't need to be especially high-performance, but I do need it to ultimately be cross-browser compatible.
I'm assuming that I'll need to somehow loop through all children of the element that matches the selector and accumulate all text nodes other than ones within <script> elements; it doesn't look like there's any way to identify JavaScript once it's already rolled into the string accumulated from all of the text nodes.
I can't use jQuery (for performance/bandwidth reasons), although you may have noticed that I do use its Sizzle selector engine, so jQuery's selector logic is available.
function getTextContentExceptScript(element) {
var text= [];
for (var i= 0, n= element.childNodes.length; i<n; i++) {
var child= element.childNodes[i];
if (child.nodeType===1 && child.tagName.toLowerCase()!=='script')
text.push(getTextContentExceptScript(child));
else if (child.nodeType===3)
text.push(child.data);
}
return text.join('');
}
Or, if you are allowed to change the DOM to remove the <script> elements (which wouldn't usually have noticeable side effects), quicker:
var scripts= element.getElementsByTagName('script');
while (scripts.length!==0)
scripts[0].parentNode.removeChild(scripts[0]);
return 'textContent' in element? element.textContent : element.innerText;
EDIT:
Well first let me say im not too familar with Sizzle on its lonesome, jsut within libraries that use it... That said..
if i had to do this i would do something like:
var selectors = new Array('#main-content', '#side-bar');
function findText(selectors) {
var rText = '';
sNodes = typeof selectors = 'array' ? $(selectors.join(',')) : $(selectors);
for(var i = 0; i < sNodes.length; i++) {
var nodes = $(':not(script)', sNodes[i]);
for(var j=0; j < nodes.length; j++) {
if(nodes[j].nodeType != 1 && node[j].childNodes.length) {
/* recursion - this would work in jQ not sure if
* Sizzle takes a node as a selector you may need
* to tweak.
*/
rText += findText(node[j]);
}
}
}
return rText;
}
I didnt test any of that but it should give you an idea. Hopefully someone else will pipe up with more direction :-)
Cant you just grab the parent node and check the nodeName in your loop... like:
if(match.parentNode.nodeName.toLowerCase() != 'script' && match.nodeName.toLowerCase() != 'script' ) {
match = matches[y];
if (match.innerText) { // IE
content += match.innerText + ' ';
} else if (match.textContent) { // other browsers
content += match.textContent + ' ';
}
}
ofcourse jquery supports the not() syntax in selectors so could you just do $(':not(script)')?

Categories

Resources