counting text node recursively using javascript - javascript

Let say I have a mark up like this
<html id="test">
<body>
Some text node.
<div class="cool"><span class="try">This is another text node.</span></div>
Yet another test node.
</body>
</html>
my js code
// Attempts to total the text length under `node`; as written it never returns a value.
function countText(node){
var counter = 0;
if(node.nodeType === 3){
counter+=node.nodeValue.length;
// Recurses with the very same node, so a text node re-enters this branch endlessly.
countText(node);
}
// Non-text nodes are ignored here; their children are never visited.
else{}
// No return statement: callers receive undefined.
}
Now if I want to count the text nodes
console.log("count text : " + countText(document.getElementById("test"));
this should return me the count but its not working and moreover what should I put in else condition.
I never used nodeType so kind of having problem using it . Any help will be appreciated.

There are a couple of things wrong in your code:
Your HTML is malformed.
You are adding the text's length to your counter instead of incrementing it by one.
You never loop over the children of a node; you always pass the same node to the recursive call.
You don't do anything if a node is not a text node.
This will work:
/**
 * Counts the text nodes in the subtree rooted at `node` (the root included).
 *
 * @param {Node} node - Root of the subtree to inspect.
 * @returns {number} Number of text nodes; 0 for any node that is neither
 *     a text node nor an element.
 */
function countText(node){
    // A text node counts as exactly one.
    if (node.nodeType === 3) {
        return 1;
    }
    // Only element nodes are descended into.
    if (node.nodeType !== 1) {
        return 0;
    }
    const kids = node.childNodes;
    let total = 0;
    let idx = kids.length;
    while (idx--) {
        total += countText(kids[idx]);
    }
    return total;
}
alert(countText(document.body));
DEMO
Which number corresponds to which node type can be found here.
Update:
If you want to count the words, you have to split each text node into words first. In the following I assume that words are separated by white spaces:
if(node.nodeType === 3){
// Count runs of non-whitespace. split(/\s+/) would also count the empty
// strings produced by leading/trailing whitespace, and yields [""] (length 1)
// for an empty text node.
counter = (node.nodeValue.match(/\S+/g) || []).length;
}
Update 2
I know you want to use a recursive function, but if you want to count the words only, then there is a much easier and more efficient way:
/**
 * Counts the whitespace-separated words in a node and all its descendants.
 *
 * @param {Node} node - Node whose text is inspected.
 * @returns {number} Number of words; 0 when the node has no text.
 */
function countWords(node){
    // Gets the text of the node and all its descendants. The final '' keeps
    // the call below safe when both properties are null/undefined.
    const text = node.innerText || node.textContent || '';
    // Match runs of non-whitespace instead of splitting on whitespace:
    // split() returns empty strings for leading/trailing whitespace and
    // [""] for empty text, which inflated the count.
    const words = text.match(/\S+/g);
    return words === null ? 0 : words.length;
}

You want something like
/**
 * Returns how many text nodes live in the subtree rooted at `node`,
 * counting `node` itself when it is a text node.
 *
 * @param {Node} node - Root of the subtree.
 * @returns {number} Count of text nodes.
 */
function countTextNodes(node) {
    const own = node.nodeType == 3 ? 1 : 0;
    // Fold the children's counts onto this node's own contribution.
    return Array.prototype.reduce.call(
        node.childNodes,
        function (sum, child) { return sum + countTextNodes(child); },
        own
    );
}
This can be compressed into more compact code, but I went for legibility here.
Call this on the root in which you want to count text nodes. For example, to count text nodes throughout the entire document, you would call countTextNodes(document.documentElement).

Related

Regex replace function not updating DOM in expected manner

I am trying to update my html using regex and javascript. I can capture the group I intend, however nothing on the dom is changing the way I would expect.
I want to discover all currency mentions of AUD, I have a regex that is capturing that. I then need to change how it is displayed.
I used the .replace function, however it does not seem to change anything. It only seems to change it in the console.log.
10AUD
Help us all
<p> 20THB</p>
<p> There is 100USD, here</p>
<p>More random sentances that should not be evalued even with 500 numbers in it</p>
<p>Here I can write a bunch like I had 300THB in my pocket and a spare 50AUD note aswell</p>
Here is the js portion.
// Matches one or more digits immediately followed by "AUD" (case-insensitive, global).
regAUD = RegExp(/([0-9]+(AUD))/gi);
// Scans every <p> element, logs its text and builds a bolded copy of any AUD amounts.
function checker() {
str = document.getElementsByTagName('p');
//str = str.innerText;
for (i = 0; i < str.length; i++) {
// Gives each paragraph an id of the form "<index>para".
str[i].id = String(i+"para");
console.log(str[i].innerText);
inner = str[i].innerText;
//Now make an if statement to if we go further
//res = isCurrency(inner); //This passes correctly
res = 1;
if(res === 1){
if(regAUD.test(inner)){
// Only the local string `inner` is changed here; nothing is written back to str[i],
// so the result is visible in the console but not in the page.
inner = inner.replace(regAUD, '<b>$1</b>');
console.log(inner);
console.log("Done?");
}
}
}
}
I assume I am using the function incorrectly. It does show up in the console but not in the elements expected. It doesn't change anything, ie I expect it to currently make 10AUD and 500AUD bold but it does not do this. Is there a better method to achieve this change in the DOM or have I used this function incorrectly?
You forgot to assign your new HTML string back to the elements' innerHTML.
Also, you could have a shorter function.
Here's one with two parameters:
selector to execute the highlight on wanted elements.
currency to highlight the wanted currency.
// Highlights "AUD" mentions within all <p> elements.
highlightCurrencyFromElement('p', 'aud');
/**
 * Wraps every "<digits><optional whitespace><currency>" mention inside the
 * elements matched by `selector` in <b> tags.
 *
 * NOTE: the replacement runs on innerHTML, so a mention inside an attribute
 * value would be rewritten as well.
 *
 * @param {string} selector - CSS selector of the elements to process.
 * @param {string} currency - Currency marker, matched literally and
 *     case-insensitively (e.g. 'aud', '$').
 */
function highlightCurrencyFromElement(selector, currency) {
    // Escape regex metacharacters so markers such as "$" are matched
    // literally instead of being interpreted as regex syntax.
    const escaped = String(currency).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`(\\d+\\s*${escaped})`, 'gi');
    const elements = document.querySelectorAll(selector);
    for (let i = 0; i < elements.length; i++) {
        const before = elements[i].innerHTML;
        const after = before.replace(pattern, '<b>$1</b>');
        // Assigning innerHTML rebuilds the element's subtree, so only do it
        // when the replacement actually changed something.
        if (after !== before) {
            elements[i].innerHTML = after;
        }
    }
}
<div>
<p>5</p>
<p>21 AUD</p>
</div>
<div>
<p>50AUD</p>
<p>50 €</p>
<p>87 USD</p>
<span>20 AUD (in span, so no highlighted)</span>
</div>
You have to set the paragraph's innerHTML property after the regex replacement for the change to appear in the DOM.
// Matches one or more digits immediately followed by "AUD", e.g. "10AUD"
// (case-insensitive, global).
const regAUD = /([0-9]+(AUD))/gi;

/**
 * Bolds every AUD amount inside each <p> element of the document and gives
 * each paragraph the id "<index>para".
 *
 * NOTE: the replacement runs on innerHTML, so a match inside an attribute
 * value would be rewritten as well.
 */
function checker() {
    const paragraphs = document.getElementsByTagName('p');
    for (let i = 0; i < paragraphs.length; i++) {
        const paragraph = paragraphs[i];
        paragraph.id = i + "para";
        console.log(paragraph.innerText);
        // Work on innerHTML rather than innerText: writing plain text back
        // as HTML would drop nested markup and un-escape characters such
        // as "&lt;".
        const html = paragraph.innerHTML;
        const highlighted = html.replace(regAUD, '<b>$1</b>');
        // Touch the DOM only when at least one amount was found.
        if (highlighted !== html) {
            paragraph.innerHTML = highlighted;
            console.log(highlighted);
            console.log("Done?");
        }
    }
}

Understanding the recursive function

Maybe this function is very simple for you, but I have trouble understanding how it works. Please explain how this function is evaluated.
Is the count reset to zero on every run of the loop?
/**
 * Returns the total number of characters in all text nodes at or below `elm`.
 *
 * @param {Node} elm - Node to measure.
 * @returns {number} Combined length of the contained text.
 */
function countChars(elm) {
    // `child` must be declared: reading it undeclared in the loop
    // initialiser throws a ReferenceError.
    var i, child, count = 0;
    if (elm.nodeType == 3) { // TEXT_NODE
        return elm.nodeValue.length;
    }
    // The loop ends when childNodes[i] is undefined, i.e. past the last child.
    for (i = 0; (child = elm.childNodes[i]); i++) {
        count += countChars(child);
    }
    return count;
}
Some Text
The DOM will look like this:
[
Textnode {"Some Text"}
]
The upper code is valid HTML, actually it is a text node. So if we want to get its length we simply take its length:
if (elm.nodeType == 3) { // TEXT_NODE
return elm.nodeValue.length;
}
Lets imagine a more complicated case:
Some Text <span> nested </span>
DOM:
[
Textnode {"Some Text"},
Span {
children:[
Textnode {"nested"}
]
}
]
Now we've got three nodes: a text node, a span node, and a text node nested in the span node. They're all nested in some kind of body. So to get the text length, we need to iterate over the nodes and then go deeper (have a look into tree traversal):
var count = 0;
for (var i = 0, child; child = elm.childNodes[i]; i++) {
count += countChars(child);
}
return count;
So the upper example will work like this:
count=0
iterate over body:
child textnode:
count+="Some Text".length
child span:
inner count=0
iterate over span:
child textnode
inner count+="nested".length
count+=inner count
finished
/**
 * Counts the characters of a given node, including the text of all of
 * its descendants.
 *
 * @param {Node} elm - The current node to work on.
 * @returns {number} Combined length of every text node at or below `elm`.
 */
function countChars(elm) {
    // Check whether the current node is text. A text node cannot have
    // children, so simply return the length of the text in this node.
    if (elm.nodeType == 3) { // TEXT_NODE
        return elm.nodeValue.length;
    }
    // Not a text node, so drill down into all of its children and add up
    // the number of characters each of them contains.
    let count = 0;
    // `child` is declared in the loop header; reading it undeclared here
    // throws a ReferenceError. The loop stops once childNodes[i] is
    // undefined, i.e. past the last child.
    for (let i = 0, child; (child = elm.childNodes[i]); i++) {
        // Each recursive call starts with its own count of 0 and returns
        // the total for that child's subtree, which is added to the total
        // collected from the previous children.
        count += countChars(child);
    }
    return count;
}

Drop empty childrens using Javascript

I try to drop empty nodes using JavaScript, because the navigator count the return line or space like a child node
So, I make this try, first it's work correctly but if I count the nodes for the second time it's give me a number ( contains the empty nodes ) despite I make removechild
My code :
// Tries to remove empty text nodes from <body>, then alerts the number of child nodes left.
function testing(){
var c = document.body.childNodes;
// `i` is never declared, so it becomes an implicit global.
// The loop walks forward while nodes may be removed from the same list.
for(i=0;i<c.length;i++){
if(c[i].nodeName == "#text")
{
var rest = c[i].textContent;
// Only a zero-length string passes this test; text made of spaces or
// line breaks has a length greater than 0 and is therefore kept.
if(rest.length == 0){
//I want to remove the empty nodes
document.body.removeChild(document.body.childNodes[i]);
}
}
}
//this give a number contains the empty nodes
var d =document.body.childNodes.length;
alert(d);
}
My HTML code
<body onclick="testing()">
<p>test</p>
<p>test1</p>
</body>
If I try to know the length of this HTML code, the navigator give me 5 not 2 (it's count the return line and space)
So, for that I want to delete the empty nodes the take result 2
if(c[i].nodeName == "#text")
At this line you check if it is a #text node but in your html are p elements.
var rest = c[i].textContent;
if(rest.length == 0){
//I want to remove the empty nodes
document.body.removeChild(document.body.childNodes[i]);
}
Also with those lines you check if the content of the element is empty which is also not the case in both of your p elements.
So your alert should correctly contain 2.
If your problem is that you have text elements which contain whitespaces and newlines in your body element you can check for them with regex.
Just change this line
if(rest.length == 0){
To
if(/^[\s\n]{0,}$/.test(rest)){
And for the compleatness sake:
change the following line
for(i=0;i<c.length;i++){
To
for(i=len -1;i>=0;i--){
As described in @Bindrid's answer, because otherwise you can miss elements.
So the compleate code should be
/**
 * Removes every text node of <body> that is empty or contains only
 * whitespace, then alerts the number of remaining child nodes.
 */
function testing(){
    const body = document.body;
    const nodes = body.childNodes;
    // Start at the last valid index (length - 1); nodes[length] is undefined
    // and reading .nodeName from it throws. Walking backwards means a
    // removal never shifts the nodes still to be visited.
    for (let i = nodes.length - 1; i >= 0; i--) {
        const node = nodes[i];
        if (node.nodeName == "#text" && /^\s*$/.test(node.textContent)) {
            body.removeChild(node);
        }
    }
    alert(body.childNodes.length);
}
An element is considered empty when it has no content at all; whitespace and line breaks count as content. So you have two options, depending on what you need: if you need truly empty elements, use document.querySelectorAll(':empty'); otherwise, use document.body.childNodes and check whether each node's content is empty with String.prototype.trim.
Hope it helps.
<div>
</div>
<section></section>
/**
 * Tells whether the given node is a text node.
 *
 * @param {Node} element - Node to test.
 * @returns {boolean} true when nodeType is 3.
 */
function isTextNode(element) {
    const TEXT_NODE = 3;
    return element.nodeType === TEXT_NODE;
}

/**
 * Detaches `element` from its parent when it is a text node whose trimmed
 * content is empty; every other node is left alone.
 *
 * @param {Node} element - Candidate node.
 */
function removeEmptyTextNodes(element) {
    const isBlankText =
        isTextNode(element) && (element.textContent || "").trim().length === 0;
    if (isBlankText) {
        console.log("removing", element);
        element.parentNode.removeChild(element);
    }
}
// Iterate over a snapshot of the children: childNodes is a live list, so
// removing a node while iterating it directly shifts the following node into
// the current index and that node is then skipped.
Array.from(document.body.childNodes).forEach(removeEmptyTextNodes);
<div>
</div>
<span></span>
Sounds like you're overthinking it. You've observed the difference between childNodes and children, which is that childNodes contains all nodes, including text nodes consisting entirely of whitespace, while children is a collection of just the child nodes that are elements. That's really all there is to it.
/**
 * Alerts the number of element children of <body>, removes those whose
 * innerHTML is empty, then alerts the number of element children left.
 */
function testing(){
    const body = document.body;
    alert(body.children.length);
    const kids = body.children;
    // `children` is a live collection: iterate from the end so that removing
    // an element never shifts the ones still to be checked. A forward loop
    // skips the sibling that follows each removed element.
    for (let i = kids.length - 1; i >= 0; i--) {
        if (kids[i].innerHTML == "") {
            body.removeChild(kids[i]);
        }
    }
    alert(body.children.length);
}
<body onclick="testing()">
<p>test</p>
<p>test1</p>
<p></p>
ss
</body>
for(i=0;i<c.length;i++){
if(c[i].nodeName == "#text")
{
var rest = c[i].textContent;
if(rest.length == 0){
//I want to remove the empty nodes
document.body.removeChild(document.body.childNodes[i]);
}
}
}
should actually be as shown below to take in account that you are removing nodes as you run through the list. If node 7 and 8 are empty and you remove 7, 8 becomes 7 while you are in the 7 loop so 8 as the new 7 gets missed on the next loop.
var len = c.length;
for(i=len -1;i>=0;i--){
if(c[i].nodeName == "#text")
{
var rest = c[i].textContent;
if(rest.length == 0){
//I want to remove the empty nodes
document.body.removeChild(document.body.childNodes[i]);
}
}
}

can adjacent text nodes in the DOM be merged with Javascript?

Suppose I have a sentence in the webpage DOM that when I examine it, consists of 3 text nodes followed by perhaps some element like BOLD or ITALIC. I want to merge the text nodes into one text node, since having adjacent text nodes is meaningless - there is no reason to have them. Is there a way to merge them easily?
Thanks
It seems that Node.normalize() is doing exactly what you want.
You can refer to: Node.normalize()
Maybe this will help you:
// Container whose children will be merged.
var parentNode = document.getElementById('pelements');
// Despite its name, this is a new <p> element (not a text node) that collects the merged text.
var textNode = document.createElement('p');
// Append the text of each child to the new element, then remove that child.
while (parentNode.firstChild) {
textNode.textContent += parentNode.firstChild.textContent;
parentNode.removeChild(parentNode.firstChild);
}
// The now-empty container receives the single merged <p>.
parentNode.appendChild(textNode);
<div id="pelements">
<p>A</p>
<p>B</p>
<p>C</p>
</div>
It is possible, but you need to specify the parent element. It should be possible to traverse the whole DOM and every node, but if you can avoid that, it would be better.
// Direct children of <body>; only adjacent text nodes at this level are merged.
const nodes = document.body.childNodes;
// Text nodes whose content has been folded into an earlier sibling.
const nodesToDelete = [];

/**
 * Concatenates the text of `node` and of every text node that directly
 * follows it, queueing those followers in `nodesToDelete`.
 *
 * @param {Node} node - First text node of the run.
 * @param {string} prevText - Text accumulated so far; prefixed to the result.
 * @returns {string} prevText followed by the text of the whole run.
 */
function combineTextNodes(node, prevText) {
    let text = prevText + node.nodeValue;
    let next = node.nextSibling;
    while (next && next.nodeType == 3) {
        nodesToDelete.push(next);
        text += next.nodeValue;
        next = next.nextSibling;
    }
    return text;
}

for (let i = 0; i < nodes.length; i++) {
    // Skip nodes already absorbed into an earlier sibling; reprocessing them
    // would queue their followers a second time.
    if (nodes[i].nodeType == 3 && !nodesToDelete.includes(nodes[i])) {
        nodes[i].nodeValue = combineTextNodes(nodes[i], '');
    }
}
// Removal happens after the scan so the list is not mutated while being walked.
for (const absorbed of nodesToDelete) {
    console.log(absorbed);
    absorbed.remove();
}

How to get text from all descendents of an element, disregarding scripts?

My current project involves gathering text content from an element and all of its descendants, based on a provided selector.
For example, when supplied the selector #content and run against this HTML:
<div id="content">
<p>This is some text.</p>
<script type="text/javascript">
var test = true;
</script>
<p>This is some more text.</p>
</div>
my script would return (after a little whitespace cleanup):
This is some text. var test = true; This is some more text.
However, I need to disregard text nodes that occur within <script> elements.
This is an excerpt of my current code (technically, it matches based on one or more provided selectors):
// get text content of all matching elements
// NOTE(review): x, y, matches, match and content are not declared in this
// excerpt; presumably they are declared elsewhere — confirm.
for (x = 0; x < selectors.length; x++) { // 'selectors' is an array of CSS selectors from which to gather text content
matches = Sizzle(selectors[x], document);
for (y = 0; y < matches.length; y++) {
match = matches[y];
// Each match contributes its text followed by a single space; a match
// with neither property set (or with empty text) contributes nothing.
if (match.innerText) { // IE
content += match.innerText + ' ';
} else if (match.textContent) { // other browsers
content += match.textContent + ' ';
}
}
}
It's a bit simplistic in that it just returns all text nodes within the element (and its descendants) that matches the provided selector. The solution I'm looking for would return all text nodes except for those that fall within <script> elements. It doesn't need to be especially high-performance, but I do need it to ultimately be cross-browser compatible.
I'm assuming that I'll need to somehow loop through all children of the element that matches the selector and accumulate all text nodes other than ones within <script> elements; it doesn't look like there's any way to identify JavaScript once it's already rolled into the string accumulated from all of the text nodes.
I can't use jQuery (for performance/bandwidth reasons), although you may have noticed that I do use its Sizzle selector engine, so jQuery's selector logic is available.
/**
 * Returns the concatenated text of `element`'s descendants, leaving out
 * anything that sits inside a <script> element.
 *
 * @param {Node} element - Root whose children are inspected.
 * @returns {string} Text of all text nodes outside <script> elements.
 */
function getTextContentExceptScript(element) {
    const pieces = Array.prototype.map.call(element.childNodes, function (child) {
        // Text nodes contribute their data directly.
        if (child.nodeType === 3) {
            return child.data;
        }
        // Comments and other non-element nodes contribute nothing.
        if (child.nodeType !== 1) {
            return '';
        }
        // Elements are descended into unless they are <script>.
        return child.tagName.toLowerCase() === 'script'
            ? ''
            : getTextContentExceptScript(child);
    });
    return pieces.join('');
}
Or, if you are allowed to change the DOM to remove the <script> elements (which wouldn't usually have noticeable side effects), quicker:
var scripts= element.getElementsByTagName('script');
while (scripts.length!==0)
scripts[0].parentNode.removeChild(scripts[0]);
return 'textContent' in element? element.textContent : element.innerText;
EDIT:
Well, first let me say I'm not too familiar with Sizzle on its own, just within libraries that use it... That said...
if i had to do this i would do something like:
// Example selector list for findText().
const selectors = ['#main-content', '#side-bar'];

/**
 * Collects the text of every element matched by `selectors`, ignoring text
 * that sits inside <script> elements.
 *
 * Relies on a global `$` selector function (jQuery or Sizzle-compatible)
 * that returns an array-like of matched elements.
 *
 * @param {string|string[]} selectors - One CSS selector or an array of them.
 * @returns {string} Concatenated text of all matches.
 */
function findText(selectors) {
    // Depth-first walk that gathers text nodes and skips <script> subtrees.
    function textOf(node) {
        let text = '';
        for (let i = 0; i < node.childNodes.length; i++) {
            const child = node.childNodes[i];
            if (child.nodeType === 3) {
                text += child.nodeValue;
            } else if (child.nodeType === 1 && child.nodeName.toLowerCase() !== 'script') {
                text += textOf(child);
            }
        }
        return text;
    }

    // `typeof` never yields 'array', and the original `=` made the line a
    // syntax error; Array.isArray is the correct test.
    const sNodes = Array.isArray(selectors) ? $(selectors.join(',')) : $(selectors);
    let rText = '';
    for (let i = 0; i < sNodes.length; i++) {
        rText += textOf(sNodes[i]);
    }
    return rText;
}
I didnt test any of that but it should give you an idea. Hopefully someone else will pipe up with more direction :-)
Cant you just grab the parent node and check the nodeName in your loop... like:
if(match.parentNode.nodeName.toLowerCase() != 'script' && match.nodeName.toLowerCase() != 'script' ) {
match = matches[y];
if (match.innerText) { // IE
content += match.innerText + ' ';
} else if (match.textContent) { // other browsers
content += match.textContent + ' ';
}
}
ofcourse jquery supports the not() syntax in selectors so could you just do $(':not(script)')?

Categories

Resources