javascript HTML from document.body.innerHTML - javascript

I am trying to build a string of the contents of a webpage, without HTML syntax (probably replace it with a space, so words are not all conjoined) or punctuation.
so say you have the code:
<body>
<h1>Content:</h1>
<p>paragraph 1</p>
<p>paragraph 2</p>
<script> alert("blah blah blah"); </script>
This is some text<br />
....and some more
</body>
I want to return the string:
var content = "Content paragraph 1 paragraph 2 this is some text and this is some more";
any idea how to do this? Thanks.

You can use the innerText property (instead of innerHTML, which returns the HTML tags as well):
// Look up the <body> element by tag name and read its innerText.
const bodyElements = document.getElementsByTagName("body");
const content = bodyElements[0].innerText;
However, note that this will also include new lines, so if you are after exactly what you specified in your question, you would need to remove them.

There is the W3C DOM 3 Core textContent property supported by some browsers, and the MS/HTML5 innerText property supported by other browsers (some support both). The content of the script element is likely unwanted, so a recursive traversal of the relevant part of the DOM tree seems best:
/**
 * Get the text within an element, skipping the content of script elements.
 *
 * Child nodes are visited in document order. Element children are descended
 * into recursively, text nodes contribute their `data`, and every other node
 * type is ignored. No normalising is done: the text is returned as found.
 *
 * @param {Node} element - Node whose `childNodes` are read.
 * @returns {string} The collected text, concatenated without separators.
 */
function getTextRecursive(element) {
  const parts = [];
  const children = element.childNodes;

  for (let i = 0; i < children.length; i++) {
    const child = children[i];

    // May need to add other node types here.
    if (child.nodeType === 1) {
      // Exclude script element content.
      if (child.tagName && child.tagName.toLowerCase() !== 'script') {
        // Recurse by name: arguments.callee throws in strict mode and in
        // ES modules, so it cannot be used here.
        parts.push(getTextRecursive(child));
      }
    // If working with XML, add nodeType 4 to get text from CDATA nodes.
    } else if (child.nodeType === 3) {
      // Deal with extra whitespace and returns in text here.
      parts.push(child.data);
    }
  }

  return parts.join('');
}

You'll need a striptags function in javascript for that and a regex to replace consecutive newlines with a single space.

You can try using the replace statement below
const str = "..your HTML..";
// Matches attribute-less opening/closing tags (<p>, </p>), attribute-less
// self-closing tags (<br />, <br/>) and line breaks. The "/" characters must
// be escaped inside a regex literal; unescaped, the snippet does not parse.
// Tags that carry attributes are not matched by this pattern.
const tagOrLineBreak = /<\/?[a-zA-Z0-9]+>|<[a-zA-Z0-9]+\s*\/>|\r?\n/g;
const content = str.replace(tagOrLineBreak, " ");
For the HTML that you have provided above, this will give you the following string in content
Content: paragraph 1 paragraph 2 alert("blah blah blah"); This is some text ....and some more

Related

How do you avoid editing tags inside of tags when replacing characters inside of a tag

So let's say you have HTML that looks something like this:
<p>This text is <b>bold</b></p>
and this Javascript:
// The replacement runs over the serialized markup of the first <p>, so the
// letter "b" is rewritten inside tag names as well as inside the text.
const firstParagraph = document.getElementsByTagName("p")[0];
firstParagraph.innerHTML = firstParagraph.innerHTML.replace(/b/gi, "g");
This will change the HTML to this:
<p>This text is <g>gold</g></p>
But what if I want to change it to:
<p>This text is <b>gold</b></p>
and leave the bold tags unchanged?
You'll want to use childNodes, and run your replace against each of those. That will capture text nodes separately from elements. By recursively looping through each child element until there are only text nodes left, you can safely run your replace command against all text in the element.
/**
 * Apply a text transformation to every text node below an element.
 *
 * Text nodes have their content passed through `replaceFn`; every other
 * child is descended into, so markup is never touched.
 *
 * @param {Node} elem - Node whose descendants are processed.
 * @param {function(string): string} replaceFn - Maps old text to new text.
 */
function safeReplace(elem, replaceFn) {
  for (const child of elem.childNodes) {
    if (child.nodeType === 3) {
      // nodeType 3 signifies a text node
      child.textContent = replaceFn(child.textContent);
      continue;
    }
    safeReplace(child, replaceFn);
  }
}
// Wire the demo button: each click wraps every "e"/"E" inside #testDiv.
const wrapEveryE = (str) => str.replace(/e/gi, '_E_');
const replaceButton = document.querySelector('button');
replaceButton.addEventListener('click', () => {
  safeReplace(document.getElementById('testDiv'), wrapEveryE);
});
<div id="testDiv">
This div has outer content, as well as <p>a paragraph containing a <span>spanned text</span> in the middle</p>More text.</div>
<button>Replace Text</button>
So there are two solutions: replace with a regex that keeps the matched groups, or use getElementsByTagName. Either way, you have to check the length of obs and the existence of bold before doing any operation on the elements, to make sure there are no errors.
// Both approaches read from the same collection of <p> elements, so it is
// looked up once instead of being declared twice.
const obs = document.getElementsByTagName("p");

// Approach 1: address the <b> element of the first paragraph directly.
// Guard against a missing <p> or <b> so the snippet cannot throw.
const bold = obs.length > 0 ? obs[0].getElementsByTagName("b")[0] : undefined;
if (bold) {
  // Plain text is being written, so textContent is sufficient.
  bold.textContent = "gold";
}

// Approach 2: rewrite the markup of the second paragraph, keeping the
// opening and closing tags through the capture groups $1 and $2.
if (obs.length > 1) {
  const replaceHtml = obs[1].innerHTML.replace(/(<b>).*?(<\/b>)/, "$1gold$2");
  obs[1].innerHTML = replaceHtml;
}
<p>This text is <b>bold</b></p>
<p>This text is <b>bold</b></p>

Replace text with link with chrome extension

I am trying to replace text on a webpage with links. When I try this it just replaces the text with the tag and not a link. For example this code will replace "river" with:
asdf
This is what I have so far:
/**
 * Replace every whole-word "river" in a text node's value.
 *
 * @param {Text} textNode - Node whose `nodeValue` is rewritten in place.
 */
function handleText(textNode) {
  textNode.nodeValue = textNode.nodeValue.replace(/\briver\b/g, 'asdf');
}
If all you wanted to do was change the text to other plain text, then you could change the contents of the text nodes directly. However, you want to add an <a> element. Each <a> element you add is effectively a new child element, and text nodes cannot have children. Thus, to do this you have to actually replace the text node with a more complicated structure. In doing so, you will want to make as little impact on the DOM as possible, in order not to disturb other scripts which rely on the current structure of the DOM. The simplest way to keep the impact small is to replace the text node with a <span> which contains the new text nodes (the text will be split around the new <a>) and any new <a> elements.
The code below should do what you desire. It replaces the textNode with a <span> containing the new text nodes and the created <a> elements. It only makes the replacement when one or more <a> elements need to be inserted.
/**
 * Escape the characters that would be parsed as markup when a plain-text
 * string is assigned to innerHTML.
 *
 * @param {string} text - Plain text taken from a text node.
 * @returns {string} The text with &, < and > replaced by entities.
 */
function escapeHtml(text) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Replace a text node with a <span> when its text contains the word "river".
 *
 * Nodes that are not text nodes, that are detached, or that sit directly
 * inside <script> or <style> are left alone. The DOM is only modified when
 * the replacement actually changed something.
 *
 * @param {Node} textNode - Candidate node.
 */
function handleTextNode(textNode) {
  const parent = textNode.parentNode;
  if (
    textNode.nodeName !== '#text' ||
    !parent ||
    parent.nodeName === 'SCRIPT' ||
    parent.nodeName === 'STYLE'
  ) {
    // Don't do anything except on attached text nodes which are not
    // children of <script> or <style>.
    return;
  }

  // The text is about to be parsed as HTML, so it is escaped first.
  // Otherwise any "<" or "&" shown on the page would turn into markup.
  const origHtml = escapeHtml(textNode.textContent);
  const newHtml = origHtml.replace(/\briver\b/g, 'asdf');

  // Only change the DOM if we actually made a replacement in the text.
  // Comparing the strings keeps the RegExp in one place for maintainability.
  if (newHtml === origHtml) {
    return;
  }

  const newSpan = document.createElement('span');
  newSpan.innerHTML = newHtml;
  parent.replaceChild(newSpan, textNode);
}
/**
 * Testing helper: walk the DOM of <body> and run handleTextNode on every
 * non-empty text node.
 */
function processDocument() {
  const skipEmptyText = {
    acceptNode(node) {
      // Alternately, could filter out the <script> and <style> text nodes here.
      return node.textContent.length === 0
        ? NodeFilter.FILTER_SKIP // Skip empty text nodes
        : NodeFilter.FILTER_ACCEPT;
    },
  };
  const walker = document.createTreeWalker(
    document.body,
    NodeFilter.SHOW_TEXT,
    skipEmptyText,
    false,
  );

  // Gather every text node before touching the DOM: handleTextNode replaces
  // nodes, and the walk is not expected to continue correctly once the DOM
  // has been modified underneath it.
  const collected = [];
  for (let current = walker.nextNode(); current; current = walker.nextNode()) {
    collected.push(current);
  }

  // Now that the list is fixed, process each text node in turn.
  for (const textNode of collected) {
    handleTextNode(textNode);
  }
}
// Run the replacement pass each time the #clickTo button is clicked.
const processButton = document.getElementById('clickTo');
processButton.addEventListener('click', processDocument, false);
<input type="button" id="clickTo" value="Click to process"/>
<div id="testDiv">This text should change to a link -->river<--.</div>
The TreeWalker code was taken from my answer here.

Get all text contents of a DOM element outside its children using Javascript

I have a DOM element that has some number of children, interleaved by text strings.
I want to get each of these text strings, and replace them using regex.
For example:
<div>Text started.
<a>Link child inside</a>
Some more text.
<u>Another child</u>
Closing text.</div>
In this example, I want to extract the strings "Text started.", "Some more text.", and "Closing text.", so that I can replace each of them later with something else.
The solution should be generic since the number of children inside the parent can vary, and the node types as well.
Anyone got a clever answer to achieve this easily using javascript?
You can use childNodes to check if the nodeType is a text node.
Doing this inside a forEach you can easily replace the text with whatever you want.
Example:
// Take the first <div> and rewrite the text of each of its direct text-node
// children, leaving child elements untouched.
const div = document.getElementsByTagName('div').item(0);
for (const node of [...div.childNodes]) {
  if (node.nodeType !== 3) {
    continue; // only text nodes are edited
  }
  node.textContent = node.textContent.replace(/text/i, ' Foo');
}
<div>
Text started.
<a>Link child inside</a>
Some more text.
<u>Another child</u>
Closing text.
</div>
You can do it as follows:
// Look the container up explicitly instead of relying on the implicit
// window-level global that browsers create for elements with an id.
const container = document.getElementById('test');

// Keep only the direct children that are text nodes, then log their text.
const textNodes = [...container.childNodes].filter(
  (child) => child.nodeType === Node.TEXT_NODE,
);
textNodes.forEach((tn) => console.log(tn.textContent));
<div id="test">Text started.
<a>Link child inside</a>
Some more text.
<u>Another child</u>
Closing text.</div>

How to remove a character not within an HTML element?

I know how to remove or manipulate a string within an HTML element, but how would I do that if it's outside of an HTML element? I'm trying to remove the (|) character but I'm not really sure how to do it using JavaScript.
<html>
<body>
<p>Hello</p>
|
<p>World</p>
|
<p>!</p>
</body>
You can use Node.nodeType in combination with $.fn.contents() and $.fn.filter()
The read-only Node.nodeType property returns an unsigned short integer representing the type of the node.
// Once the DOM is ready, drop every text node inside a <div> whose trimmed
// content is exactly "|".
$(document).ready(() => {
  const isLoneBar = (index, node) =>
    node.nodeType === Node.TEXT_NODE && node.textContent.trim() === '|';
  $('div').contents().filter(isLoneBar).remove();
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<div>
<p>Hello</p>
|
<p>World</p>
|
<p>!</p>
</div>
Using innerHTML with text replacement would probably be the easiest thing to wrap your head around, but it's really slow and really bad practice.
There are surely libraries that can do what you want, but I don't like requiring huge libraries for such simple tasks.
So here's a pure JavaScript way:
Text nodes are just like element nodes in terms of adding, removing and changing the contained text.
You can get a hold of the first "|" via document.body.firstElementChild.nextSibling, and then you can remove it from its parent or edit its contents like normal:
// The node right after the first element child of <body>, and the node two
// siblings further on.
const { nextSibling: firstBar } = document.body.firstElementChild;
const secondBar = firstBar.nextSibling.nextSibling;
// Remove the first text node
firstBar.parentNode.removeChild(firstBar);
// Replace the second by something else
secondBar.textContent = 'Derp';
[ Fiddle ]
If you need to, you can also check whether a node is a text node or something else by checking .nodeType:
// childNodes is a live list: removing the node at index i shifts every later
// node down by one. Walking forward would therefore skip the node that moves
// into the freed slot, so the list is walked from the end instead.
const bodyChildren = document.body.childNodes;
for (let i = bodyChildren.length - 1; i >= 0; i--) {
  if (bodyChildren[i].nodeType === 3) { // "3" means text node
    document.body.removeChild(bodyChildren[i]);
  }
}
Possible values of .nodeType and their meanings are listed at MDN, W3Schools or the DOM Level 1 Specification (pages 25-28).
Just iterate over the direct descendants of the node you're inspecting, <body> in this case, and remove each text node you encounter.
const root = document.body;
// Iterate over a snapshot: root.childNodes is live, so removing nodes while
// iterating it directly shifts the remaining entries and can skip nodes.
for (const node of [...root.childNodes]) {
  if (node.nodeType === 3) { // text node
    root.removeChild(node);
  }
}
<html>
<body>
<p>Hello</p>
|
<p>World</p>
|
<p>!</p>
</body>
Here's a vanilla-JavaScript, recursive solution that removes all text not within the "normal" nodes. By that, I mean text that is not within the following tags:
[ "a", "p", "script", "span", "b", "em", "strong", "i" ]
which are generally the ones that have only text nodes as children.
// Tags whose children are left alone: text inside these elements is kept.
const okayTypes = [ "a", "p", "script", "span", "b", "em", "strong", "i" ];

/**
 * Remove text nodes from a subtree, except text that sits inside one of the
 * `okayTypes` elements.
 *
 * A node named "#text" is detached from its parent. For any other node the
 * children are visited last-to-first, so removals do not disturb the
 * indexes still to be visited. Children whose tag is listed in `okayTypes`
 * are skipped entirely.
 *
 * @param {Node} node - Root of the subtree to clean.
 */
function removeAllText(node) {
  if (node.nodeName === "#text") {
    node.parentNode.removeChild(node);
    return;
  }
  if (!node.childNodes) {
    return;
  }

  let index = node.childNodes.length;
  while (index-- > 0) {
    const child = node.childNodes[index];
    const tag = String(child.tagName).toLowerCase();
    // Recurse only into children that are not allowed to hold plain text.
    if (!okayTypes.includes(tag)) {
      removeAllText(child);
    }
  }
}
// Start the removal at the document's <body> element.
removeAllText(document.body);
<body>
<p>do not remove 1</p>
remove1
<p>do not remove 2</p>
remove2
<p>do not remove 3</p>
</body>

How to change spaces to <br>s in a text node?

I am writing a Greasemonkey script in which in one place I'm converting all the spaces in the text into <br> so that the string appears more vertical in its cell in the table.
The HTML that I'm matching is:
<span>
<img class="icon itemicon" alt="Manual item" ...etc...>
Chapter 2 reading
<a title="Sort in descending order" href="index.php....."> etc </a>
</span>
Here is my code:
// Take the node after the span's first child and write the replaced string
// back through textContent.
const textChild = spanelm.firstChild.nextSibling;
const withBreakMarkup = textChild.textContent.replace(/ /g, "<br>");
textChild.textContent = withBreakMarkup;
My Greasemonkey script is finding the right element and doing the change successfully, but on the web page I see the <br> tags literally:
Chapter<br>2<br>reading
(In another place in my code, I'm doing a similar thing, which seems to work:
const firstChildOfSpan = spanelm.firstChild;
firstChildOfSpan.innerHTML = firstChildOfSpan.text.replace(/ /g, "<br>");
but I cannot get that to work in this case, with this organization of the HTML.)
I feel like I need to tell Greasemonkey to eval() the HTML or something...
A text node doesn't have an innerHTML, so you can't insert new tags that way. (You should be very careful about this kind of string-replace of any node using innerHTML -- it's only a matter of time before the code trashes nested elements, inline JS, attached event handlers, etc.)
To insert tags in a text node, you need to break up that node into new text nodes, interspersed with the tags you desire. In this case, replacing each run of spaces with a <br> would use code like this:
(See it in action at jsFiddle.)
// The text to split is the node that directly follows the icon image.
const precedingNode = document.querySelector ("span > img.icon.itemicon");
const node = precedingNode.nextSibling;
const container = node.parentNode;

// Each space-separated piece is re-inserted as its own text node followed by
// a <br>. Everything is placed ahead of the original node, which keeps the
// pieces in order; the original node is removed once all pieces are in.
for (const word of node.textContent.split (/ +/)) {
  container.insertBefore (document.createTextNode (word), node);
  container.insertBefore (document.createElement ("br"), node);
}
container.removeChild (node);

Categories

Resources