Match all whitespace in an HTML string with JavaScript - javascript

Lets say you have an HTML string like this:
<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>
And need to place <br/> elements after every space character.... which I was doing with:
HTMLtext.replace(/\s{1,}/g, ' <br/>');
However, the problem is that this inserts breaks after space characters in-between tags (between tag properties) too and I'd of course like to do this for tag textual contents only. Somehow I was always really bad with regular expressions - could anyone help out?
So basically do my original whitespace match but only if its not between < and > ?

Regex is not a good tool for this. You should be working with the DOM, not with the raw HTML string.
For a quick-and-dirty solution that presupposes that there are no < or > character in your string except those delimiting a tag, you can try this, though:
result = subject.replace(/\s+(?=[^<>]*<)/g, "$&<br/>");
This inserts a <br/> after whitespace only if the next angle bracket is an opening angle bracket.
Explanation:
\s+ # Match one or more whitespace characters (including newlines!)
(?= # but only if (positive lookahead assertion) it's possible to match...
[^<>]* # any number of non-angle brackets
< # followed by an opening angle bracket
) # ...from this position in the string onwards.
Replace that with $& (which contains the matched characters) plus <br/>.
This regex does not check if there is a > further behind, as this would require a positive look*behind* assertion, and JavaScript does not support these. So you can't check for that, but if you control the HTML and are sure that the conditions I mentioned above are met, that shouldn't be a problem.

See this answer for iterating the dom and replacing whitespaces with <br /> elements. The adapted code would be:
(function iterate_node(node) {
if (node.nodeType === 3) { // Node.TEXT_NODE
var text = node.data,
words = text.split(/\s/);
if (words.length > 1) {
node.data = words[0];
var next = node.nextSibling,
parent = node.parentNode;
for (var i=1; i<words.length; i++) {
var tnode = document.createTextNode(words[i]),
br = document.createElement("br");
parent.insertBefore(br, next);
parent.insertBefore(tnode, next);
}
}
} else if (node.nodeType === 1) { // Node.ELEMENT_NODE
for (var i=node.childNodes.length-1; i>=0; i--) {
iterate_node(node.childNodes[i]); // run recursive on DOM
}
}
})(content); // any dom node
(Demo at jsfiddle.net)

Okay, so you don't want to match spaces inside HTML tags. Only regular expressions isn't sufficient for this. I'll use a lexer to do the job. You can see the output here.
var lexer = new Lexer;
var result = "";
lexer.addRule(/</, function (c) { // start of a tag
this.state = 2; // go to state 2 - exclusive tag state
result += c; // copy to output
});
lexer.addRule(/>/, function (c) { // end of a tag
this.state = 0; // go back to state 0 - initial state
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/\s+/, function () { // match one or more spaces
result += "<br/>"; // replace with "<br/>"
});
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}); // everything else
lexer.input = '<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>';
lexer.lex();
Of course, a lexer is a very powerful tool. You may also skip angled brackets inside the value of an attribute in a tag. However I'll leave that for you to implement. Good luck.

Related

Split a string of HTML into an array by particular tags

Given this HTML as a string "html", how can I split it into an array where each header <h marks the start of an element?
Begin with this:
<h1>A</h1>
<h2>B</h2>
<p>Foobar</p>
<h3>C</h3>
Result:
["<h1>A</h1>", "<h2>B</h2><p>Foobar</p>", "<h3>C</h3>"]
What I've tried:
I wanted to use Array.split() with a regex, but the result splits each <h into its own element. I need to figure out how to capture from the start of one <h until the next <h. Then include the first one but exclude the second one.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
var foo = html.split(/(<h)/);
Edit: Regex is not a requirement in anyway, it's just the only solution that I thought would work for generally splitting HTML strings in this way.
In your example you can use:
/
<h // Match literal <h
(.) // Match any character and save in a group
> // Match literal <
.*? // Match any character zero or more times, non greedy
<\/h // Match literal </h
\1 // Match what previous grouped in (.)
> // Match literal >
/g
var str = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>'
str.match(/<h(.)>.*?<\/h\1>/g); // ["<h1>A</h1>", "<h2>B</h2>", "<h3>C</h3>"]
But please don't parse HTML with regexp, read RegEx match open tags except XHTML self-contained tags
From the comments to the question, this seems to be the task:
I'm taking dynamic markdown that I'm scraping from GitHub. Then I want to render it to HTML, but wrap every title element in a ReactJS <WayPoint> component.
The following is a completely library-agnostic, DOM-API based solution.
function waypointify(html) {
var div = document.createElement("div"), nodes;
// parse HTML and convert into an array (instead of NodeList)
div.innerHTML = html;
nodes = [].slice.call(div.childNodes);
// add <waypoint> elements and distribute nodes by headings
div.innerHTML = "";
nodes.forEach(function (node) {
if (!div.lastChild || /^h[1-6]$/i.test(node.nodeName)) {
div.appendChild( document.createElement("waypoint") );
}
div.lastChild.appendChild(node);
});
return div.innerHTML;
}
Doing the same in a modern library with less lines of code is absolutely possible, see it as a challenge.
This is what it produces with your sample input:
<waypoint><h1>A</h1></waypoint>
<waypoint><h2>B</h2><p>Foobar</p></waypoint>
<waypoint><h3>C</h3></waypoint>
I'm sure someone could reduce the for loop to put the angle brackets back in but this is how I'd do it.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
//split on ><
var arr = html.split(/></g);
//split removes the >< so we need to determine where to put them back in.
for(var i = 0; i < arr.length; i++){
if(arr[i].substring(0, 1) != '<'){
arr[i] = '<' + arr[i];
}
if(arr[i].slice(-1) != '>'){
arr[i] = arr[i] + '>';
}
}
Additionally, we could actually remove the first and last bracket, do the split and then replace the angle brackets to the whole thing.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
//remove first and last characters
html = html.substring(1, html.length-1);
//do the split on ><
var arr = html.split(/></g);
//add the brackets back in
for(var i = 0; i < arr.length; i++){
arr[i] = '<' + arr[i] + '>';
}
Oh, of course this will fail with elements that have no content.
Hi I used this function to convert html String Dom in array
static getArrayTagsHtmlString(str){
let htmlSplit = str.split(">")
let arrayElements = []
let nodeElement =""
htmlSplit.forEach((element)=>{
if (element.includes("<")) {
nodeElement = element+">"
}else{
nodeElement = element
}
arrayElements.push(nodeElement)
})
return arrayElements
}
Happy code

How to remove string between two characters every time they occur [duplicate]

This question already has answers here:
Strip HTML from Text JavaScript
(44 answers)
removing html tags from string
(3 answers)
Closed 7 years ago.
I need to get rid of any text inside < and >, including the two delimiters themselves.
So for example, from string
<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>​
I would like to get this one
that
This is what i've tried so far:
var str = annotation.split(' ');
str.substring(str.lastIndexOf("<") + 1, str.lastIndexOf(">"))
But it doesn't work for every < and >.
I'd rather not use RegEx if possible, but I'm happy to hear if it's the only option.
You can simply use the replace method with /<[^>]*>/g.It matches < followed by [^>]* any amount of non> until > globally.
var str = '<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>';
str = str.replace(/<[^>]*>/g, "");
alert(str);
For string removal you can use RegExp, it is ok.
"<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>​".replace(/<\/?[^>]+>/g, "")
Since the text you want is always after a > character, you could split it at that point, and then the first character in each String of the array would be the character you need. For example:
String[] strings = stringName.split("<");
String word = "";
for(int i = 0; i < strings.length; i++) {
word += strings[i].charAt(0);
}
This is probably glitchy right now, but I think this would work. You don't need to actually remove the text between the "<>"- just get the character right after a '>'
Using a regular expression is not the only option, but it's a pretty good option.
You can easily parse the string to remove the tags, for example by using a state machine where the < and > characters turns on and off a state of ignoring characters. There are other methods of course, some shorter, some more efficient, but they will all be a few lines of code, while a regular expression solution is just a single replace.
Example:
function removeHtml1(str) {
return str.replace(/<[^>]*>/g, '');
}
function removeHtml2(str) {
var result = '';
var ignore = false;
for (var i = 0; i < str.length; i++) {
var c = str.charAt(i);
switch (c) {
case '<': ignore = true; break;
case '>': ignore = false; break;
default: if (!ignore) result += c;
}
}
return result;
}
var s = "<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>";
console.log(removeHtml1(s));
console.log(removeHtml2(s));
There are several ways to do this. Some are better than others. I haven't done one lately for these two specific characters, so I took a minute and wrote some code that may work. I will describe how it works. Create a function with a loop that copies an incoming string, character by character, to an outgoing string. Make the function a string type so it will return your modified string. Create the loop to scan from incoming from string[0] and while less than string.length(). Within the loop, add an if statement. When the if statement sees a "<" character in the incoming string it stops copying, but continues to look at every character in the incoming string until it sees the ">" character. When the ">" is found, it starts copying again. It's that simple.
The following code may need some refinement, but it should get you started on the method described above. It's not the fastest and not the most elegant but the basic idea is there. This did compile, and it ran correctly, here, with no errors. In my test program it produced the correct output. However, you may need to test it further in the context of your program.
string filter_on_brackets(string str1)
{
string str2 = "";
int copy_flag = 1;
for (size_t i = 0 ; i < str1.length();i++)
{
if(str1[i] == '<')
{
copy_flag = 0;
}
if(str1[i] == '>')
{
copy_flag = 2;
}
if(copy_flag == 1)
{
str2 += str1[i];
}
if(copy_flag == 2)
{
copy_flag = 1;
}
}
return str2;
}

JavaScript Replace Text with HTML Between it

I want to replace some text in a webpage, only the text, but when I replace via the document.body.innerHTML I could get stuck, like so:
HTML:
<p>test test </p>
<p>test2 test2</p>
<p>test3 test3</p>
Js:
var param = "test test test2 test2 test3";
var text = document.body.innerHTML;
document.body.innerHTML = text.replace(param, '*' + param + '*');
I would like to get:
*test test
test2 test2
test3* test3
HTML of 'desired' outcome:
<p>*test test </p>
<p>test2 test2</p>
<p>test3* test3</p>
So If I want to do that with the parameter above ("test test test2 test2 test3") the <p></p> would not be taken into account - resulting into the else section.
How can I replace the text with no "consideration" to the html markup that could be between it?
Thanks in advance.
Edit (for #Sonesh Dabhi):
Basically I need to replace text in a webpage, but when I scan the
webpage with the html in it the replace won't work, I need to scan and
replace based on text only
Edit 2:
'Raw' JavaScript Please (no jQuery)
This will do what you want, it builds a regex expression to find the text between tags and replace in there. Give it a shot.
http://jsfiddle.net/WZYG9/5/
The magic is
(\s*(?:<\/?\w+>)*\s*)*
Which, in the code below has double backslashes to escape them within the string.
The regex itself looks for any number of white space characters (\s). The inner group (?:</?\w+>)* matches any number of start or end tags. ?: tells java script to not count the group in the replacement string, and not remember the matches it finds. < is a literal less than character. The forward slash (which begins an end html tag) needs to be escaped, and the question mark means 0 or 1 occurrence. This is proceeded by any number of white space characters.
Every space within the "text to search" get replaced with this regular expression, allowing it to match any amount of white space and tags between the words in the text, and remember them in the numbered variables $1, $2, etc. The replacement string gets built to put those remembered variables back in.
Which matches any number of tags and whitespace between them.
function wrapTextIn(text, character) {
if (!character) character = "*"; // default to asterik
// trim the text
text = text.replace(/(^\s+)|(\s+$)/g, "");
//split into words
var words = text.split(" ");
// return if there are no words
if (words.length == 0)
return;
// build the regex
var regex = new RegExp(text.replace(/\s+/g, "(\\s*(?:<\\/?\\w+>)*\\s*)*"), "g");
//start with wrapping character
var replace = character;
//for each word, put it and the matching "tags" in the replacement string
for (var i = 0; i < words.length; i++) {
replace += words[i];
if (i != words.length - 1 & words.length > 1)
replace += "$" + (i + 1);
}
// end with the wrapping character
replace += character;
// replace the html
document.body.innerHTML = document.body.innerHTML.replace(regex, replace);
}
WORKING DEMO
USE THAT FUNCTION TO GET TEXT.. no jquery required
First remove tags. i.e You can try document.body.textContent / document.body.innerText or use this example
var StrippedString = OriginalString.replace(/(<([^>]+)>)/ig,"");
Find and replace (for all to be replace add 1 more thing "/g" after search)
String.prototype.trim=function(){return this.replace(/^\s\s*/, '').replace(/\s\s*$/, '');};
var param = "test test test2 test2 test3";
var text = (document.body.textContent || document.body.innerText).trim();
var replaced = text.search(param) >= 0;
if(replaced) {
var re = new RegExp(param, 'g');
document.body.innerHTML = text.replace(re , '*' + param + '*');
} else {
//param was not replaced
//What to do here?
}
See here
Note: Using striping you will lose the tags.

Help write regex that will surround certain text with <strong> tags, only if the <strong> tag isn't present

I have several posts on a website; all these posts are chat conversations of this type:
AD: Hey!
BC: What's up?
AD: Nothing
BC: Okay
They're marked up as simple paragraphs surrounded by <p> tags.
Using the javascript replace function, I want all instances of "AD" in the beginning of a conversation (ie, all instances of "AD" at the starting of a line followed by a ":") to be surrounded by <strong> tags, but only if the instance isn't already surrounded by a <strong> tag.
What regex should I use to accomplish this? Am I trying to do what this advises against?
The code I'm using is like this:
var posts = document.getElementsByClassName('entry-content');
for (var i = 0; i < posts.length; i++) {
posts[i].innerHTML = posts[i].innerHTML.replace(/some regex here/,
'replaced content here');
}
If AD: is always at the start of a line then the following regex should work, using the m switch:
.replace(/^AD:/gm, "<strong>AD:</strong>");
You don't need to check for the existence of <strong> because ^ will match the start of the line and the regex will only match if the sequence of characters that follows the start of the line are AD:.
You're not going against the "Don't use regex to parse HTML" advice because you're not parsing HTML, you're simply replacing a string with another string.
An alternative to regex would be to work with ranges, creating a range selecting the text and then using execCommand to make the text bold. However, I think this would be much more difficult and you would likely face differences in browser implementations. The regex way should be enough.
After seeing your comment, the following regex would work fine:
.replace(/<(p|br)>AD:/gm, "<$1><strong>AD:</strong>");
Wouldn't it be easier to set the class or style property of found paragraph to text-weight: bold or a class that does roughly the same? That way you wouldn't have to worry about adding in tags, or searching for existing tags. Might perform better, too, if you don't have to do any string replaces.
If you really want to add the strong tags anyway, I'd suggest using DOM functions to find childNodes of your paragraph that are <strong>, and if you don't find one, add it and move the original (text) childNode of the paragraph into it.
Using regular expressions on the innerHTML isn't reliable and will potentially lead to problems. The correct way to do this is a tiresome process but is much more reliable.
E.g.
for (var i = 0, l = posts.length; i < l; i++) {
findAndReplaceInDOM(posts[i], /^AD:/g, function(match, node){
// Make sure current node does note have a <strong> as a parent
if (node.parentNode.nodeName.toLowerCase() === 'strong') {
return false;
}
// Create and return new <strong>
var s = document.createElement('strong');
s.appendChild(document.createTextNode(match[0]));
return s;
});
}
And the findAndReplaceInDOM function:
function findAndReplaceInDOM(node, regex, replaceFn) {
// Note: regex MUST have global flag
if (!regex || !regex.global || typeof replaceFn !== 'function') {
return;
}
var start, end, match, parent, leftNode,
rightNode, replacementNode, text,
d = document;
// Loop through all childNodes of "node"
if (node = node && node.firstChild) do {
if (node.nodeType === 1) {
// Regular element, recurse:
findAndReplaceInDOM(node, regex, replaceFn);
} else if (node.nodeType === 3) {
// Text node, introspect
parent = node.parentNode;
text = node.data;
regex.lastIndex = 0;
while (match = regex.exec(text)) {
replacementNode = replaceFn(match, node);
if (!replacementNode) {
continue;
}
end = regex.lastIndex;
start = end - match[0].length;
// Effectively split node up into three parts:
// leftSideOfReplacement + REPLACEMENT + rightSideOfReplacement
leftNode = d.createTextNode( text.substring(0, start) );
rightNode = d.createTextNode( text.substring(end) );
parent.insertBefore(leftNode, node);
parent.insertBefore(replacementNode, node);
parent.insertBefore(rightNode, node);
// Remove original node from document
parent.removeChild(node);
}
}
} while (node = node.nextSibling);
}

How do I use Javascript to modify the content of a node?

I need to use Javascript to do three things:
Select all nodes with a class of "foo".
Find all words inside these nodes that begin with "*".
Surround those words with <span class="xyz"> ... </span>, where xyz is the word itself.
For example, the content:
<ul>
<li class="foo">
*abc def *ghi
</li>
<li class="bar">
abc *def *ghi
</li>
</ul>
would become
<ul>
<li class="foo">
<span class="abc">*abc</span> def <span class="ghi">*ghi</span>
</li>
<li class="bar">
abc *def *ghi <!-- Not part of a node with class "foo", so
</li> no changes made. -->
</ul>
How might I do this? (P.S. Solutions involving jQuery work too, but other than that I'd prefer not include any additional dependencies.)
No jQuery required:
UE_replacer = function (node) {
// just for performance, skip attribute and
// comment nodes (types 2 and 8, respectively)
if (node.nodeType == 2) return;
if (node.nodeType == 8) return;
// for text nodes (type 3), wrap words of the
// form *xyzzy with a span that has class xyzzy
if (node.nodeType == 3) {
// in the actual text, the nodeValue, change
// all strings ('g'=global) that start and end
// on a word boundary ('\b') where the first
// character is '*' and is followed by one or
// more ('+'=one or more) 'word' characters
// ('\w'=word character). save all the word
// characters (that's what parens do) so that
// they can be used in the replacement string
// ('$1'=re-use saved characters).
var text = node.nodeValue.replace(
/\b\*(\w+)\b/g,
'<span class="$1">*$1</span>' // <== Wrong!
);
// set the new text back into the nodeValue
node.nodeValue = text;
return;
}
// for all other node types, call this function
// recursively on all its child nodes
for (var i=0; i<node.childNodes.length; ++i) {
UE_replacer( node.childNodes[i] );
}
}
// start the replacement on 'document', which is
// the root node
UE_replacer( document );
Updated: To contrast the direction of strager's answer, I got rid of my botched jQuery and kept the regular expression as simple as possible. This 'raw' javascript approach turns out to be much easier than I expected.
Although jQuery is clearly good for manipulating DOM structure, it's actually not easy to figure out how to manipulate text elements.
Don't try to process the innerHTML/html() of an element. This will never work because regex is not powerful enough to parse HTML. Just walk over the Text nodes looking for what you want:
// Replace words in text content, recursively walking element children.
//
function wrapWordsInDescendants(element, tagName, className) {
for (var i= element.childNodes.length; i-->0;) {
var child= element.childNodes[i];
if (child.nodeType==1) // Node.ELEMENT_NODE
wrapWordsInDescendants(child, tagName, className);
else if (child.nodeType==3) // Node.TEXT_NODE
wrapWordsInText(child, tagName, className);
}
}
// Replace words in a single text node
//
function wrapWordsInText(node, tagName, className) {
// Get list of *word indexes
//
var ixs= [];
var match;
while (match= starword.exec(node.data))
ixs.push([match.index, match.index+match[0].length]);
// Wrap each in the given element
//
for (var i= ixs.length; i-->0;) {
var element= document.createElement(tagName);
element.className= className;
node.splitText(ixs[i][1]);
element.appendChild(node.splitText(ixs[i][0]));
node.parentNode.insertBefore(element, node.nextSibling);
}
}
var starword= /(^|\W)\*\w+\b/g;
// Process all elements with class 'foo'
//
$('.foo').each(function() {
wrapWordsInDescendants(this, 'span', 'xyz');
});
// If you're not using jQuery, you'll need the below bits instead of $...
// Fix missing indexOf method on IE
//
if (![].indexOf) Array.prototype.indexOf= function(item) {
for (var i= 0; i<this.length; i++)
if (this[i]==item)
return i;
return -1;
}
// Iterating over '*' (all elements) is not fast; if possible, reduce to
// all elements called 'li', or all element inside a certain element etc.
//
var elements= document.getElementsByTagName('*');
for (var i= elements.length; i-->0;)
if (elements[i].className.split(' ').indexOf('foo')!=-1)
wrapWordsInDescendants(elements[i], 'span', 'xyz');
The regexp would look something like this (sed-ish syntax):
s/\*\(\w+\)\b\(?![^<]*>\)/<span class="\1">*\1</span>/g
Thus:
$('li.foo').each(function() {
var html = $(this).html();
html = html.replace(/\*(\w+)\b(?![^<]*>)/g, "<span class=\"$1\">*$1</span>");
$(this).html(html);
});
The \*(\w+)\b segment is the important piece. It finds an asterisk followed by one or more word characters followed by some sort of word termination (e.g. end of line, or a space). The word is captured into $1, which is then used as the text and the class of the output.
The part just after that ((?![^<]*>)) is a negative lookahead. It asserts that a closing angle bracket does not follow, unless there is an opening angle bracket before it. This prevents a match where the string is inside an HTML tag. This doesn't handle malformed HTML, but that shouldn't be the case anyway.

Categories

Resources