Javascript regular expression that ignores a substring - javascript

Background:
I found similar S.O. posts on this topic, but I failed to make it work for my scenario. Apologies in advance if this is a dupe.
My Intent:
Take every English word in a string, and convert it to a html hyperlink. This logic needs to ignore only the following markup: <br/>, <b>, </b>
Here's what I have so far. It converts English words to hyperlinks as I expect, but has no ignore logic for html tags (that's where I need your help):
text = text.replace(/\b([A-Z\-a-z]+)\b/g, "$1");
Example Input / Output:
Sample Input:
this <b>is</b> a test
Expected Output:
this <b>is</b> a test
Thank you.

Issues with regexing HTML aside, the way I'd do this is in two steps:
First and foremost, one way or another, extract the texts outside the tags
Then only do this transform to these texts, and leave everything else untouched
Related questions
Regex replace string but not inside html tag
RegEx: Matching a especific string that is not inside in HTML tag
regex - match not in tag
RegEx to ignore / skip everything in html tags
Text Extraction from HTML Java

Here's a hybrid solution that gives you the performance gain of innerHTML and the luxury of not having to mess with HTML strings when looking for the matches:
/**
 * Recursively applies a regex replacement to every text node beneath `node`.
 *
 * A text node's data is run through String.prototype.replace, the result is
 * parsed as HTML via a scratch <div>, and the parsed nodes take the place of
 * the original text node. Element nodes are descended into; other node types
 * are left alone.
 *
 * NOTE(review): the text data is assigned to innerHTML as-is, so characters
 * such as "<" or "&" inside the text are interpreted as markup.
 *
 * @param {Node} node - Text or element node to process.
 * @param {RegExp} regex - Pattern handed to String.prototype.replace.
 * @param {string|Function} replacement - Replacement (HTML) handed to replace.
 */
function findMatchAndReplace(node, regex, replacement) {
    var scratch = document.createElement('div');

    if (node.nodeType === 3) {
        var container = node.parentNode;
        scratch.innerHTML = node.data.replace(regex, replacement);
        // Move every parsed node in front of the original text node, in order.
        for (var piece = scratch.firstChild; piece; piece = scratch.firstChild) {
            container.insertBefore(piece, node);
        }
        container.removeChild(node);
        return;
    }

    if (node.nodeType !== 1) {
        return;
    }

    var child = node.firstChild;
    while (child) {
        // Remember the sibling up front: processing a text node removes it.
        var following = child.nextSibling;
        findMatchAndReplace(child, regex, replacement);
        child = following;
    }
}
Input:
<div id="foo">
this <b>is</b> a test
</div>
Process:
findMatchAndReplace(
document.getElementById('foo'),
/\b\w+\b/g,
'$&'
);
Output (whitespace added for clarity):
<div id="foo">
this
<b>is</b>
a
test
</div>

Here's another JavaScript method.
/**
 * Applies the word replacement to the text of a string of well-formed markup,
 * copying everything between "<" and ">" through untouched.
 *
 * Splitting on "<" or ">" always leaves text at the even indices and tag
 * contents at the odd indices. This holds even when the string begins with a
 * tag, because split() then emits a leading empty string, so no
 * "starts with a tag" correction is needed (applying one inverts the parity
 * and mangles the output).
 *
 * @param {string} SourceStr - Markup whose "<" and ">" only delimit tags.
 * @returns {string} The markup with the replacement applied outside of tags.
 */
function ReplaceWordsOutsideTags (SourceStr)
{
    var SplitAtTags = SourceStr.split (/[<>]/);
    var ResultStr   = '';

    for (var J = 0;  J < SplitAtTags.length;  J++)
    {
        var bWeAreInsideTag = (J % 2) === 1;
        if (bWeAreInsideTag)
        {
            //-- Restore the angle brackets that split() consumed.
            ResultStr += '<' + SplitAtTags[J] + '>';
        }
        else
        {
            //-- '$1' re-emits each matched word; put the desired wrapper markup around it.
            ResultStr += SplitAtTags[J].replace (/([a-z']+)/gi, '$1');
        }
    }
    return ResultStr;
}

var StrWith_WELL_FORMED_TAGS = "This <b>is</b> a test, <br> Mr. O'Leary! <!-- What about comments? -->";
var OutputStr = ReplaceWordsOutsideTags (StrWith_WELL_FORMED_TAGS);
//-- Replace "console.log" with "alert" if not using Firebug.
console.log (OutputStr);

Related

How to find string in between special characters in a string and replace it as superscript

I have a title in the following format - test ^TM^ title test. I want to convert this text in such a way that the word enclosed between '^' as superscript without changing its position, as shown below.
testTMtitle test
Please help me.
A regex can do this for you, see the example below.
// Source text in which a pair of "^" characters marks a superscript.
const input = 'test ^TM^ title test';
// Lazily captures whatever sits between two "^" characters (group 1).
const regex = /\^(.*?)\^/g;
// Swap each caret-delimited match for its captured text wrapped in a sup tag.
const htmlString = input.replace(regex, `<sup>$1</sup>`);
console.log(htmlString);
document.getElementById('output').innerHTML = htmlString;
<div id="output"></div>
Using javascript:
// Strings are immutable: replace() returns a new string, so the result has to
// be assigned back or it is lost. The <sup> wrapper is written out literally
// rather than through the deprecated String.prototype.sup().
let text = "test ^TM^ title test";
text = text.replace(/\^(.+?)\^/g, '<sup>$1</sup>');
I think you need to open your files in any HTML or PHP editor and use the find-and-replace option.
For example: find ^TM^
Replace with: TM
// Walks the string once, turning each "^" alternately into an opening and a
// closing <sup> tag and copying every other character through unchanged.
// All variables are declared explicitly so the snippet also runs in strict
// mode / ES modules, where assigning to an undeclared name throws.
const string = "hello^TM^ hi";
let isOdd = true; // true when the next caret opens a superscript
let newString = "";
for (let i = 0; i < string.length; i++) {
    if (string[i] === '^') {
        newString += isOdd ? "<sup>" : "</sup>";
        isOdd = !isOdd;
    } else {
        newString += string[i];
    }
}
//newString will have "hello<sup>TM</sup> hi"

Issue with replacing color of word occurrence

I am trying to replace the font color of "bad" throughout an HTML page to red but there are two main issues, firstly it repeats the entire sentence before replacement and I do not want that and also the bad in "badly" was also replaced but I just want only the font in the word "bad" replaced. How do I go about this? Here is my code
// On page load: read the HTML of #content, wrap occurrences of "bad" in a red
// <span>, and write the result into #updated (#content itself is not modified,
// so the page ends up showing both the original and the updated sentence).
window.onload = function() {
var text = document.getElementById("content");
var str = text.innerHTML,
reg = /red/ig;
// The regex is stringified and then edited as text to derive `colors`.
var toStr = String(reg);
var color = (toStr.replace('\/g', '|')).substring(1);
// NOTE(review): replace() is called with a search value only (no replacement argument).
var colors = color.replace("|");
if (colors.indexOf("red") > -1) {
// /bad/g has no word-boundary anchors, so it also matches the "bad" inside
// longer words such as "badly".
str = str.replace(/bad/g, '<span style="color:red;">bad</span>');
}
document.getElementById("updated").innerHTML = str;
}
<div id="content">are they good or bad, tell me how badly it is if bad.</div>
<div id="updated"></div>
use \b (a word boundary token) as said by raphael75
str = str.replace(/\b(bad)\b/gi, '<span style="color:red;">$1</span>');
also, I:
added the i modifier, for it to be case insensitive (capture Bad, BAD, bad)
encapsulated (bad) in the regEx to call again with $1, this keeps the case (if capitalized Bad, stays Bad)
var str = "bad incredibad badly Bad, testing bad, bad. bad";
str = str.replace(/\b(bad)\b/gi, '<span style="color:red;">$1</span>');
document.getElementsByTagName('p')[0].innerHTML = str;
<p></p>
regex101 is a go-to place for JavaScript RegEx
To replace text in HTML documents, it is a good idea to traverse the DOM tree and replace the content of text nodes where needed.
To match complete words only ("bad" but not "badly"), use regex with word boundary anchors \b.
The following implementation performs depth-first DOM tree traversal and replaces text nodes containing the word "bad" with text nodes for both the text before and after that word as well as a <span style="color: red">bad</span> in place of "bad":
/**
 * Depth-first, post-order traversal: visits the children of `node` from last
 * to first (recursively) and only then calls `func` on `node` itself.
 *
 * @param {Node} node - Root of the subtree to traverse.
 * @param {function(Node): void} func - Callback invoked once per node.
 */
var walk = (node, func) => {
    var kids = node.childNodes;
    var index = kids.length;
    // Going backwards keeps the remaining indices valid even if `func`
    // replaces a visited child with several nodes.
    while (index-- > 0) {
        walk(kids[index], func);
    }
    func(node);
};
/**
 * Builds a <span> containing `text` as a text node, with its inline
 * `color` style set to `color`.
 *
 * @param {string} text - Text placed inside the span.
 * @param {string} color - Value assigned to span.style.color.
 * @returns {HTMLSpanElement} The newly created, detached span.
 */
var highlight = (text, color) => {
    var wrapper = document.createElement("span");
    var label = document.createTextNode(text);
    wrapper.style.color = color;
    wrapper.appendChild(label);
    return wrapper;
};
// Replace every text node under #content that contains the whole word "bad"
// with a fragment made of the surrounding text pieces and a red <span> in
// place of each occurrence.
var root = document.getElementById("content");
walk(root, (node) => {
    if (node.nodeType !== Node.TEXT_NODE) return;

    // Splitting on the word yields the text before, between and after matches.
    var pieces = node.data.split(/\bbad\b/);
    if (pieces.length <= 1) return;

    var fragment = document.createDocumentFragment();
    pieces.forEach((piece, position) => {
        // A highlighted "bad" sits between each pair of consecutive pieces.
        if (position > 0) fragment.appendChild(highlight("bad", "red"));
        fragment.appendChild(document.createTextNode(piece));
    });
    node.parentNode.replaceChild(fragment, node);
});
<div id="content">are they good or bad, tell me how badly it is <bad>bad bad bad</bad> if bad.</div>
See also: JavaScript: Add elements in textNode

Split a string of HTML into an array by particular tags

Given this HTML as a string "html", how can I split it into an array where each header <h marks the start of an element?
Begin with this:
<h1>A</h1>
<h2>B</h2>
<p>Foobar</p>
<h3>C</h3>
Result:
["<h1>A</h1>", "<h2>B</h2><p>Foobar</p>", "<h3>C</h3>"]
What I've tried:
I wanted to use Array.split() with a regex, but the result splits each <h into its own element. I need to figure out how to capture from the start of one <h until the next <h. Then include the first one but exclude the second one.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
var foo = html.split(/(<h)/);
Edit: Regex is not a requirement in any way; it's just the only solution that I thought would work for generally splitting HTML strings in this way.
In your example you can use:
/
<h // Match literal <h
(.) // Match any character and save in a group
> // Match literal >
.*? // Match any character zero or more times, non greedy
<\/h // Match literal </h
\1 // Match what previous grouped in (.)
> // Match literal >
/g
var str = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>'
str.match(/<h(.)>.*?<\/h\1>/g); // ["<h1>A</h1>", "<h2>B</h2>", "<h3>C</h3>"]
But please don't parse HTML with regexp, read RegEx match open tags except XHTML self-contained tags
From the comments to the question, this seems to be the task:
I'm taking dynamic markdown that I'm scraping from GitHub. Then I want to render it to HTML, but wrap every title element in a ReactJS <WayPoint> component.
The following is a completely library-agnostic, DOM-API based solution.
/**
 * Groups the top-level nodes of an HTML string into <waypoint> elements.
 * A new <waypoint> is started at every h1-h6 element, and also for the very
 * first node; every other node is appended to the most recent <waypoint>.
 *
 * @param {string} html - HTML to parse and regroup.
 * @returns {string} The regrouped markup, serialized via innerHTML.
 */
function waypointify(html) {
    var container = document.createElement("div");

    // Parse the HTML, then snapshot the children into a real array
    // (a NodeList would change underneath us once the div is emptied).
    container.innerHTML = html;
    var parsed = Array.prototype.slice.call(container.childNodes);

    // Start from an empty container and distribute the nodes by heading.
    container.innerHTML = "";
    var headingName = /^h[1-6]$/i;
    for (var k = 0; k < parsed.length; k++) {
        var startsGroup = !container.lastChild || headingName.test(parsed[k].nodeName);
        if (startsGroup) {
            container.appendChild(document.createElement("waypoint"));
        }
        container.lastChild.appendChild(parsed[k]);
    }
    return container.innerHTML;
}
Doing the same in a modern library with less lines of code is absolutely possible, see it as a challenge.
This is what it produces with your sample input:
<waypoint><h1>A</h1></waypoint>
<waypoint><h2>B</h2><p>Foobar</p></waypoint>
<waypoint><h3>C</h3></waypoint>
I'm sure someone could reduce the for loop to put the angle brackets back in but this is how I'd do it.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
// Splitting on "><" consumes those two characters, so each piece gets its
// missing leading "<" and/or trailing ">" put back.
var arr = html.split(/></g).map(function (part) {
    var opened = part.charAt(0) === '<' ? part : '<' + part;
    return opened.slice(-1) === '>' ? opened : opened + '>';
});
Additionally, we could actually remove the first and last bracket, do the split and then replace the angle brackets to the whole thing.
var html = '<h1>A</h1><h2>B</h2><p>Foobar</p><h3>C</h3>';
// Drop the outermost "<" and ">" so that every piece produced by the split
// is missing exactly one bracket on each side.
html = html.slice(1, -1);
// Split on "><" and wrap each piece in the brackets the split consumed.
var arr = [];
html.split(/></g).forEach(function (inner) {
    arr.push('<' + inner + '>');
});
Oh, of course this will fail with elements that have no content.
Hi I used this function to convert html String Dom in array
static getArrayTagsHtmlString(str){
let htmlSplit = str.split(">")
let arrayElements = []
let nodeElement =""
htmlSplit.forEach((element)=>{
if (element.includes("<")) {
nodeElement = element+">"
}else{
nodeElement = element
}
arrayElements.push(nodeElement)
})
return arrayElements
}
Happy code

Match all whitespace in an HTML string with JavaScript

Let's say you have an HTML string like this:
<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>
And need to place <br/> elements after every space character.... which I was doing with:
HTMLtext.replace(/\s{1,}/g, ' <br/>');
However, the problem is that this inserts breaks after space characters in-between tags (between tag properties) too and I'd of course like to do this for tag textual contents only. Somehow I was always really bad with regular expressions - could anyone help out?
So basically do my original whitespace match but only if its not between < and > ?
Regex is not a good tool for this. You should be working with the DOM, not with the raw HTML string.
For a quick-and-dirty solution that presupposes that there are no < or > character in your string except those delimiting a tag, you can try this, though:
result = subject.replace(/\s+(?=[^<>]*<)/g, "$&<br/>");
This inserts a <br/> after whitespace only if the next angle bracket is an opening angle bracket.
Explanation:
\s+ # Match one or more whitespace characters (including newlines!)
(?= # but only if (positive lookahead assertion) it's possible to match...
[^<>]* # any number of non-angle brackets
< # followed by an opening angle bracket
) # ...from this position in the string onwards.
Replace that with $& (which contains the matched characters) plus <br/>.
This regex does not check if there is a > further behind, as this would require a positive lookbehind assertion, which JavaScript did not support when this was written (lookbehind assertions were only added in ES2018). So the regex can't check for that, but if you control the HTML and are sure that the conditions I mentioned above are met, that shouldn't be a problem.
See this answer for iterating the dom and replacing whitespaces with <br /> elements. The adapted code would be:
// Recursively walks the DOM below `content`, splitting every text node at each
// whitespace character and inserting a <br> element at each split point.
(function iterate_node(node) {
    if (node.nodeType === 1) { // Node.ELEMENT_NODE
        // Walk the children backwards so nodes inserted while processing a
        // child never shift the indices still to be visited.
        var index = node.childNodes.length;
        while (index-- > 0) {
            iterate_node(node.childNodes[index]); // run recursive on DOM
        }
        return;
    }
    if (node.nodeType !== 3) { // only Node.TEXT_NODE is handled below
        return;
    }

    var parts = node.data.split(/\s/);
    if (parts.length < 2) {
        return;
    }

    // The original node keeps the first word; each remaining word becomes a
    // new text node preceded by a <br>, inserted before the old next sibling.
    node.data = parts[0];
    var anchor = node.nextSibling;
    var host = node.parentNode;
    parts.slice(1).forEach(function (word) {
        var wordNode = document.createTextNode(word);
        var lineBreak = document.createElement("br");
        host.insertBefore(lineBreak, anchor);
        host.insertBefore(wordNode, anchor);
    });
})(content); // any dom node
(Demo at jsfiddle.net)
Okay, so you don't want to match spaces inside HTML tags. Regular expressions alone aren't sufficient for this. I'll use a lexer to do the job. You can see the output here.
// Builds `result` from the input string: characters inside a tag are copied
// verbatim, while runs of whitespace outside a tag are replaced by "<br/>".
// NOTE(review): `Lexer` is not defined in this snippet — presumably an external
// lexer library in which addRule(pattern, action, states) limits a rule to the
// listed states and rules without a state list apply in the initial state;
// confirm against that library's documentation.
var lexer = new Lexer;
var result = "";
lexer.addRule(/</, function (c) { // start of a tag
this.state = 2; // go to state 2 - exclusive tag state
result += c; // copy to output
});
lexer.addRule(/>/, function (c) { // end of a tag
this.state = 0; // go back to state 0 - initial state
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/\s+/, function () { // match one or more spaces
result += "<br/>"; // replace with "<br/>"
});
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}); // everything else
lexer.input = '<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>';
lexer.lex();
Of course, a lexer is a very powerful tool. You may also skip angled brackets inside the value of an attribute in a tag. However I'll leave that for you to implement. Good luck.

Help write regex that will surround certain text with <strong> tags, only if the <strong> tag isn't present

I have several posts on a website; all these posts are chat conversations of this type:
AD: Hey!
BC: What's up?
AD: Nothing
BC: Okay
They're marked up as simple paragraphs surrounded by <p> tags.
Using the javascript replace function, I want all instances of "AD" in the beginning of a conversation (ie, all instances of "AD" at the starting of a line followed by a ":") to be surrounded by <strong> tags, but only if the instance isn't already surrounded by a <strong> tag.
What regex should I use to accomplish this? Am I trying to do what this advises against?
The code I'm using is like this:
var posts = document.getElementsByClassName('entry-content');
for (var i = 0; i < posts.length; i++) {
posts[i].innerHTML = posts[i].innerHTML.replace(/some regex here/,
'replaced content here');
}
If AD: is always at the start of a line then the following regex should work, using the m switch:
.replace(/^AD:/gm, "<strong>AD:</strong>");
You don't need to check for the existence of <strong> because ^ will match the start of the line and the regex will only match if the sequence of characters that follows the start of the line are AD:.
You're not going against the "Don't use regex to parse HTML" advice because you're not parsing HTML, you're simply replacing a string with another string.
An alternative to regex would be to work with ranges, creating a range selecting the text and then using execCommand to make the text bold. However, I think this would be much more difficult and you would likely face differences in browser implementations. The regex way should be enough.
After seeing your comment, the following regex would work fine:
.replace(/<(p|br)>AD:/gm, "<$1><strong>AD:</strong>");
Wouldn't it be easier to set the class or style property of the found paragraph to font-weight: bold or a class that does roughly the same? That way you wouldn't have to worry about adding in tags, or searching for existing tags. Might perform better, too, if you don't have to do any string replaces.
If you really want to add the strong tags anyway, I'd suggest using DOM functions to find childNodes of your paragraph that are <strong>, and if you don't find one, add it and move the original (text) childNode of the paragraph into it.
Using regular expressions on the innerHTML isn't reliable and will potentially lead to problems. The correct way to do this is a tiresome process but is much more reliable.
E.g.
// For every element in `posts`, wrap an "AD:" found at the very start of a text
// node in a <strong> element, unless that text node already sits directly
// inside a <strong>.
for (var i = 0, l = posts.length; i < l; i++) {
findAndReplaceInDOM(posts[i], /^AD:/g, function(match, node){
// Make sure the current node does not have a <strong> as its parent;
// returning false leaves this match unreplaced
if (node.parentNode.nodeName.toLowerCase() === 'strong') {
return false;
}
// Create and return new <strong> containing the matched text
var s = document.createElement('strong');
s.appendChild(document.createTextNode(match[0]));
return s;
});
}
And the findAndReplaceInDOM function:
/**
 * Walks all descendants of `node` and, in every text node, replaces each
 * match of `regex` with the DOM node returned by `replaceFn`.
 *
 * Newly inserted replacement nodes are not searched again.
 *
 * @param {Node} node - Root whose child nodes are searched (may be null).
 * @param {RegExp} regex - Pattern to look for; MUST have the global flag,
 *     otherwise the call is a no-op.
 * @param {function(Array, Text): (Node|false|null|undefined)} replaceFn -
 *     Called with the exec() match and the text node being searched. Returns
 *     the node to insert in place of the match, or a falsy value to leave
 *     that match untouched.
 */
function findAndReplaceInDOM(node, regex, replaceFn) {
    // Note: regex MUST have global flag
    if (!regex || !regex.global || typeof replaceFn !== 'function') {
        return;
    }
    var child = node && node.firstChild;
    var next;
    // Loop through all childNodes of "node"
    while (child) {
        // Grab the sibling first: a text node is detached once it has been
        // replaced, after which its nextSibling would be null.
        next = child.nextSibling;
        if (child.nodeType === 1) {
            // Regular element, recurse:
            findAndReplaceInDOM(child, regex, replaceFn);
        } else if (child.nodeType === 3) {
            // Text node, introspect
            replaceMatchesInTextNode(child, regex, replaceFn);
        }
        child = next;
    }
}

/**
 * Replaces the accepted matches inside one text node. The node stays attached
 * (as the insertion reference point) until every match has been handled, then
 * it is removed; if no match is accepted the node is left untouched.
 */
function replaceMatchesInTextNode(textNode, regex, replaceFn) {
    var parent = textNode.parentNode,
        text = textNode.data,
        d = document,
        cursor = 0, // index just past the last replaced match
        replaced = false,
        match, start, replacementNode;

    regex.lastIndex = 0;
    while ((match = regex.exec(text))) {
        start = match.index;
        if (regex.lastIndex === start) {
            // Zero-length match: step forward or exec() would loop forever.
            regex.lastIndex++;
        }
        replacementNode = replaceFn(match, textNode);
        if (!replacementNode) {
            continue;
        }
        // Emit the untouched text between the previous replacement and this
        // match, then the replacement itself.
        if (start > cursor) {
            parent.insertBefore(d.createTextNode(text.substring(cursor, start)), textNode);
        }
        parent.insertBefore(replacementNode, textNode);
        cursor = start + match[0].length;
        replaced = true;
    }

    if (!replaced) {
        return;
    }
    // Emit whatever follows the last replacement, then drop the original node.
    if (cursor < text.length) {
        parent.insertBefore(d.createTextNode(text.substring(cursor)), textNode);
    }
    parent.removeChild(textNode);
}

Categories

Resources