I need to use Javascript to do three things:
Select all nodes with a class of "foo".
Find all words inside these nodes that begin with "*".
Surround those words with <span class="xyz"> ... </span>, where xyz is the word itself.
For example, the content:
<ul>
<li class="foo">
*abc def *ghi
</li>
<li class="bar">
abc *def *ghi
</li>
</ul>
would become
<ul>
<li class="foo">
<span class="abc">*abc</span> def <span class="ghi">*ghi</span>
</li>
<li class="bar">
abc *def *ghi <!-- Not part of a node with class "foo", so
</li> no changes made. -->
</ul>
How might I do this? (P.S. Solutions involving jQuery work too, but other than that I'd prefer not include any additional dependencies.)
No jQuery required:
// Wrap every word of the form *xyzzy found in the text nodes below
// `node` in <span class="xyzzy">*xyzzy</span>.
//
// node: any DOM node; its whole subtree is processed.
// Returns nothing; the DOM is modified in place.
var UE_replacer = function (node) {
    // just for performance, skip attribute and
    // comment nodes (types 2 and 8, respectively)
    if (node.nodeType == 2 || node.nodeType == 8) return;

    // for text nodes (type 3), wrap words of the
    // form *xyzzy with a span that has class xyzzy
    if (node.nodeType == 3) {
        // a detached text node has no parent to insert the spans into
        if (!node.parentNode) return;

        // match an asterisk that sits at the start of the text or
        // right after a non-word character, followed by one or more
        // word characters. (A leading '\b' would demand a word
        // character *before* the asterisk, which is the opposite of
        // "a word that begins with '*'".)
        var starWord = /(^|\W)\*(\w+)/g;
        var found = [];
        var m;
        while ((m = starWord.exec(node.nodeValue)) !== null) {
            // skip the separator captured by group 1 so that only
            // '*word' itself ends up inside the span
            found.push([m.index + m[1].length, m.index + m[0].length, m[2]]);
        }

        // Markup cannot be written through nodeValue (it would show up
        // as literal text), so split the text node around each match
        // and move the matched piece into a real <span> element.
        // Working from the last match to the first keeps the earlier
        // offsets valid.
        for (var k = found.length - 1; k >= 0; --k) {
            var span = document.createElement('span');
            span.className = found[k][2];
            node.splitText(found[k][1]);
            span.appendChild(node.splitText(found[k][0]));
            node.parentNode.insertBefore(span, node.nextSibling);
        }
        return;
    }

    // for all other node types, call this function recursively on
    // all its child nodes. Walk backwards: processing a text node
    // inserts new siblings after it, and a forward walk would visit
    // (and re-wrap) the freshly created spans.
    for (var i = node.childNodes.length - 1; i >= 0; --i) {
        UE_replacer( node.childNodes[i] );
    }
};
// start the replacement on 'document', which is
// the root node; this walks the whole page, not only
// the elements that carry the class "foo"
UE_replacer( document );
Updated: To contrast the direction of strager's answer, I got rid of my botched jQuery and kept the regular expression as simple as possible. This 'raw' javascript approach turns out to be much easier than I expected.
Although jQuery is clearly good for manipulating DOM structure, it's actually not easy to figure out how to manipulate text elements.
Don't try to process the innerHTML/html() of an element. This will never work because regex is not powerful enough to parse HTML. Just walk over the Text nodes looking for what you want:
// Visit the children of `element` from last to first: child elements are
// descended into, text nodes are handed to wrapWordsInText. Other node
// types are ignored. tagName and className are passed through unchanged.
//
function wrapWordsInDescendants(element, tagName, className) {
    var kids= element.childNodes;
    var pos= kids.length;
    while (pos-->0) {
        var current= kids[pos];
        if (current.nodeType==1) { // Node.ELEMENT_NODE
            wrapWordsInDescendants(current, tagName, className);
            continue;
        }
        if (current.nodeType==3) { // Node.TEXT_NODE
            wrapWordsInText(current, tagName, className);
        }
    }
}
// Replace words in a single text node: every *word in `node` is moved
// into a new <tagName> element inserted at the same position.
//
// node:      the text node to process (must be attached to a parent)
// tagName:   tag name of the wrapper element, e.g. 'span'
// className: class given to each wrapper; when omitted (null/undefined)
//            the class is the matched word itself, without the asterisk
//
function wrapWordsInText(node, tagName, className) {
    // Get list of *word ranges as [start, end, word]
    //
    var ranges= [];
    var match;
    starword.lastIndex= 0; // the regexp is shared and stateful ('g' flag)
    while (match= starword.exec(node.data)) {
        // match[1] is the separator in front of the asterisk (or '' at
        // the start of the text); keep it outside the wrapper
        var start= match.index+match[1].length;
        ranges.push([start, match.index+match[0].length, match[2]]);
    }

    // Wrap each in the given element, last match first so that the
    // offsets of the earlier matches stay valid while the node is split
    //
    for (var i= ranges.length; i-->0;) {
        var element= document.createElement(tagName);
        element.className= className==null? ranges[i][2] : className;
        node.splitText(ranges[i][1]);
        element.appendChild(node.splitText(ranges[i][0]));
        node.parentNode.insertBefore(element, node.nextSibling);
    }
}

// An asterisk at the start of the text or after a non-word character,
// followed by the word (captured in group 2)
var starword= /(^|\W)\*(\w+)\b/g;
// Process all elements with class 'foo' (selected through jQuery).
// Each wrapper is a <span>; 'xyz' is passed as its class name.
//
$('.foo').each(function() {
wrapWordsInDescendants(this, 'span', 'xyz');
});
// If you're not using jQuery, you'll need the below bits instead of $...
// Fix missing indexOf method on IE
//
// Fallback used only when the engine has no native Array.prototype.indexOf.
// It mirrors the native behaviour: strict (===) comparison, an optional
// start position that may be negative (counted from the end), and holes
// in sparse arrays are skipped. Returns the first matching index or -1.
//
function arrayIndexOfFallback(item, fromIndex) {
    var length= this.length >>> 0;
    var start= Number(fromIndex) || 0; // undefined/NaN -> 0
    start= start<0? Math.ceil(start) : Math.floor(start);
    if (start<0) start= Math.max(length+start, 0);
    for (var i= start; i<length; i++)
        if (i in this && this[i]===item)
            return i;
    return -1;
}
if (![].indexOf) Array.prototype.indexOf= arrayIndexOfFallback;
// Iterating over '*' (all elements) is not fast; if possible, reduce to
// all elements called 'li', or all element inside a certain element etc.
//
// The collection is walked from the end so that elements inserted while
// wrapping (which land after the current index) are not visited.
//
var elements= document.getElementsByTagName('*');
for (var i= elements.length; i-->0;) {
    // className is not a string on every element (SVG elements expose an
    // object instead), and class names may be separated by any whitespace
    var classAttr= elements[i].className;
    if (typeof classAttr!='string') continue;
    if (classAttr.split(/\s+/).indexOf('foo')!=-1)
        wrapWordsInDescendants(elements[i], 'span', 'xyz');
}
The regexp would look something like this (sed-ish syntax):
s/\*\(\w+\)\b\(?![^<]*>\)/<span class="\1">*\1</span>/g
Thus:
// For every li.foo: read its markup as a string, wrap each *word in
// <span class="word">*word</span>, and write the markup back.
// The lookahead (?![^<]*>) rejects a match when the next angle bracket
// after it is a closing '>', i.e. when the match sits inside a tag.
$('li.foo').each(function() {
var html = $(this).html();
html = html.replace(/\*(\w+)\b(?![^<]*>)/g, "<span class=\"$1\">*$1</span>");
$(this).html(html);
});
The \*(\w+)\b segment is the important piece. It finds an asterisk followed by one or more word characters followed by some sort of word termination (e.g. end of line, or a space). The word is captured into $1, which is then used as the text and the class of the output.
The part just after that ((?![^<]*>)) is a negative lookahead. It asserts that a closing angle bracket does not follow, unless there is an opening angle bracket before it. This prevents a match where the string is inside an HTML tag. This doesn't handle malformed HTML, but that shouldn't be the case anyway.
Related
I am writing a Chrome extension which adds a <span> ... </span> around every string that matches a certain regular expression. The RegEx match works perfectly, but I cannot seem to find a way to correctly add the span tag around the text.
My code thus far is:
// main.js
// The built-in constructor is RegExp; `RegEx` is not defined and would throw.
var regex_pattern = new RegExp('(apple)', 'g'); // Let's pretend I want to match every instance of 'apple'
var textNodes = getTextNodes(); // A function that returns a list of every text node from the DOM
for (var i = 0; i < textNodes.length; i++) {
if (textNodes[i].nodeValue.match(regex_pattern)) {
// NOTE: nodeValue holds plain text, so the <span> markup written here is
// displayed literally instead of being parsed as HTML.
textNodes[i].nodeValue = textNodes[i].nodeValue.replace(regex_pattern, "<span class='highlight'>$&</span>");
}
}
This will correctly identify every match of my RegEx pattern (in this case 'apple') and output <span class="highlight">apple</span>. The only problem is that this is not treated as HTML by Chrome; it's treated as text - so instead of seeing the word 'apple' styled according to the highlight class, one would see the literal output: <span class="highlight">apple</span>
Why does this happen, and how can I fix it so that the style is correctly applied? Realizing that this was less than desirable, I tried using the insertBefore() method to wrap the matched text in a span, but this didn't do anything, it would either error or fail to add the span node, depending on how I tweaked the code. Thanks for any insight you can provide!
You can't use nodeValue to replace a text node with arbitrary HTML.
You must do it manually:
// Swap `node` for the nodes produced by parsing `html`.
// The markup is parsed inside a scratch <div>, its children are moved in
// front of the node's former next sibling, and the node is then removed.
// Does nothing when `node` has no parent.
function replaceNodeWithHTML(node, html) {
    var container = node.parentNode;
    if (container) {
        var anchor = node.nextSibling;
        var scratch = document.createElement('div');
        scratch.innerHTML = html;
        // moving a child out of the scratch element exposes the next one
        for (var piece = scratch.firstChild; piece; piece = scratch.firstChild) {
            container.insertBefore(piece, anchor);
        }
        container.removeChild(node);
    }
}
var regex_pattern = /(apple)/g;
var textNodes = [document.querySelector('div').firstChild];
for (var i = 0; i < textNodes.length; i++)
if (textNodes[i].nodeValue.match(regex_pattern))
replaceNodeWithHTML(
textNodes[i],
textNodes[i].nodeValue.replace(regex_pattern, "<span class='highlight'>$&</span>")
);
.highlight {
background: yellow;
}
<div>I have an (apple). You have an (apple) too.</div>
It would be easier if nodes had insertAdjacentHTML method, but only elements do.
Set .innerHTML on the element instead. Setting textNode.nodeValue directly sets the node's text, so any markup in the string is displayed literally rather than parsed.
Lets say you have an HTML string like this:
<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>
And need to place <br/> elements after every space character.... which I was doing with:
HTMLtext.replace(/\s{1,}/g, ' <br/>');
However, the problem is that this inserts breaks after space characters in-between tags (between tag properties) too and I'd of course like to do this for tag textual contents only. Somehow I was always really bad with regular expressions - could anyone help out?
So basically do my original whitespace match but only if its not between < and > ?
Regex is not a good tool for this. You should be working with the DOM, not with the raw HTML string.
For a quick-and-dirty solution that presupposes that there are no < or > character in your string except those delimiting a tag, you can try this, though:
result = subject.replace(/\s+(?=[^<>]*<)/g, "$&<br/>");
This inserts a <br/> after whitespace only if the next angle bracket is an opening angle bracket.
Explanation:
\s+ # Match one or more whitespace characters (including newlines!)
(?= # but only if (positive lookahead assertion) it's possible to match...
[^<>]* # any number of non-angle brackets
< # followed by an opening angle bracket
) # ...from this position in the string onwards.
Replace that with $& (which contains the matched characters) plus <br/>.
This regex does not check if there is a > further behind, as this would require a positive look*behind* assertion, and JavaScript did not support these when this was written (lookbehind was only added in ES2018, so older engines still lack it). So you can't portably check for that, but if you control the HTML and are sure that the conditions I mentioned above are met, that shouldn't be a problem.
See this answer for iterating the dom and replacing whitespaces with <br /> elements. The adapted code would be:
// Walk `node` and its descendants; in every text node, each single
// whitespace character becomes a <br> element separating the words.
(function iterate_node(node) {
    var kind = node.nodeType;

    if (kind === 1) { // Node.ELEMENT_NODE
        // children are visited last to first, so the nodes inserted for
        // one child never shift the index of a child still to be visited
        var idx = node.childNodes.length;
        while (idx-- > 0) {
            iterate_node(node.childNodes[idx]); // run recursive on DOM
        }
        return;
    }

    if (kind !== 3) return; // only text nodes are rewritten

    var pieces = node.data.split(/\s/);
    if (pieces.length <= 1) return; // no whitespace, nothing to insert

    // the node keeps the first word; every further word becomes a new
    // text node preceded by a <br>, all placed before the old next sibling
    node.data = pieces[0];
    var anchor = node.nextSibling;
    var container = node.parentNode;
    for (var n = 1; n < pieces.length; n++) {
        var wordNode = document.createTextNode(pieces[n]);
        var lineBreak = document.createElement("br");
        container.insertBefore(lineBreak, anchor);
        container.insertBefore(wordNode, anchor);
    }
})(content); // any dom node
(Demo at jsfiddle.net)
Okay, so you don't want to match spaces inside HTML tags. Only regular expressions isn't sufficient for this. I'll use a lexer to do the job. You can see the output here.
// Copy the input to `result`, replacing each run of whitespace that is
// outside a tag with "<br/>". State 2 is entered on '<' and left on '>';
// while in state 2 every character is copied unchanged.
// NOTE(review): `Lexer` is not defined in this snippet - presumably an
// external lexer library; confirm its rule-priority semantics.
var lexer = new Lexer;
var result = "";
lexer.addRule(/</, function (c) { // start of a tag
this.state = 2; // go to state 2 - exclusive tag state
result += c; // copy to output
});
lexer.addRule(/>/, function (c) { // end of a tag
this.state = 0; // go back to state 0 - initial state
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}, [2]); // only apply this rule when in state 2
lexer.addRule(/\s+/, function () { // match one or more spaces
result += "<br/>"; // replace with "<br/>"
});
lexer.addRule(/.|\n/, function (c) { // match any character
result += c; // copy to output
}); // everything else
lexer.input = '<div id="loco" class="hey" >lorem ipsum pendus <em>hey</em>moder <hr /></div>';
lexer.lex();
Of course, a lexer is a very powerful tool. You may also skip angled brackets inside the value of an attribute in a tag. However I'll leave that for you to implement. Good luck.
In a situation that the body area of a webpage is the only accessible part, is there a way to remove all instances of a particular text-phrase (written in HTML) using inline JavaScript or another inline capable language?
This could be useful in many situations, such as people using a Tiny.cc/customurl and wanting to remove the portion stating "tiny.cc/"
If specifics are allowed, we're modifying a calendar plugin using Tiny.cc to create a custom URLs (tiny.cc/customurl). The plugin shows the full URL by default so we'd like to strip the text "tiny.cc/" and keep the "customurl" portion in our code:
<div class="ews_cal_grid_custom_item_3">
<div class="ews_cal_grid_select_checkbox_clear" id="wGridTagChk" onclick="__doPostBack('wGridTagChk', 'tiny.cc/Baseball-JV');" > </div>
tiny.cc/Baseball-JV
</div>
The part we'd like to remove is the http://tiny.cc/ on the 3rd line by itself.
To do this without replacing all the HTML (which wrecks all event handlers) and to do it without recursion (which is generally faster), you can do this:
// Remove every occurrence of `txt` from all text nodes below `top`,
// without touching elements (so event handlers survive) and without
// recursion.
//
// top: the node whose descendants are searched
// txt: the literal text to remove; an empty or missing value is a no-op
//      (an empty string would otherwise be "found" at index 0 forever)
function removeText(top, txt) {
    var needle = txt == null ? '' : String(txt);
    if (!top || needle === '') return;

    var node = top.firstChild, index;
    while (node && node != top) {
        // if text node, check for our text
        if (node.nodeType == 3) {
            // without using regular expressions (to avoid escaping regex chars),
            // remove copies of the text until none is left, then write the
            // result back once
            var value = node.nodeValue;
            while ((index = value.indexOf(needle)) != -1) {
                value = value.slice(0, index) + value.slice(index + needle.length);
            }
            if (value !== node.nodeValue) {
                node.nodeValue = value;
            }
        }
        if (node.firstChild) {
            // if it has a child node, traverse down into children
            node = node.firstChild;
        } else if (node.nextSibling) {
            // if it has a sibling, go to the next sibling
            node = node.nextSibling;
        } else {
            // go up the parent chain until we find a parent that has a nextSibling
            // so we can keep going; reaching `top` ends the outer loop
            while ((node = node.parentNode) != top) {
                if (node.nextSibling) {
                    node = node.nextSibling;
                    break;
                }
            }
        }
    }
}
Working demo here: http://jsfiddle.net/jfriend00/2y9eH/
To do this on the entire document, you would just call:
removeText(document.body, "http://tiny.cc/Baseball-JV");
As long as you can supply the data in string format, you can use Regular Expressions to do this for you.
You could parse the whole innerHTML of the body tag, if that is all that you can access. This is a slow and kinda-bad-practice method, but for explanation's sake:
// Re-serialize the body, strip the prefix from the markup string and parse
// it again. The 'g' flag is required to remove every occurrence; without
// it only the first match is replaced.
document.body.innerHTML = document.body.innerHTML.replace(
/http:\/\/tiny\.cc\//gi, // The regular expression to search for
""); // What to replace with (nothing).
The whole expression is contained within forward slashes, so any forward slashes inside the regexp need to be escaped with a backslash.
This goes for other characters that have special meaning in regexp, such as the period. A single period (.) denotes matching 'any' character. To match a period, it must be escaped (\.)
EDIT:
If you wish to keep the reference to the URL in the onclick, you can modify the regexp to not match when inside single quotes (as your example):
/([^']http:\/\/tiny\.cc\/[^'])/i
If you don't want to replace all the instances of that string in the HTML, then you'll have to recursively iterate over the node structure, for instance:
// Apply String.prototype.replace(search, replacement) to the value of
// every text node below `element`, descending into child elements.
// Nodes of any other type are left alone.
function textFilter(element, search, replacement) {
    var children = element.childNodes;
    var index = 0;
    while (index < children.length) {
        var current = children[index++];
        if (current.nodeType == 1) { // element
            textFilter(current, search, replacement);
            continue;
        }
        if (current.nodeType == 3) { // text node
            current.nodeValue = current.nodeValue.replace(search, replacement);
        }
    }
}
Then you just grab hold of the appropriate element, and call this function on it:
var el = document.getElementById('target');
// The dot is escaped so that it matches a literal '.' only; an unescaped
// '.' would also match e.g. "http://tinyXcc/".
textFilter(el, /http:\/\/tiny\.cc\//g, ""); // You could use a regex
// A string pattern replaces only the first occurrence in each text node.
textFilter(el, "Baseball", "Basketball"); // or just a simple string
I'm working on a browser plugin that replaces all instances of "someString" (as defined by a complicated regex) with $1. This generally works ok just doing a global replace on the body's innerHTML. However it breaks the page when it finds (and replaces) the "someString" inside <script> tags (i.e. as a JS variable or other JS reference). It also breaks if "someString" is already part of an anchor.
So basically I want to do a global replace on all instances of "someString" unless it falls inside a <script></script> or <a></a> tag set.
Essentially what I have now is:
var body = document.getElementsByTagName('body')[0].innerHTML;
body = body.replace(/(someString)/gi, '$1');
document.getElementsByTagName('body')[0].innerHTML = body;
But obviously that's not good enough. I've been struggling for a couple hours now and reading all of the answers here (including the many adamant ones that insist regex should not be used with HTML), so I'm open to suggestions on how to do this. I'd prefer using straight JS, but can use jQuery if necessary.
Edit - Sample HTML:
<body>
someString
<script type="text/javascript">
var someString = 'blah';
console.log(someString);
</script>
someString
</body>
In that case, only the very first instance of "someString" should be replaced.
Try this and see if it meets your needs (tested in IE 8 and Chrome).
<script src="jquery-1.4.4.js" type="text/javascript"></script>
<script>
var pattern = /(someString)/gi;
var replacement = "$1";
$(function() {
$("body :not(a,script)")
.contents()
.filter(function() {
return this.nodeType == 3 && this.nodeValue.search(pattern) != -1;
})
.each(function() {
var span = document.createElement("span");
span.innerHTML = " " + $.trim(this.nodeValue.replace(pattern, replacement));
this.parentNode.insertBefore(span, this);
this.parentNode.removeChild(this);
});
});
</script>
The code uses jQuery to find all the text nodes within the document's <body> that are not in <a> (anchor) or <script> elements and that contain the search pattern. Once those are found, a span is injected containing the target node's modified content, and the old text node is removed.
The only issue I saw was that IE 8 handles text nodes containing only whitespace differently than Chrome, so sometimes a replacement would lose a leading space, hence the insertion of the non-breaking space before the text containing the regex replacements.
Well, You can use XPath with Mozilla (assuming you're writing the plugin for FireFox). The call is document.evaluate. Or you can use an XPath library to do it (there are a few out there)...
// Select every element that is not an <a> or <script> and whose text
// content contains "string". The fifth argument (an existing result
// object to reuse) is null; the comma separating it from the result type
// is required for the call to parse.
var matches = document.evaluate(
'//*[not(name() = "a") and not(name() = "script") and contains(., "string")]',
document,
null,
XPathResult.UNORDERED_NODE_ITERATOR_TYPE,
null
);
Then replace using a callback function:
// Replace a text node with the nodes obtained by running the regex
// replacement on its text and parsing the result as HTML.
//
// node: the text node to replace; a node without a parent is ignored.
//
// NOTE(review): the node's text is parsed as markup, so text containing
// '<' or '&' is interpreted as HTML - escape it first if the page text is
// not trusted.
var callback = function(node) {
    var parent = node.parentNode;
    if (!parent) return;
    var text = node.nodeValue;
    text = text.replace(/(someString)/gi, '$1');
    var div = document.createElement('div');
    div.innerHTML = text;
    // div.childNodes is live: moving a child out shifts the rest down, so
    // an indexed loop would skip nodes. Always take the current first child.
    while (div.firstChild) {
        parent.insertBefore(div.firstChild, node);
    }
    parent.removeChild(node);
};
// Run `callback` on every text node found through `matches`: a matched
// text node directly, or the direct text children of a matched element.
var nodes = [];
//cache the tree since we want to modify it as we iterate
var node = matches.iterateNext();
while (node) {
    nodes.push(node);
    node = matches.iterateNext();
}
for (var key = 0, length = nodes.length; key < length; key++) {
    node = nodes[key];
    // Check for a Text node
    if (node.nodeType == Node.TEXT_NODE) {
        callback(node);
        continue;
    }
    // Collect the text children first: callback() replaces a child with
    // one or more new nodes, which shifts the indexes of the live
    // childNodes list while it is being walked.
    var textChildren = [];
    for (var i = 0, l = node.childNodes.length; i < l; i++) {
        var child = node.childNodes[i];
        if (child.nodeType == Node.TEXT_NODE) {
            textChildren.push(child);
        }
    }
    for (var j = 0; j < textChildren.length; j++) {
        callback(textChildren[j]);
    }
}
I know you don't want to hear this, but this doesn't sound like a job for a regex. Regular expressions don't do negative matches very well before becoming complicated and unreadable.
Perhaps this regex might be close enough though:
/>[^<]*(someString)[^<]*</
It captures any instance of someString that are inbetween a > and a <.
Another idea is if you do use jQuery, you can use the :contains pseudo-selector.
$('*:contains(someString)').each(function(i)
{
var markup = $(this).html();
// modify markup to insert anchor tag
$(this).html(markup)
});
This will grab any DOM item that contains 'someString' in its text. I don't think it will traverse <script> tags, so you should be good.
You could try the following:
/(someString)(?![^<]*?(<\/a>|<\/script>))/
I didn't test every scenario, but it is basically using a negative lookahead to look for the next opening bracket following someString, and if that bracket is part of an anchor or script closing tag, it does not match.
Your example seems to work in this fiddle, although it certainly doesn't cover all possibilities. In cases where the innerHTML in your <a></a> contains tags (like <b> or <span>), or the code in your script tags generates html (contains strings with tags in it), you would need something more complex.
I have several posts on a website; all these posts are chat conversations of this type:
AD: Hey!
BC: What's up?
AD: Nothing
BC: Okay
They're marked up as simple paragraphs surrounded by <p> tags.
Using the javascript replace function, I want all instances of "AD" in the beginning of a conversation (ie, all instances of "AD" at the starting of a line followed by a ":") to be surrounded by <strong> tags, but only if the instance isn't already surrounded by a <strong> tag.
What regex should I use to accomplish this? Am I trying to do what this advises against?
The code I'm using is like this:
var posts = document.getElementsByClassName('entry-content');
for (var i = 0; i < posts.length; i++) {
posts[i].innerHTML = posts[i].innerHTML.replace(/some regex here/,
'replaced content here');
}
If AD: is always at the start of a line then the following regex should work, using the m switch:
.replace(/^AD:/gm, "<strong>AD:</strong>");
You don't need to check for the existence of <strong> because ^ will match the start of the line and the regex will only match if the sequence of characters that follows the start of the line are AD:.
You're not going against the "Don't use regex to parse HTML" advice because you're not parsing HTML, you're simply replacing a string with another string.
An alternative to regex would be to work with ranges, creating a range selecting the text and then using execCommand to make the text bold. However, I think this would be much more difficult and you would likely face differences in browser implementations. The regex way should be enough.
After seeing your comment, the following regex would work fine:
.replace(/<(p|br)>AD:/gm, "<$1><strong>AD:</strong>");
Wouldn't it be easier to set the class or style property of the found paragraph to font-weight: bold, or to a class that does roughly the same? That way you wouldn't have to worry about adding in tags, or searching for existing tags. It might perform better, too, if you don't have to do any string replaces.
If you really want to add the strong tags anyway, I'd suggest using DOM functions to find childNodes of your paragraph that are <strong>, and if you don't find one, add it and move the original (text) childNode of the paragraph into it.
Using regular expressions on the innerHTML isn't reliable and will potentially lead to problems. The correct way to do this is a tiresome process but is much more reliable.
E.g.
// For each post, wrap a leading "AD:" of its text nodes in <strong>,
// unless that text already sits directly inside a <strong> element.
for (var i = 0, l = posts.length; i < l; i++) {
findAndReplaceInDOM(posts[i], /^AD:/g, function(match, node){
// Make sure the current node does not have a <strong> as its parent;
// returning false leaves this match untouched
if (node.parentNode.nodeName.toLowerCase() === 'strong') {
return false;
}
// Create and return a new <strong> holding the matched text
var s = document.createElement('strong');
s.appendChild(document.createTextNode(match[0]));
return s;
});
}
And the findAndReplaceInDOM function:
// Find every match of `regex` in the text nodes below `node` and replace
// it with the node returned by `replaceFn`.
//
// node:      root of the subtree to process (its children are visited)
// regex:     pattern to search for; MUST have the global flag, otherwise
//            the call is a no-op
// replaceFn: function(match, textNode) returning the replacement node,
//            or a falsy value to leave that match as it is
//
// Returns nothing; the DOM is modified in place.
function findAndReplaceInDOM(node, regex, replaceFn) {
    // Note: regex MUST have global flag
    if (!regex || !regex.global || typeof replaceFn !== 'function') {
        return;
    }
    var d = document;
    var next, match, parent, text, pieces, cursor,
        start, end, replacementNode, i;

    // Loop through all childNodes of "node". The next sibling is looked
    // up before a node is processed, because a processed text node is
    // removed from the document and no longer has siblings.
    for (node = node && node.firstChild; node; node = next) {
        next = node.nextSibling;

        if (node.nodeType === 1) {
            // Regular element, recurse:
            findAndReplaceInDOM(node, regex, replaceFn);
            continue;
        }
        if (node.nodeType !== 3) {
            continue;
        }

        // Text node, introspect. All matches are gathered first and the
        // node is swapped for the resulting pieces once, so a second
        // match never operates on an already removed node.
        parent = node.parentNode;
        text = node.data;
        pieces = [];
        cursor = 0; // end of the last replaced match within `text`
        regex.lastIndex = 0;
        while (match = regex.exec(text)) {
            if (match[0] === '') {
                // an empty match does not advance lastIndex by itself
                regex.lastIndex++;
                continue;
            }
            replacementNode = replaceFn(match, node);
            if (!replacementNode) {
                continue;
            }
            start = match.index;
            end = start + match[0].length;
            // untouched text between the previous replacement and this one
            if (start > cursor) {
                pieces.push(d.createTextNode(text.substring(cursor, start)));
            }
            pieces.push(replacementNode);
            cursor = end;
        }
        if (!pieces.length) {
            continue;
        }
        // remaining text after the last replacement
        if (cursor < text.length) {
            pieces.push(d.createTextNode(text.substring(cursor)));
        }
        for (i = 0; i < pieces.length; i++) {
            parent.insertBefore(pieces[i], node);
        }
        // Remove original node from document
        parent.removeChild(node);
    }
}