replace and regex exception - javascript

I want to wrap all the words of a text in a <trans> tag, to be able to work on each word: hover them, translate on click, etc.
For that I need an exception in my replace function to ignore html tags like <br> or <span>.
Here is the function I have :
/**
 * Wraps every word of an HTML string in a tag while leaving the markup itself untouched.
 *
 * The previous version tried to skip tags with a negative lookahead built from a
 * character class (`[<br>...]`). A character class is a set of single characters,
 * so it rejected any word starting with b, r, s, p, a, n, ... instead of skipping
 * `<br>` / `<span>`. Tags are now separated from the text first, and only the text
 * segments are searched for words.
 *
 * @param {string} str - HTML fragment whose words should be wrapped.
 * @param {string|Function} [tmpl] - Replacement passed to String.prototype.replace
 *   for each word (`$&` is the word). Defaults to "<trans>$&</trans>".
 * @returns {string} The fragment with every word wrapped; tags and their attributes are unchanged.
 */
function wrapWords(str, tmpl) {
  const replacement = tmpl || "<trans>$&</trans>";
  // Splitting on a capturing group keeps the separators in the result:
  // even indexes are text between tags, odd indexes are the tags themselves.
  return str
    .split(/(<[^>]*>)/)
    .map((chunk, index) =>
      index % 2 === 1 ? chunk : chunk.replace(/[a-zA-ZÀ-ÿ]+/gi, replacement)
    )
    .join("");
}
This function is working well with russian characters but not with french ones. The problem is that the <br> and <span> exception is excluding french characters b,r,s,p,a... Because of that some words are not wrapped correctly in my <trans> tag.
Does anyone know how I could exclude a group of characters, like a specific tag (<br> for example), without affecting the letters b and r in French?
Thanks for any answer!

Properly using DOM, it is a bit more complex, but no corner cases to worry about, as it is very straightforward.
You want to split the text, thus it makes sense to only operate on text nodes. To find all text nodes, we could evaluate an XPath, or we could construct a TreeWalker.
Once we know which nodes we want to operate on, we take one node at a time and get all-space and no-space sequences. Each will be transformed into another text node, but the no-space sequences will additionally be wrapped inside a <span>. We append them one by one in front of the original node, which will guarantee the correct order, then finally we'll remove the original node, when the replacement nodes are all in their place.
/**
 * Collects every text node found under `node`, in the order the TreeWalker yields them.
 *
 * @param {Node} node - Root of the subtree to scan.
 * @returns {Text[]} The text nodes, gathered into a plain array so the caller can
 *   modify the DOM afterwards without disturbing the walk.
 */
function getTextNodes(node) {
  const treeWalker = document.createTreeWalker(node, NodeFilter.SHOW_TEXT, null, false);
  const collected = [];
  // nextNode() returns null once the walker runs out of text nodes.
  for (let current = treeWalker.nextNode(); current; current = treeWalker.nextNode()) {
    collected.push(current);
  }
  return collected;
}
/**
 * Replaces every text node under `element` with a sequence of nodes: each run of
 * non-whitespace characters becomes a <trans> element, each run of whitespace
 * stays a plain text node.
 *
 * @param {Node} element - Root of the subtree whose text should be wrapped.
 */
function wrap(element) {
  for (const original of getTextNodes(element)) {
    // Group 1 is set for a word, group 2 for whitespace; together they cover the whole text.
    for (const [chunk, word] of original.textContent.matchAll(/(\S+)|(\s+)/g)) {
      const textPiece = document.createTextNode(chunk);
      let piece = textPiece;
      if (word) {
        piece = document.createElement('trans');
        piece.appendChild(textPiece);
      }
      // Inserting in front of the original node keeps the pieces in reading order.
      original.parentNode.insertBefore(piece, original);
    }
    // All replacement pieces are in place, so the source node can go.
    original.remove();
  }
}
wrap(document.getElementById('wrapthis'));
trans {
background-color: pink;
}
Not affected<br/>
<div id="wrapthis">
This is affected<br>
<span class="gras">HTML tags are fine</span><br/>
This as well<br/>
</div>
Not affected<br/>

Here's a quick way:
"foo bar baz".split(" ").map(w => "<trans>" + w + "</trans>").join(" ");
Explanation:
The sentence is split on the space character, which gives an Array. Each element of this Array is then wrapped in <trans> tags. Then everything is joined back together to form a string.
Edit: usage in the DOM:
var sourceTextNode = document.createElement("div"); // here you're supposed to get an existing node...
sourceTextNode.textContent = "foo bar baz"; // ... and doing this is for the example purposes
sourceTextNode.innerHTML = sourceTextNode.textContent.split(" ").map(w => "<trans>" + w + "</trans>").join(" ");
sourceTextNode is:
<div>
<trans>foo</trans>
<trans>bar</trans>
<trans>baz</trans>
</div>
Note: You may want to exclude the empty elements that appear in the split Array when there are multiple consecutive space characters.
One way to do this is testing the non-emptiness of the elements in a filter:
sourceText.split(" ").filter(Boolean)...

Related

Wrap all individual words in a span tag based on their first letter

I am trying to wrap each individual word on a webpage in a tag so I can style them individually based on their starting letter.
I have found this method of wrapping each word in a span tag individually, but I can't figure out how to vary the class based on the first letter of the word.
let e = document.getElementById('words');
e.innerHTML = e.innerHTML.replace(/(^|<\/?[^>]+>|\s+)([^\s<]+)/g, '$1<span class="word">$2</span>');
What you're trying to achieve can't be done with regex if you want to reference the individual words.
I've written a little snippet that uses document.querySelector() instead
outerText property on the query selector object returns a plain text string which is later converted to an array with split() function
Then it simply loops over the array and appends the style tag and to get the first letter I've used substring() function
// Wrap each word of #words in a <span> whose class is the word's first letter.
const wordsDiv = document.querySelector("#words");
// Split on runs of whitespace and drop empty entries, so repeated spaces or
// line breaks do not produce empty spans with an empty class.
const words = wordsDiv.outerText.split(/\s+/).filter(Boolean);
wordsDiv.textContent = "";
// Build real nodes instead of appending to innerHTML: `innerHTML +=` re-parses
// the whole element on every iteration and would interpret characters such as
// "<" or '"' inside a word as markup.
for (const word of words) {
  const span = document.createElement("span");
  span.className = word.substring(0, 1);
  span.textContent = word;
  wordsDiv.append(span, " ");
}
<div id="words">red green blue orange</div>
Try this:
let e = document.getElementById('words');
// Create array with words
let words = e.innerHTML.split(' ');
// Object with CSS classes and corresponding letter
const classes = {
  a: 'class-one',
  b: 'class-two',
  // and so on
};
// Return array with the new strings
words = words.map(word => {
  const firstLetter = word.substring(0, 1);
  // A letter with no entry in `classes` used to be rendered as class="undefined";
  // fall back to an empty class instead.
  const className = classes[firstLetter] ?? '';
  return `<span class="${className}">${word}</span>`;
});
// Join the array and update the DOM
e.innerHTML = words.join(' ');
Here the following is happening:
Words are being separated by a space, creating an array;
The classes constant must receive, as in the example, the correspondence of each letter with its class;
In the function of the map method, the first letter of the word is being returned and the classes object is accessed with that letter;
Finally we do a join to join all the texts separating them with a space.
Remembering that if there is more than one space between words, inconsistencies will occur, as the first letter will be a space.
From the string.prototype.replace reference, you can also add a replace function to the method, instead of an string. The replace function has this form.
So, if I didn't misinterpret your problem, you can do something similar to this:
let e = document.getElementById('words');
// Map from a word's first letter to the CSS class it should receive.
// Built once here rather than inside the callback, which runs for every match.
const myClasses = {
  a: "aword",
  b: "bword",
  // ...add one entry per starting letter
};
// p1 is whatever precedes the word (start of string, a tag, or whitespace) and is
// emitted unchanged; p2 is the word itself.
e.innerHTML = e.innerHTML.replace(/(^|<\/?[^>]+>|\s+)([^\s<]+)/g, function(match, p1, p2) {
  // Letters without an entry get an empty class rather than the text "undefined".
  return `${p1}<span class="${myClasses[p2[0]] ?? ""}">${p2}</span>`;
});
You could do something like this:
Select all the children
map them with their text content
Split the text into single words
map again with some styled html
Join, render and enjoy.
words.innerHTML = [...words.children].flatMap(el => el.innerText.replace(/\n/ig, " ").split(" ")).map(el => `<div class="someClass">${el}</div>`).join("")
.someClass {
color: red;
background: orange;
margin: 10px;
}
<div id="words">
<div>
Some dummy content
<span>Some other nested dummy content</span>
</div>
<p>Some sibling content</p>
</div>

How can I remove all HTML elements from a string excluding a special class?

I've a problem. I'm currently looking for a way to remove any HTML elements from a string. But there are two conditions:
The content of the elements should be kept
Special elements with a defined class should not be removed
I've already tried lots of things and looked at plenty of questions/answers on SO, but unfortunately I can't really figure out any of the answers. Unfortunately, this exceeds my abilities by far. But I would like to know how something like this works.
Question/Answers I've tried:
How to strip HTML tags from string in JavaScript?,
Strip HTML from Text JavaScript
So when I have for example a string like this:
You have to pay <div class="keep-this">$200</div> per <span class="date">month</span> for your <span class="vehicle">car</span>
It should looks like this after stripping:
You have to pay <div class="keep-this">$200</div> per month for your car
I've actually tried following things:
jQuery(document).ready(function ($) {
let string = 'You have to pay <div class="keep-this">$200</div> per <span class="date">month</span> for your <span class="vehicle">car</span>';
console.log(string);
function removeHTMLfromString(string) {
let tmp = document.createElement("DIV");
tmp.innerHTML = string;
return tmp.textContent || tmp.innerText || "";
}
console.log(removeHTMLfromString(string));
console.log(string.replace(/<[^>]*>?/gm, ''));
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
And I've also tried out a regex tool to see what gets removed, but unfortunately, I'm not making much progress here either:
https://www.regexr.com/50qar
I would love if someone can help me with this task. Thanks a lot!
Update
Maybe there is a way doing it with just a regex? If yes, how can I exclude my elements with a special class when using this regex: /<\/?[^>]+(>|$)/g?
It may be a little big code. But I think it may help you.
let str = 'You have to pay <div class="keep-this">$200</div> per <span class="date">month</span> for your <span class="vehicle">car</span> <div class="keep-this">$500</div> also';
const el = document.createElement("div");
el.innerHTML = str;
// Get all the elements to keep
const keep = el.querySelectorAll(".keep-this");
// Swap each kept element for an indexed placeholder directly in the parsed DOM.
// Searching the source string for `outerHTML` is unreliable, because the browser's
// serialisation can differ from the original text (quote style, attribute case, ...).
const keptHtml = [];
keep.forEach((v, i) => {
  keptHtml.push(v.outerHTML);
  v.replaceWith(`_k${i}_`);
});
// Get the text only
let stringify = el.innerText;
// Put the kept elements back in place of their placeholders. A replacer function
// is used so that "$" sequences inside the kept HTML (e.g. "$$" or "$&") are
// inserted literally instead of being treated as replacement patterns.
keptHtml.forEach((html, i) => {
  stringify = stringify.replace(`_k${i}_`, () => html);
});
console.log(stringify);
Leave me a comment if anything is misleading.
Update: Regular Expression approach
The same task can be done by using a regular expression. The approach is-
Find all the keepable elements by regex and store them.
Replace all the keepable elements from the input string by an identical pattern
Remove all the HTML tags from the string.
Replace the identical patterns by keepable elements.
// RegExp for keep elements
const keepRegex = /<([a-z1-6]+)\s+(class=[\'\"](keep-this\s*.*?)[\'\"])[^>]*>.*?<\/\1>/ig;
// RegExp for opening tag
const openRegex = /<([a-z1-6]+)\b[^>]*>/ig;
// RegExp for closing tag
const closeRegex = /<\/[a-z1-6]+>/ig;

/**
 * Removes all HTML tags from `html` except the elements matched by `keepRegex`,
 * which are preserved verbatim (tag, attributes and content).
 *
 * @param {string} html - Markup to strip.
 * @returns {string} Text of `html` with only the kept elements still marked up.
 */
function stripTagsExceptKept(html) {
  // Find all the matches for the keeping elements
  const matches = [...html.matchAll(keepRegex)];
  let result = html;
  // Park each kept element behind an indexed placeholder so the tag-stripping
  // below cannot touch it.
  matches.forEach((match, i) => {
    result = result.replace(match[0], () => `_k${i}_`);
  });
  // Remove opening and closing tags from what is left
  result = result.replace(openRegex, '').replace(closeRegex, '');
  // Restore the kept elements. The replacer is a function on purpose: passed as a
  // plain string, "$$", "$&" etc. inside the kept HTML would be interpreted as
  // replacement patterns and corrupt the output.
  matches.forEach((match, i) => {
    result = result.replace(`_k${i}_`, () => match[0]);
  });
  return result;
}

let htmlString = 'You have to pay <div class="keep-this">$200</div> per <span class="date">month</span> for your <span class="vehicle">car</span> Another <div class="keep-this">$400</div> here';
htmlString = stripTagsExceptKept(htmlString);
console.log(htmlString);
If date and vehicles div and class are coming from another function, you should just get rid of it from there.

Search body for document for {~ contents ~}

Alright, so basically I would like to search the Body tags for {~ , then get whatever follows that until ~} and turn that into a string (not including the {~ or ~} ).
// Capture the text between the first "{~" and the nearest "~}". The quantifier is
// lazy (`+?`) on purpose: a greedy `.+` would run on to the LAST "~}" in the body
// and swallow everything between two separate {~ ... ~} markers.
const match = document.body.innerHTML.match(/\{~(.+?)~\}/);
if (match) console.log(match[1]);
else console.log('No match found');
<body>text {~inner~} text </body>
// On DOM ready, log the content of every {~ ... ~} marker found in the body's HTML.
$(function(){
  const bodyText = document.getElementsByTagName("body")[0].innerHTML;
  // `found` was previously assigned without a declaration (implicit global).
  // match() returns null when nothing matches, so fall back to an empty list
  // before handing the result to $.each.
  const found = bodyText.match(/{~(.*?)~}/gi) || [];
  $.each(found, function( index, value ) {
    // Each entry is the full marker; strip the delimiters to leave the content.
    const ret = value.replace(/{~/g,'').replace(/~}/g,'');
    console.log(ret);
  });
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8.3/jquery.min.js"></script>
<body> {~Content 1~}
{~Content 2~}
</body>
There you go, put gi at the end of the regex.
This is a harder problem to solve than it would first appear; things like script tags and comments can throw a wrench into things if you just grab the innerHTML of the body. The following function takes a base element to search, in your case you'll want to pass in document.body, and returns an array containing any of the strings found.
/**
 * Returns the contents of every {~ ... ~} marker found in the direct children of
 * `baseElement`.
 *
 * Only the immediate child nodes are inspected: text nodes are searched through
 * their nodeValue, element nodes (other than <script>) through their innerHTML.
 * Every other node type, such as comments, is ignored.
 *
 * @param {Node} baseElement - Node whose childNodes are searched.
 * @returns {string[]} Captured marker contents, in the order they were found.
 */
function getMyTags (baseElement) {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;
  const markerPattern = /{~(.*?)~}/g;
  const found = [];
  baseElement.childNodes.forEach(child => {
    const isText = child.nodeType === TEXT_NODE;
    const isSearchableElement = child.nodeType === ELEMENT_NODE && child.nodeName !== 'SCRIPT';
    if (!isText && !isSearchableElement) {
      return;
    }
    // Text nodes carry their content in nodeValue; for elements the markup is searched.
    const haystack = isText ? child.nodeValue : child.innerHTML;
    // exec() on a global regex resumes after the previous hit and returns null
    // (resetting itself) once the string is exhausted.
    for (let hit = markerPattern.exec(haystack); hit !== null; hit = markerPattern.exec(haystack)) {
      // hit[1] is the first capture group: the text between the delimiters.
      found.push(hit[1]);
    }
  });
  return found;
}
console.log('All the matches in the body:', getMyTags(document.body));
console.log('Just in header:', getMyTags(document.getElementById('title')));
<h1 id="title"><b>{~Foo~}</b>{~bar~}</h1>
Some text that is {~not inside of an element~}
<!-- This {~comment~} should not be captured -->
<script>
// this {~script~} should not be captured
</script>
<p>Something {~after~} the stuff that shouldn't be captured</p>
The regular expression /{~(.*?)~}/g works like this:
{~ start our match at {~
(.*?) capture anything after it; the ? makes it "non-greedy" (also known as "lazy") so, if you have two instances of {~something~} in any of the strings we are searching it captures each individually instead of capturing from the first {~ to the last ~} in the string.
~} says there has to be a ~} after our match.
The g option makes it a 'global' search, meaning it will look for all matches in the string, not just the first one.
Further reading
childNodes
nodeType
Regular-Expressions.info has a great regular expression tutorial.
MDN RegExp documentation
Tools
There are lots of different tools out there to help you develop regular expressions. Here are a couple I've used:
RegExr has a great tool that explains how a particular regular expression works.
RegExPal

How to get numbers in elements' inner text by javascript's regex

I want to get numbers in the inner text of an html by javascript regex to replace them.
for example in the below code I want to get 1,2,3,4,5,6,1,2,3,1,2,3, but not the 444 inside of the div tag.
<body>
aaaa123aaa456
<div style="background: #444">aaaa123aaaa</div>
aaaa123aaa
</body>
What could be the regular expression?
Your best bet is to use innerText or textContent to get at the text without the tags and then just use the regex /\d/g to get the numbers.
/**
 * Returns every digit that appears in the text of a DOM node, one array entry per digit.
 *
 * @param {Node} rootDomNode - Node whose text is scanned (`textContent`, falling
 *   back to `innerText`).
 * @returns {string[]} The digits in document order; empty when there are none.
 */
function digitsInText(rootDomNode) {
  // Final `|| ''` guards nodes where both properties are empty or missing
  // (e.g. textContent is null on a Document); previously text.match() threw there.
  const text = rootDomNode.textContent || rootDomNode.innerText || '';
  return text.match(/\d/g) || [];
}
For example,
alert(digitsInText(document.body));
If your HTML is not in the DOM, you can try to strip the tags yourself : JavaScript: How to strip HTML tags from string?
Since you need to do a replacement, I would still try to walk the DOM and operate on text nodes individually, but if that is out of the question, try
// Tokenizer for an HTML string. The alternatives are, in order:
//   1. a run of text containing no digits (a "<" that does not start a tag or comment counts as text),
//   2. an HTML comment,
//   3. an opening or closing tag, including quoted attribute values,
//   4. a run of digits — the only alternative with a capture group.
var HTML_TOKEN = /(?:[^<\d]|<(?!\/?[a-z]|!--))+|<!--[\s\S]*?-->|<\/?[a-z](?:[^">']|"[^"]*"|'[^']*')*>|(\d+)/gi;

/**
 * Adds one to every number that appears in the text of an HTML string, leaving
 * numbers inside tags and comments untouched.
 *
 * @param {string} html - Markup to process.
 * @returns {string} The markup with each text-level number incremented.
 */
function incrementAllNumbersInHtmlTextNodes(html) {
  // The capture group is only set when the token is a run of digits in text;
  // every other token is passed through as-is.
  return html.replace(HTML_TOKEN, (token, digits) =>
    typeof digits === "string" ? String(Number(digits) + 1) : token
  );
}
then
incrementAllNumbersInHtmlTextNodes(
'<b>123</b>Hello, World!<p>I <3 Ponies</p><div id=123>245</div>')
produces
'<b>124</b>Hello, World!<p>I <4 Ponies</p><div id=123>246</div>'
It will get confused around where special elements like <script> end and won't recognize digits that are entity encoded, but should work otherwise.
You don't necessarily need RegExp to get the text contents of an element excluding its descendant elements' — in fact I'd advise against it as RegExp matching for HTML is notoriously difficult — there are DOM solutions:
/**
 * Concatenates the text of an element's direct text-node children, ignoring the
 * text of any descendant elements.
 *
 * @param {Node} element - Node whose immediate children are inspected.
 * @returns {string} The joined nodeValue of every direct text-node child.
 */
function getImmediateText(element){
  let text = '';
  // Text and elements are all DOM nodes. We can grab the lot of immediate descendants and cycle through them.
  const children = element.childNodes;
  // Plain bounded loop: the earlier condition `i < l, node = ...` used the comma
  // operator, which discarded the bound check and relied on indexing past the end.
  for (let i = 0; i < children.length; i++) {
    const node = children[i];
    // nodeType 3 is text
    if (node.nodeType === 3) {
      text += node.nodeValue;
    }
  }
  return text;
}
var bodyText = getImmediateText(document.getElementsByTagName('body')[0]);
So here there's a function that will return only the immediate text content as a string. Of course, you could then strip that for numbers with the RegExp using something like this:
// match() returns null when the text contains no digits; fall back to an empty
// list so that join() yields '' instead of throwing.
var numberString = (bodyText.match(/\d+/g) || []).join('');
Just to answer my old question:
It is possible to achieve it by lookahead.
/\d(?=[^<>]*(<|$))/g
to replace the numbers
html.replace(/\d(?=[^<>]*(<|$))/g, function($0) {
return map[$0]
});
the source of the answer https://www.drupal.org/node/619198#comment-5710052

Help write regex that will surround certain text with <strong> tags, only if the <strong> tag isn't present

I have several posts on a website; all these posts are chat conversations of this type:
AD: Hey!
BC: What's up?
AD: Nothing
BC: Okay
They're marked up as simple paragraphs surrounded by <p> tags.
Using the javascript replace function, I want all instances of "AD" in the beginning of a conversation (ie, all instances of "AD" at the starting of a line followed by a ":") to be surrounded by <strong> tags, but only if the instance isn't already surrounded by a <strong> tag.
What regex should I use to accomplish this? Am I trying to do what this advises against?
The code I'm using is like this:
var posts = document.getElementsByClassName('entry-content');
for (var i = 0; i < posts.length; i++) {
posts[i].innerHTML = posts[i].innerHTML.replace(/some regex here/,
'replaced content here');
}
If AD: is always at the start of a line then the following regex should work, using the m switch:
.replace(/^AD:/gm, "<strong>AD:</strong>");
You don't need to check for the existence of <strong> because ^ will match the start of the line and the regex will only match if the sequence of characters that follows the start of the line are AD:.
You're not going against the "Don't use regex to parse HTML" advice because you're not parsing HTML, you're simply replacing a string with another string.
An alternative to regex would be to work with ranges, creating a range selecting the text and then using execCommand to make the text bold. However, I think this would be much more difficult and you would likely face differences in browser implementations. The regex way should be enough.
After seeing your comment, the following regex would work fine:
.replace(/<(p|br)>AD:/gm, "<$1><strong>AD:</strong>");
Wouldn't it be easier to set the style property of the found paragraph to font-weight: bold, or to give it a class that does roughly the same? That way you wouldn't have to worry about adding in tags, or searching for existing tags. Might perform better, too, if you don't have to do any string replaces.
If you really want to add the strong tags anyway, I'd suggest using DOM functions to find childNodes of your paragraph that are <strong>, and if you don't find one, add it and move the original (text) childNode of the paragraph into it.
Using regular expressions on the innerHTML isn't reliable and will potentially lead to problems. The correct way to do this is a tiresome process but is much more reliable.
E.g.
// Bold every "AD:" match in each post by wrapping it in a <strong> element.
// NOTE(review): `posts` is not defined in this snippet — presumably the
// collection from getElementsByClassName('entry-content') in the question; confirm.
// The length is read once into `l` before the loop starts.
for (var i = 0, l = posts.length; i < l; i++) {
// The regex has no `m` flag, so ^ only matches at the very start of the text
// of each text node that is searched.
findAndReplaceInDOM(posts[i], /^AD:/g, function(match, node){
// Make sure the current node does not have a <strong> as a parent
if (node.parentNode.nodeName.toLowerCase() === 'strong') {
// Returning false tells findAndReplaceInDOM to leave this match alone
return false;
}
// Create and return new <strong> containing the matched text
var s = document.createElement('strong');
s.appendChild(document.createTextNode(match[0]));
return s;
});
}
And the findAndReplaceInDOM function:
/**
 * Walks every text node below `node` and lets `replaceFn` swap each regex match
 * for a DOM node.
 *
 * Child elements are searched recursively. Nodes inserted as replacements are
 * not searched again.
 *
 * @param {Node} node - Root whose descendants are searched.
 * @param {RegExp} regex - Pattern to look for. MUST have the global flag;
 *   otherwise the function returns without doing anything.
 * @param {function(Array, Text): (Node|false)} replaceFn - Called with the match
 *   array and the text node it was found in (still attached to its parent).
 *   Return the node to put in place of the match, or a falsy value to leave
 *   that match untouched.
 * @returns {undefined}
 */
function findAndReplaceInDOM(node, regex, replaceFn) {
  // Note: regex MUST have global flag
  if (!regex || !regex.global || typeof replaceFn !== 'function') {
    return;
  }
  let child = node && node.firstChild;
  while (child) {
    // Remember the sibling before touching `child`: a text node that gets
    // replaced is detached, and a detached node's nextSibling is null, which
    // used to end the walk early and skip the remaining siblings.
    const next = child.nextSibling;
    if (child.nodeType === 1) {
      // Regular element, recurse:
      findAndReplaceInDOM(child, regex, replaceFn);
    } else if (child.nodeType === 3) {
      // Text node, introspect
      replaceMatchesInTextNode(child, regex, replaceFn);
    }
    child = next;
  }
}

// Splits one text node around every accepted match: the untouched text pieces and
// the replacement nodes are inserted in front of the node, which is removed once
// at the end (and only if something was actually replaced).
function replaceMatchesInTextNode(textNode, regex, replaceFn) {
  const parent = textNode.parentNode;
  const text = textNode.data;
  let cursor = 0; // end of the last replaced match within `text`
  let replaced = false;
  let match;
  regex.lastIndex = 0;
  while ((match = regex.exec(text))) {
    const start = match.index;
    const end = start + match[0].length;
    if (match[0] === '') {
      // A zero-length match does not advance lastIndex; step over it manually
      // or exec() would return the same match forever.
      regex.lastIndex += 1;
    }
    const replacementNode = replaceFn(match, textNode);
    if (!replacementNode) {
      continue;
    }
    if (start > cursor) {
      parent.insertBefore(document.createTextNode(text.substring(cursor, start)), textNode);
    }
    parent.insertBefore(replacementNode, textNode);
    cursor = end;
    replaced = true;
  }
  if (!replaced) {
    return;
  }
  if (cursor < text.length) {
    parent.insertBefore(document.createTextNode(text.substring(cursor)), textNode);
  }
  // Remove original node from document
  parent.removeChild(textNode);
}

Categories

Resources