Check if HTML snippet is valid with JavaScript - javascript

I need a reliable JavaScript library / function to check if an HTML snippet is valid that I can call from my code. For example, it should check that opened tags and quotation marks are closed, nesting is correct, etc.
I don't want the validation to fail because something is not 100% standard (but would work anyway).

Update: this answer is limited - please see the edit below.
Expanding on #kolink's answer, I use:
var checkHTML = function(html) {
var doc = document.createElement('div');
doc.innerHTML = html;
return ( doc.innerHTML === html );
}
I.e., we create a temporary div with the HTML. In order to do this, the browser will create a DOM tree based on the HTML string, which may involve closing tags etc.
Comparing the div's HTML contents with the original HTML will tell us if the browser needed to change anything.
checkHTML('<a>hell<b>o</b>')
Returns false.
checkHTML('<a>hell<b>o</b></a>')
Returns true.
Edit: As #Quentin notes below, this is excessively strict for a variety of reasons: browsers will often fix omitted closing tags, even if closing tags are optional for that tag. Eg:
<p>one para
<p>second para
...is considered valid (since Ps are allowed to omit closing tags) but checkHTML will return false. Browsers will also normalise tag cases, and alter white space. You should be aware of these limits when deciding to use this approach.

Well, this code:
function tidy(html) {
var d = document.createElement('div');
d.innerHTML = html;
return d.innerHTML;
}
This will "correct" malformed HTML to the best of the browser's ability. If that's helpful to you, it's a lot easier than trying to validate HTML.

None of the solutions presented so far is doing a good job in answering the original question, especially when it comes to
I don't want the validation to fail because something is not 100%
standard (but would work anyways).
tldr >> check the JSFiddle
So I used the input of the answers and comments on this topic and created a method that does the following:
checks html string tag by tag if valid
trys to render html string
compares theoretically to be created tag count with actually rendered html dom tag count
if checked 'strict', <br/> and empty attribute normalizations ="" are not ignored
compares rendered innerHTML with given html string (while ignoring whitespaces and quotes)
Returns
true if rendered html is same as given html string
false if one of the checks fails
normalized html string if rendered html seems valid but is not equal to given html string
normalized means, that on rendering, the browser ignores or repairs sometimes specific parts of the input (like adding missing closing-tags for <p> and converts others (like single to double quotes or encoding of ampersands).
Making a distinction between "failed" and "normalized" allows to flag the content to the user as "this will not be rendered as you might expect it".
Most times normalized gives back an only slightly altered version of the original html string - still, sometimes the result is quite different. So this should be used e.g. to flag user-input for further review before saving it to a db or rendering it blindly. (see JSFiddle for examples of normalization)
The checks take the following exceptions into consideration
ignoring of normalization of single quotes to double quotes
image and other tags with a src attribute are 'disarmed' during rendering
(if non strict) ignoring of <br/> >> <br> conversion
(if non strict) ignoring of normalization of empty attributes (<p disabled> >> <p disabled="">)
encoding of initially un-encoded ampersands when reading .innerHTML, e.g. in attribute values
.
function simpleValidateHtmlStr(htmlStr, strictBoolean) {
if (typeof htmlStr !== "string")
return false;
var validateHtmlTag = new RegExp("<[a-z]+(\s+|\"[^\"]*\"\s?|'[^']*'\s?|[^'\">])*>", "igm"),
sdom = document.createElement('div'),
noSrcNoAmpHtmlStr = htmlStr
.replace(/ src=/, " svhs___src=") // disarm src attributes
.replace(/&/igm, "#svhs#amp##"), // 'save' encoded ampersands
noSrcNoAmpIgnoreScriptContentHtmlStr = noSrcNoAmpHtmlStr
.replace(/\n\r?/igm, "#svhs#nl##") // temporarily remove line breaks
.replace(/(<script[^>]*>)(.*?)(<\/script>)/igm, "$1$3") // ignore script contents
.replace(/#svhs#nl##/igm, "\n\r"), // re-add line breaks
htmlTags = noSrcNoAmpIgnoreScriptContentHtmlStr.match(/<[a-z]+[^>]*>/igm), // get all start-tags
htmlTagsCount = htmlTags ? htmlTags.length : 0,
tagsAreValid, resHtmlStr;
if(!strictBoolean){
// ignore <br/> conversions
noSrcNoAmpHtmlStr = noSrcNoAmpHtmlStr.replace(/<br\s*\/>/, "<br>")
}
if (htmlTagsCount) {
tagsAreValid = htmlTags.reduce(function(isValid, tagStr) {
return isValid && tagStr.match(validateHtmlTag);
}, true);
if (!tagsAreValid) {
return false;
}
}
try {
sdom.innerHTML = noSrcNoAmpHtmlStr;
} catch (err) {
return false;
}
// compare rendered tag-count with expected tag-count
if (sdom.querySelectorAll("*").length !== htmlTagsCount) {
return false;
}
resHtmlStr = sdom.innerHTML.replace(/&/igm, "&"); // undo '&' encoding
if(!strictBoolean){
// ignore empty attribute normalizations
resHtmlStr = resHtmlStr.replace(/=""/, "")
}
// compare html strings while ignoring case, quote-changes, trailing spaces
var
simpleIn = noSrcNoAmpHtmlStr.replace(/["']/igm, "").replace(/\s+/igm, " ").toLowerCase().trim(),
simpleOut = resHtmlStr.replace(/["']/igm, "").replace(/\s+/igm, " ").toLowerCase().trim();
if (simpleIn === simpleOut)
return true;
return resHtmlStr.replace(/ svhs___src=/igm, " src=").replace(/#svhs#amp##/, "&");
}
Here you can find it in a JSFiddle https://jsfiddle.net/abernh/twgj8bev/ , together with different test-cases, including
"<a href='blue.html id='green'>missing attribute quotes</a>" // FAIL
"<a>hell<B>o</B></a>" // PASS
'hell<b>o</b>' // PASS
'<a href=test.html>hell<b>o</b></a>', // PASS
"<a href='test.html'>hell<b>o</b></a>", // PASS
'<ul><li>hell</li><li>hell</li></ul>', // PASS
'<ul><li>hell<li>hell</ul>', // PASS
'<div ng-if="true && valid">ampersands in attributes</div>' // PASS
.

9 years later, how about using DOMParser?
It accepts string as parameter and returns Document type, just like HTML.
Thus, when it has an error, the returned document object has <parsererror> element in it.
If you parse your html as xml, at least you can check your html is xhtml compliant.
Example
> const parser = new DOMParser();
> const doc = parser.parseFromString('<div>Input: <input /></div>', 'text/xml');
> (doc.documentElement.querySelector('parsererror') || {}).innerText; // undefined
To wrap this as a function
function isValidHTML(html) {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/xml');
if (doc.documentElement.querySelector('parsererror')) {
return doc.documentElement.querySelector('parsererror').innerText;
} else {
return true;
}
}
Testing the above function
isValidHTML('<a>hell<B>o</B></a>') // true
isValidHTML('hell') // true
isValidHTML('<a href='test.html'>hell</a>') // true
isValidHTML("<a href=test.html>hell</a>") // This page contains the following err..
isValidHTML('<ul><li>a</li><li>b</li></ul>') // true
isValidHTML('<ul><li>a<li>b</ul>') // This page contains the following err..
isValidHTML('<div><input /></div>' // true
isValidHTML('<div><input></div>' // This page contains the following err..
The above works for very simple html.
However if your html has some code-like texts; <script>, <style>, etc, you need to manipulate just for XML validation although it's valid HTML
The following updates code-like html to a valid XML syntax.
export function getHtmlError(html) {
const parser = new DOMParser();
const htmlForParser = `<xml>${html}</xml>`
.replace(/(src|href)=".*?&.*?"/g, '$1="OMITTED"')
.replace(/<script[\s\S]+?<\/script>/gm, '<script>OMITTED</script>')
.replace(/<style[\s\S]+?<\/style>/gm, '<style>OMITTED</style>')
.replace(/<pre[\s\S]+?<\/pre>/gm, '<pre>OMITTED</pre>')
.replace(/ /g, ' ');
const doc = parser.parseFromString(htmlForParser, 'text/xml');
if (doc.documentElement.querySelector('parsererror')) {
console.error(htmlForParser.split(/\n/).map( (el, ndx) => `${ndx+1}: ${el}`).join('\n'));
return doc.documentElement.querySelector('parsererror');
}
}

function validHTML(html) {
var openingTags, closingTags;
html = html.replace(/<[^>]*\/\s?>/g, ''); // Remove all self closing tags
html = html.replace(/<(br|hr|img).*?>/g, ''); // Remove all <br>, <hr>, and <img> tags
openingTags = html.match(/<[^\/].*?>/g) || []; // Get remaining opening tags
closingTags = html.match(/<\/.+?>/g) || []; // Get remaining closing tags
return openingTags.length === closingTags.length ? true : false;
}
var htmlContent = "<p>your html content goes here</p>" // Note: String without any html tag will consider as valid html snippet. If it’s not valid in your case, in that case you can check opening tag count first.
if(validHTML(htmlContent)) {
alert('Valid HTML')
}
else {
alert('Invalid HTML');
}

Using pure JavaScript you may check if an element exists using the following function:
if (typeof(element) != 'undefined' && element != null)
Using the following code we can test this in action:
HTML:
<input type="button" value="Toggle .not-undefined" onclick="toggleNotUndefined()">
<input type="button" value="Check if .not-undefined exists" onclick="checkNotUndefined()">
<p class=".not-undefined"></p>
CSS:
p:after {
content: "Is 'undefined'";
color: blue;
}
p.not-undefined:after {
content: "Is not 'undefined'";
color: red;
}
JavaScript:
function checkNotUndefined(){
var phrase = "not ";
var element = document.querySelector('.not-undefined');
if (typeof(element) != 'undefined' && element != null) phrase = "";
alert("Element of class '.not-undefined' does "+phrase+"exist!");
// $(".thisClass").length checks to see if our elem exists in jQuery
}
function toggleNotUndefined(){
document.querySelector('p').classList.toggle('not-undefined');
}
It can be found on JSFiddle.

function isHTML(str)
{
var a = document.createElement('div');
a.innerHTML = str;
for(var c= a.ChildNodes, i = c.length; i--)
{
if (c[i].nodeType == 1) return true;
}
return false;
}
Good Luck!

It depends on js-library which you use.
Html validatod for node.js https://www.npmjs.com/package/html-validator
Html validator for jQuery https://api.jquery.com/jquery.parsehtml/
But, as mentioned before, using the browser to validate broken HTML is a great idea:
function tidy(html) {
var d = document.createElement('div');
d.innerHTML = html;
return d.innerHTML;
}

Expanding on #Tarun's answer from above:
function validHTML(html) { // checks the validity of html, requires all tags and property-names to only use alphabetical characters and numbers (and hyphens, underscore for properties)
html = html.toLowerCase().replace(/(?<=<[^>]+?=\s*"[^"]*)[<>]/g,"").replace(/(?<=<[^>]+?=\s*'[^']*)[<>]/g,""); // remove all angle brackets from tag properties
html = html.replace(/<script.*?<\/script>/g, ''); // Remove all script-elements
html = html.replace(/<style.*?<\/style>/g, ''); // Remove all style elements tags
html = html.toLowerCase().replace(/<[^>]*\/\s?>/g, ''); // Remove all self closing tags
html = html.replace(/<(\!|br|hr|img).*?>/g, ''); // Remove all <br>, <hr>, and <img> tags
//var tags=[...str.matchAll(/<.*?>/g)]; this would allow for unclosed initial and final tag to pass parsing
html = html.replace(/^[^<>]+|[^<>]+$|(?<=>)[^<>]+(?=<)/gs,""); // remove all clean text nodes, note that < or > in text nodes will result in artefacts for which we check and return false
tags = html.split(/(?<=>)(?=<)/);
if (tags.length%2==1) {
console.log("uneven number of tags in "+html)
return false;
}
var tagno=0;
while (tags.length>0) {
if (tagno==tags.length) {
console.log("these tags are not closed: "+tags.slice(0,tagno).join());
return false;
}
if (tags[tagno].slice(0,2)=="</") {
if (tagno==0) {
console.log("this tag has not been opened: "+tags[0]);
return false;
}
var tagSearch=tags[tagno].match(/<\/\s*([\w\-\_]+)\s*>/);
if (tagSearch===null) {
console.log("could not identify closing tag "+tags[tagno]+" after "+tags.slice(0,tagno).join());
return false;
} else tags[tagno]=tagSearch[1];
if (tags[tagno]==tags[tagno-1]) {
tags.splice(tagno-1,2);
tagno--;
} else {
console.log("tag '"+tags[tagno]+"' trying to close these tags: "+tags.slice(0,tagno).join());
return false;
}
} else {
tags[tagno]=tags[tagno].replace(/(?<=<\s*[\w_\-]+)(\s+[\w\_\-]+(\s*=\s*(".*?"|'.*?'|[^\s\="'<>`]+))?)*/g,""); // remove all correct properties from tag
var tagSearch=tags[tagno].match(/<(\s*[\w\-\_]+)/);
if ((tagSearch===null) || (tags[tagno]!="<"+tagSearch[1]+">")) {
console.log("fragmented tag with the following remains: "+tags[tagno]);
return false;
}
var tagSearch=tags[tagno].match(/<\s*([\w\-\_]+)/);
if (tagSearch===null) {
console.log("could not identify opening tag "+tags[tagno]+" after "+tags.slice(0,tagno).join());
return false;
} else tags[tagno]=tagSearch[1];
tagno++;
}
}
return true;
}
This performs a few additional checks, such as testing whether tags match and whether properties would parse. As it does not depend on an existing DOM, it can be used in a server environment, but beware: it is slow. Also, in theory, tags can be names much more laxly, as you can basically use any unicode (with a few exceptions) in tag- and property-names. This would not pass my own sanity-check, however.

Related

JavaScript Regex To strip the html Tag with no attribute

I have an Html String like <font>New</font><font face="Akronim---Regular" color="#00ff00">Text</font>, With the use of JavaScript Regex i wanted to remove font tags with no attributes.
So the output of the above html string should be New<font face="Akronim---Regular" color="#00ff00">Text</font>
Below is the code, which helps me to strip all the font tags, But i am only require to strip font tags with no attribute.
var replace = new RegExp('<'+'font'+'[^><]*>|<.'+'font'+'[^><]*>','g')
var text = '<font>New</font><font face="Akronim---Regular" color="#00ff00">Text</font>';
console.log(text.replace(replace, ''))
Thanks in advance.
If you want to do it with a regex, you'll have to go tag by tag while keeping track of nesting levels so that you know when to remove a closing tag and when not.
To do so simply use an array that you constantly push/pop to/from it the type of the tag you encounter. When you encounter an opening tag, you push true if it has attributes and false if it doesn't, you then remove it if it doesn't have attributes. When you encounter a closing tag, you pop the type of the last encountered opening tag, if it had attributes (true), you skip to the next one, if it didn't have them (false) you remove it.
The regex should go over the opening and the closing tags in one run while giving us info about whether its a closing or an opening one and whether it has attributes or not. To do so we use regex like so <\/?font [^\s]*?>, we group (\/) and ([^\s]*?) because whether or not those groups get matched, we will know if it is a closing tag or not and if it has attributes or not respectively (for example, if we match the / then it's a closing tag). We add in the \s* to handle empty spaces and the resulting regex is /<(\/)?\s*font\s*([^\s]*?)\s*>/g.
Here is the function that does the job:
function stripEmptyFonts(htmlString) {
var tagTypes = [];
return htmlString.replace(/<(\/)?\s*font\s*([^\s]*?)\s*>/g, function(match, closingSlash, attributes) {
if(!closingSlash) { // if it is an opening tag (no '/' was matched)
tagTypes.push(!!attributes); // push true to the array tagTypes if it has attributes, otherwise push false (attributes will either be a string or null, we use the double negation !! to convert it to a boolean)
return attributes ? match : ""; // remove it if it has no attributes, otherwise keep it as is (read the docs of String#replace method)
} else { // if it is a closing tag (a '/' was matched)
return tagTypes.pop() ? match : ""; // if the last tag we encounterd had attributes (pop returned true) we skip this closing tag, otherwise (pop return false) we remove it
}
});
}
Example:
function stripEmptyFonts(htmlString) {
var tagTypes = [];
return htmlString.replace(/<(\/)?\s*font\s*([^\s]*?)\s*>/g, function(match, closingSlash, attributes) {
if(!closingSlash) {
tagTypes.push(!!attributes);
return attributes ? match : "";
} else {
return tagTypes.pop() ? match : "";
}
});
}
var html = `
<font>New</font>
<font color="red">Hello <font>world</font>!</font>
<font>Hello <font color="blue">back</font>!</font>
<font>ABCD<font>EFGH<font color="black">IJKL<font>MNOP<font color="red">QRST</font>UVWX</font>YZ</font>1234</font>5678</font>`
console.log(stripEmptyFonts(html));
Have a look at this. Do not use RegEx to manipulate HTML
The container could be created in memory if needed
On Node you can use https://www.npmjs.com/package/jsdom
document.querySelectorAll("font").forEach(f => {
const parent = f.parentNode;
if (f.attributes.length === 0) {
if (f.innerHTML === "") { // remove empty fonts - we could do this before too
parent.removeChild(f);
} else {
f.childNodes.forEach(child => parent.insertBefore(child.cloneNode(true), f));
parent.removeChild(f);
}
}
});
console.log(document.getElementById("container").innerHTML)
<div id="container">
<font>Hello
<font face="Akronim---Regular">world</font>
</font>
<font></font>
<font>New</font>
<font face="Akronim---Regular" color="#00ff00">Text</font>
</div>
if you have no access of DOM element then you can try this answer
<script>
let htmlString = '<font>Hello </font><font>New <font face="Akronim-Regular">world</font></font>';
let expoString = htmlString.split('<font>');
expoString = expoString.filter(function(el) {
return el != null && el != "";
});
for (let i = 0; i < expoString.length; i++) {
let startTag = expoString[i].split('<font').length - 1;
let endTag = expoString[i].split('</font>').length - 1;
for (let j = 1; j <= endTag - startTag; j++) {
expoString[i] = expoString[i].replace('</font>', '');
}
}
console.log(expoString.join('')); // here you can return string instead

Firefox Extension is Preventing Javascript Running

I'm writing a simple Firefox web extension, which will replace certain words (e.g. acronyms) with expanded versions. My code is as follows:
var replacements = [];
replacements["example1"] = "My First Example";
replacements["example2"] = "Second Example";
if(!window.location.href.startsWith('https://ignore.thissite.com/')){
for(key in replacements){
replaceOnDocument(new RegExp('\\b'+key+'\\b', 'gi'), '{{{REPLACE_'+key+'}}}');
document.body.innerHTML = document.body.innerHTML.replace(new RegExp('{{{REPLACE_'+key+'}}}', 'g'), '<abbr title="'+key+'">'+replacements[key]+'</abbr>');
}
}
function replaceOnDocument(pattern, string){
Array.from(document.querySelectorAll("body, body *:not(script):not(noscript):not(style):not(code):not(pre)"))
.forEach(someNode => Array.from(someNode.childNodes)
.filter(childNode => childNode.nodeType == 3)
.forEach(textNode => textNode.textContent = textNode.textContent.replace(pattern, string)));
}
This seems to replace all the instances as expected, but I've noticed that Javascript doesn't seem to run correctly on pages that the script has run on. I've been staring at the code for half an hour now and can't work out why that would be the case. To make things worse, the problem seems to be intermittent (although happens more often than not).
Any ideas as to why my code would prevent Javascript running as expected?
I already speculated that replacing the content of body.innerHTML may cause issues, so how about using the following approach for replacing text? codepen example
const walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
{
// filter / exclude tags
acceptNode: function (node) {
return node.parentElement.nodeName === "SCRIPT" ? NodeFilter.FILTER_SKIP : NodeFilter.FILTER_ACCEPT;
}
}
);
while (walker.nextNode()) {
// replace every H with #
walker.currentNode.nodeValue = walker.currentNode.nodeValue.replace("H", "#");
}
This iterates over every text node excluding script text nodes and replaces their content.

Check if input fields contains certain text

I'm trying to find if what the user typing in to an input field contain certain text - I've kinda got it works, but only working for an exact match as opposed to a partial match. If a user types anything before the text i'm matching, of course the code doesn't trigger.
What i need to do is check if the input contains #nyu.edu at all in the input field.
$('.email').keyup(function(){
if ($(".email").val() == "#nyu.edu") {
$("p.warning").css("visibility", "visible");
}
else if ($(".email").val() != "#nyu.edu") {
$("p.warning").css("visibility", "hidden");
}
});
Checking if a string contains a substring is pretty easily done by taking haystack.indexOf(needle) and checking against -1 (not found).
if ($(".email").val().indexOf("#nyu.edu") !== -1) {
// contains
} else {
// does not contain
}
There is a function in the ES6 draft which you may find more natural, called includes
You can add support for ES6's String.prototype.includes like this
if (!String.prototype.includes) {
String.prototype.includes = function (needle, pos) {
return this.indexOf(needle, pos || 0) !== -1;
};
}
"foobar".includes('foo'); // true
Working Ex:
http://codepen.io/anon/pen/XJKMjo
$('.email').keyup(function() {
var exp = /#nyu\.edu$/; // reg ex
var input = $(this).val(); // email input
var matches = exp.test(input); // boolean
// changes to hidden or visible
$("p.warning").css("visibility", (matches) ? "visible" : "hidden");
});
You can filter based on HTML elements' attribute contents with jQuery to either contain, start, or end with a string that you want. To match elements with an attribute that contain a string, you'd use [attr*="value"] in your jQuery selector, but I think that you want something where the end matches your string. Here's how that would work:
var elements = $(".email[value$='#nyu.edu']");
if(elements.length > 0) {
// Do something with the elements, such as removing an .error class
} else {
// Email doesn't end with #nyu.edu
}
Read more about the jQuery ends-with attribute selector or browse through other attribute selectors like this.
jsBin demo
var $warning = $("p.warning"); // P.S. make sure not to target a wrong .warning!
$('.email').on("input", function(){
$warning.toggle( /#nyu\.edu/ig.test(this.value) );
});
as I've mentioned .warning being a class can represent any first .warning occurrence. use rather some other selector like .next() or .closest(selector).find(".warning")
your boss fill fire you if your client loses $$$ cause you forgot to watch for copy-paste cases. ;) Kidding, use "input" event.
P.S use .warning{display:none;} instead of visibility

How to get first text node of a string while containing bold and italic tags?

String(s) is dynamic
It is originated from onclick event when user clicks anywhere in dom
if string(s)'s first part that is:
"login<b>user</b>account"
is enclosed in some element like this :
"<div>login<b>user</b>account</div>",
then I can get it with this:
alert($(s).find('*').andSelf().not('b,i').not(':empty').first().html());
// result is : login<b>user</b>account
But how can i get the same result in this condition when it is not enclosed in any element .i.e. when it is not enclosed in any element?
I tried this below code which works fine when first part do not include any <b></b> but it only gives "login" when it does include these tags.
var s = $.trim('login<b>user</b> account<tbody> <tr> <td class="translated">Lorem ipsum dummy text</td></tr><tr><td class="translated">This is a new paragraph</td></tr><tr><td class="translated"><b>Email</b></td></tr><tr><td><i>This is yet another text</i></td> </tr></tbody>');
if(s.substring(0, s.indexOf('<')) != ''){
alert(s.substring(0, s.indexOf('<')));
}
Note:
Suggest a generic solution that is not specific for this above string only. It should work for both the cases when there is bold tags and when there ain't any.
So it's just a b or a i, heh?
A recursive function is always the way to go. And this time, it's probably the best way to go.
var s = function getEm(elem) {
var ret = ''
// TextNode? Great!
if (elem.nodeType === 3) {
ret += elem.nodeValue;
}
else if (elem.nodeType === 1 &&
(elem.nodeName === 'B' || elem.nodeName === 'I')) {
// Element? And it's a B or an I? Get his kids!
ret += getEm(elem.firstChild);
}
// Ain't nobody got time fo' empty stuff.
if (elem.nextSibling) {
ret += getEm(elem.nextSibling);
}
return ret;
}(elem);
Jsfiddle demonstrating this: http://jsfiddle.net/Ralt/TZKsP/
PS: Parsing HTML with regex or custom tokenizer is bad and shouldn't be done.
You're trying to retrieve all of the text up to the first element that's not a <b> or <i>, but this text could be wrapped in an element itself. This is SUPER tricky. I feel like there's a better way to implement whatever it is you're trying to accomplish, but here's a solution that works.
function initialText(s){
var test = s.match(/(<.+?>)?.*?<(?!(b|\/|i))/);
var match = test[0];
var prefixed_element = test[1];
// if the string was prefixed with an element tag
// remove it (ie '<div> blah blah blah')
if(prefixed_element) match = match.slice(prefixed_element.length);
// remove the matching < and return the string
return match.slice(0,-1);
}
You're lucky I found this problem interesting and challenging because, again, this is ridiculous.
You're welcome ;-)
Try this:
if (s.substring(0, s.indexOf('<')) != '') {
alert(s.substring(0, s.indexOf('<tbody>')));
}

Convert HTML Character Entities back to regular text using javascript

the questions says it all :)
eg. we have >, we need > using only javascript
Update: It seems jquery is the easy way out. But, it would be nice to have a lightweight solution. More like a function which is capable to do this by itself.
You could do something like this:
String.prototype.decodeHTML = function() {
var map = {"gt":">" /* , … */};
return this.replace(/&(#(?:x[0-9a-f]+|\d+)|[a-z]+);?/gi, function($0, $1) {
if ($1[0] === "#") {
return String.fromCharCode($1[1].toLowerCase() === "x" ? parseInt($1.substr(2), 16) : parseInt($1.substr(1), 10));
} else {
return map.hasOwnProperty($1) ? map[$1] : $0;
}
});
};
function decodeEntities(s){
var str, temp= document.createElement('p');
temp.innerHTML= s;
str= temp.textContent || temp.innerText;
temp=null;
return str;
}
alert(decodeEntities('<'))
/* returned value: (String)
<
*/
I know there are libraries out there, but here are a couple of solutions for browsers. These work well when placing html entity data strings into human editable areas where you want the characters to be shown, such as textarea's or input[type=text].
I add this answer as I have to support older versions of IE and I feel that it wraps up a few days worth of research and testing. I hope somebody finds this useful.
First this is for more modern browsers using jQuery, Please note that this should NOT be used if you have to support versions of IE before 10 (7, 8, or 9) as it will strip out the newlines leaving you with just one long line of text.
if (!String.prototype.HTMLDecode) {
String.prototype.HTMLDecode = function () {
var str = this.toString(),
$decoderEl = $('<textarea />');
str = $decoderEl.html(str)
.text()
.replace(/<br((\/)|( \/))?>/gi, "\r\n");
$decoderEl.remove();
return str;
};
}
This next one is based on kennebec's work above, with some differences which are mostly for the sake of older IE versions. This does not require jQuery, but does still require a browser.
if (!String.prototype.HTMLDecode) {
String.prototype.HTMLDecode = function () {
var str = this.toString(),
//Create an element for decoding
decoderEl = document.createElement('p');
//Bail if empty, otherwise IE7 will return undefined when
//OR-ing the 2 empty strings from innerText and textContent
if (str.length == 0) {
return str;
}
//convert newlines to <br's> to save them
str = str.replace(/((\r\n)|(\r)|(\n))/gi, " <br/>");
decoderEl.innerHTML = str;
/*
We use innerText first as IE strips newlines out with textContent.
There is said to be a performance hit for this, but sometimes
correctness of data (keeping newlines) must take precedence.
*/
str = decoderEl.innerText || decoderEl.textContent;
//clean up the decoding element
decoderEl = null;
//replace back in the newlines
return str.replace(/<br((\/)|( \/))?>/gi, "\r\n");
};
}
/*
Usage:
var str = ">";
return str.HTMLDecode();
returned value:
(String) >
*/
Here is a "class" for decoding whole HTML document.
HTMLDecoder = {
tempElement: document.createElement('span'),
decode: function(html) {
var _self = this;
html.replace(/&(#(?:x[0-9a-f]+|\d+)|[a-z]+);/gi,
function(str) {
_self.tempElement.innerHTML= str;
str = _self.tempElement.textContent || _self.tempElement.innerText;
return str;
}
);
}
}
Note that I used Gumbo's regexp for catching entities but for fully valid HTML documents (or XHTML) you could simpy use /&[^;]+;/g.
There is nothing built in, but there are many libraries that have been written to do this.
Here is one.
And here one that is a jQuery plugin.

Categories

Resources