Javascript to add cdata section on the fly? - javascript

I'm having trouble with special characters that exist in an xml node attribute. To combat this, I'm trying to render the attributes as child nodes and, where necessary, using cdata sections to get around the special characters. The problem is, I can't seem to get the cdata section appended to the node correctly.
I'm iterating over the source xml node's attributes and creating new nodes. If the attribute.name = "description" I want to put the attribute.text() in a cdata section and append the new node. That's where I jump the track.
// newXMLData is the new xml document that I've created in memory
// Walk every attribute of the current source node (`this`) and emit one child element per attribute.
// NOTE(review): newXMLDataNode is presumably a jQuery wrapper for a node inside newXMLData — confirm against the caller.
for (var ctr =0;ctr< this.attributes.length;ctr++){ // iterate over the attributes
if( this.attributes[ctr].name =="Description"){ // if the attribute name is "Description" add a CDATA section
var thisNodeName = this.attributes[ctr].name;
// Append an empty element named after the attribute; the CDATA section is meant to go inside it.
newXMLDataNode.append("<"+thisNodeName +"></"+ thisNodeName +">" );
// Note: the section is created with the placeholder text 'test' and is not appended to any node in this snippet.
var cdata = newXMLData.createCDATASection('test'); // here's where it breaks.
} else {
// It's not "Description" so just append the new node.
// The attribute text is concatenated into the markup string as-is (no escaping is applied here).
newXMLDataNode.append("<"+ this.attributes[ctr].name +">" + $(this.attributes[ctr]).text() + "</"+ this.attributes[ctr].name +">" );
}
}
Any ideas? Is there another way to add a cdata section?
Here's a sample snippet of the source...
<row
pSiteID="4"
pSiteTile="Test Site Name "
pSiteURL="http://www.cnn.com"
ID="1"
Description="<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div>"
CreatedDate="2010-09-20 14:46:18"
Comments="Comments example.
" >
here's what I'm trying to create...
<Site>
<PSITEID>4</PSITEID>
<PSITETILE>Test Site Name</PSITETILE>
<PSITEURL>http://www.cnn.com</PSITEURL>
<ID>1</ID>
<DESCRIPTION><![CDATA[<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div>]]></DESCRIPTION>
<CREATEDDATE>2010-09-20 14:46:18</CREATEDDATE>
<COMMENTS><![CDATA[ Comments example.
]]></COMMENTS>
</Site>

I had the same issue. i was trying to append CDATA to xml nodes, so i thought its as easy as adding like so:
valueNode[0].text = "<![CDATA["+ tmpVal +"]]>";
//valueNode[0] represents "<value></value>"
This does not work because the whole thing will get interpreted as text, so < (less than) and > (greater than) will be escaped automatically.
what you need to do is use createCDATASection by doing the following:
var tmpCdata = $xmlDoc[0].createCDATASection(escape("muzi test 002"));
//i'm also escaping special charactures as well
valueNode[0].appendChild(tmpCdata);
results will be:
<value><![CDATA[muzi%20test%20002]]></value>
Brettz9 (in the other answer) explains how to do this, but it is quite complex, so I just wanted to add my solution, which is much simpler.
thanks,

Not sure of browser support for document.implementation.createDocument or createCDataSection, but this works in Mozilla at least:
<script>
// Define some helpers (not available IE < 9)
/**
 * Parse a string as XML and hand back the root element of the parsed document.
 *
 * @param {string} str - XML source text.
 * @returns {Element} The `documentElement` of the document produced by DOMParser.
 */
function parse (str) {
  var xmlParser = new DOMParser();
  var xmlDocument = xmlParser.parseFromString(str, 'text/xml');
  return xmlDocument.documentElement;
}
/**
 * Serialize a DOM node (or document) back to its XML string form.
 *
 * @param {Node} dom - Node to serialize.
 * @returns {string} The markup produced by XMLSerializer.
 */
function ser (dom) {
  var serializer = new XMLSerializer();
  var markup = serializer.serializeToString(dom);
  return markup;
}
// Simulate your XML retrieval. The line break inside the Comments attribute is written as \n
// because a quoted JavaScript string literal cannot contain a raw line break.
var row = '<row pSiteID="4" pSiteTile="Test Site Name " pSiteURL="http://www.cnn.com" ID="1" Description="<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div>" CreatedDate="2010-09-20 14:46:18" Comments="Comments example.\n" />';
// Hack to convert source to well-formed XML, or otherwise you can't use DOM methods on it which
// depend on well-formed XML
row = row.replace(/(=\s*")([\s\S]*?)(")/g, function (n0, n1, n2, n3) {
  return n1 + // Add back equal sign and opening quote
    n2.replace(/</g, '&lt;'). // Create well-formed XML by escaping less-than signs inside attributes
      replace(/&nbsp;/g, '&#160;') + // HTML entities (except for gt, lt, amp, quot) must be either converted to numeric character references or your XML must define the same entities
    n3; // Add back closing quote
});
// Simulate your retrieval of DOM attributes, though in this context, we're just making attributes into a global
this.attributes = parse(row).attributes;
// Simulate your creation of an XML document
var newXMLData = document.implementation.createDocument(null, 'Site', null);
// Modify your code to avoid jQuery dependency for easier testing and to
// avoid confusion (error?) of having two variables, newXMLData and newXMLDataNode
for (var ctr = 0; ctr < this.attributes.length; ctr++) { // iterate over the attributes
  var attribute = this.attributes[ctr];
  // Build the element through the DOM instead of concatenating markup and re-parsing it:
  // attribute values containing "<" or "&" would otherwise produce ill-formed XML.
  var node = newXMLData.createElement(attribute.name);
  if (attribute.name == "Description") {
    // "Description" carries markup, so keep it verbatim inside a CDATA section.
    // Note: createCDATASection throws if the data itself contains "]]>".
    node.appendChild(newXMLData.createCDATASection(attribute.value));
  } else {
    // It's not "Description" so a plain text node is enough (the serializer escapes it).
    node.appendChild(newXMLData.createTextNode(attribute.value));
  }
  newXMLData.documentElement.appendChild(node);
}
// Prove its working (though you may wish to use toUpperCase() if you need the element names upper-cased);
// if you need CDATA for Comments, you can follow the pattern above to add support for that too
alert(ser(newXMLData));
</script>

Related

Removing invalid characters from XML before serializing it with XMLSerializer()

I'm trying to store user-input in an XML document on the client-side (javascript), and transmit that to the server for persistence.
One user, for example, pasted in text that included an STX character (0x2). The XMLSerializer did not escape the STX character, and therefore, did not serialize to well-formed XML. Or perhaps the .attr() call should have escaped the STX character, but in either case, invalid XML was produced.
I'm finding that the output of the in-browser XMLSerializer() isn't always well-formed (and doesn't even satisfy the browser's own DOMParser()).
This example shows that the STX character is not properly encoded by XMLSerializer():
> doc = $.parseXML('<?xml version="1.0" encoding="utf-8" ?>\n<elem></elem>');
#document
> $(doc).find("elem").attr("someattr", String.fromCharCode(0x2));
[ <elem someattr=​"">​</elem>​ ]
> serializedDoc = new XMLSerializer().serializeToString(doc);
"<?xml version="1.0" encoding="utf-8"?><elem someattr=""/></elem>"
> $.parseXML(serializedDoc);
Error: Invalid XML: <?xml version="1.0" encoding="utf-8"?><elem someattr=""/></elem>
How should I construct an XML document in-browser (with params determined by arbitrary user-input) such that it will always be well-formed (everything properly escaped)? I don't need to support IE8 or IE7.
(And yes, I do validate the XML on the server side, but if the browser hands the server a document that is not well-formed, the best the server can do is reject it, which isn't that helpful to the poor user)
Here's a function sanitizeStringForXML() which can either be used to cleanse strings before assignment, or a derivative function removeInvalidCharacters(xmlNode) which can be passed a DOM tree and will automatically sanitize attributes and textNodes so they are safe to store.
var stringWithSTX = "Bad" + String.fromCharCode(2) + "News";
var xmlNode = $("<myelem/>").attr("badattr", stringWithSTX);
var serializer = new XMLSerializer();
var invalidXML = serializer.serializeToString(xmlNode);
// Now cleanse it:
removeInvalidCharacters(xmlNode);
var validXML = serializer.serializeToString(xmlNode);
I based this on a list of characters from the non-restricted characters section of this wikipedia article, but the supplementary planes require 5-hex-digit unicode characters, and the Javascript regex does not include a syntax for this, so for now, I'm just stripping them out (you aren't missing too much...):
// Matches every character that must not be stored in an XML 1.0 document.
// Kept: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FDCF, U+FDE0-U+FFFD and the supplementary
// planes U+10000-U+10FFFF (emoji, hieroglyphs, ...), which XML 1.0 allows.
// The `u` flag makes the class work on whole code points, so a valid surrogate pair is
// treated as one supplementary character while a lone (unpaired) surrogate is still removed.
var NOT_SAFE_IN_XML_1_0 = /[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFDCF\uFDE0-\uFFFD\u{10000}-\u{10FFFF}]/gu;
/**
 * Remove every character that is not safe to store in an XML 1.0 document.
 *
 * @param {string} theString - Text to cleanse.
 * @returns {string} A copy of the text without control characters, lone surrogates
 *   or the other code points excluded by NOT_SAFE_IN_XML_1_0.
 */
function sanitizeStringForXML(theString) {
  "use strict";
  return theString.replace(NOT_SAFE_IN_XML_1_0, '');
}
/**
 * Recursively cleanse a DOM subtree in place: every non-empty attribute value and
 * every non-empty text node value is passed through sanitizeStringForXML.
 * Only element children (nodeType 1) are descended into; other node kinds are left alone.
 *
 * @param {Node} node - Root of the subtree to sanitize.
 */
function removeInvalidCharacters(node) {
  "use strict";
  var attrList = node.attributes;
  if (attrList) {
    for (var a = 0; a < attrList.length; a++) {
      var attr = attrList[a];
      if (attr.nodeValue) {
        attr.nodeValue = sanitizeStringForXML(attr.nodeValue);
      }
    }
  }
  var children = node.childNodes;
  if (!children) {
    return;
  }
  for (var c = 0; c < children.length; c++) {
    var child = children[c];
    if (child.nodeType == 1 /* ELEMENT_NODE */) {
      removeInvalidCharacters(child);
      continue;
    }
    if (child.nodeType == 3 /* TEXT_NODE */ && child.nodeValue) {
      child.nodeValue = sanitizeStringForXML(child.nodeValue);
    }
  }
}
Note that this only removes invalid characters from nodeValues of attributes and textNodes. It does not check tag names or attribute names, comments, etc etc.
Check
https://gist.github.com/john-doherty/b9195065884cdbfd2017a4756e6409cc,
very useful gist, example usage:
const resultXml = removeXMLInvalidChars(INPUT_XML_STRING, true);

How to get first text node of a string while containing bold and italic tags?

String(s) is dynamic
It is originated from onclick event when user clicks anywhere in dom
if string(s)'s first part that is:
"login<b>user</b>account"
is enclosed in some element like this :
"<div>login<b>user</b>account</div>",
then I can get it with this:
alert($(s).find('*').andSelf().not('b,i').not(':empty').first().html());
// result is : login<b>user</b>account
But how can I get the same result when the string is not enclosed in any element?
I tried this below code which works fine when first part do not include any <b></b> but it only gives "login" when it does include these tags.
var s = $.trim('login<b>user</b> account<tbody> <tr> <td class="translated">Lorem ipsum dummy text</td></tr><tr><td class="translated">This is a new paragraph</td></tr><tr><td class="translated"><b>Email</b></td></tr><tr><td><i>This is yet another text</i></td> </tr></tbody>');
if(s.substring(0, s.indexOf('<')) != ''){
alert(s.substring(0, s.indexOf('<')));
}
Note:
Suggest a generic solution that is not specific for this above string only. It should work for both the cases when there is bold tags and when there ain't any.
So it's just a b or a i, heh?
A recursive function is always the way to go. And this time, it's probably the best way to go.
// Collect the text of `elem` and of every sibling after it, descending only into
// <b> and <i> elements. The function is invoked immediately on the outer `elem`.
var s = function getEm(elem) {
  // A missing node contributes nothing. This covers an empty <b></b> / <i></i>,
  // whose firstChild is null, and would otherwise throw on `elem.nodeType`.
  if (!elem) {
    return '';
  }
  var ret = '';
  // TextNode? Great!
  if (elem.nodeType === 3) {
    ret += elem.nodeValue;
  }
  else if (elem.nodeType === 1 &&
      (elem.nodeName === 'B' || elem.nodeName === 'I')) {
    // Element? And it's a B or an I? Get his kids!
    ret += getEm(elem.firstChild);
  }
  // Ain't nobody got time fo' empty stuff.
  if (elem.nextSibling) {
    ret += getEm(elem.nextSibling);
  }
  return ret;
}(elem);
Jsfiddle demonstrating this: http://jsfiddle.net/Ralt/TZKsP/
PS: Parsing HTML with regex or custom tokenizer is bad and shouldn't be done.
You're trying to retrieve all of the text up to the first element that's not a <b> or <i>, but this text could be wrapped in an element itself. This is SUPER tricky. I feel like there's a better way to implement whatever it is you're trying to accomplish, but here's a solution that works.
/**
 * Return the leading part of an HTML string up to (not including) the first tag
 * that does not start with "b", "i" or "/". A single leading tag, if present, is
 * dropped from the result.
 *
 * @param {string} s - HTML fragment.
 * @returns {string} The leading text; the whole input when no terminating tag exists.
 */
function initialText(s){
  var test = s.match(/(<.+?>)?.*?<(?!(b|\/|i))/);
  // No terminating tag at all (plain text, or only <b>/<i> markup):
  // the entire string is the initial text. Previously this threw a TypeError.
  if (!test) return s;
  var match = test[0];
  var prefixed_element = test[1];
  // if the string was prefixed with an element tag
  // remove it (ie '<div> blah blah blah')
  if(prefixed_element) match = match.slice(prefixed_element.length);
  // remove the matching < and return the string
  return match.slice(0,-1);
}
You're lucky I found this problem interesting and challenging because, again, this is ridiculous.
You're welcome ;-)
Try this:
if (s.substring(0, s.indexOf('<')) != '') {
alert(s.substring(0, s.indexOf('<tbody>')));
}

Dojo Toolkit: how to escape an HTML string?

A user of my HTML 5 application can enter his name in a form, and this name will be displayed elsewhere. More specifically, it will become the innerHTML of some HTML element.
The problem is that this can be exploited if one enters valid HTML markup in the form, i.e. some sort of HTML injection, if you will.
The user's name is only stored and displayed on the client side so in the end the user himself is the only one who is affected, but it's still sloppy.
Is there a way to escape a string before I put it in an elements innerHTML in Dojo? I guess that Dojo at one point did in fact have such a function (dojo.string.escape()) but it doesn't exist in version 1.7.
Thanks.
dojox.html.entities.encode(myString);
Dojo has the module dojox/html/entities for HTML escaping. Unfortunately, the official documentation still provides only pre-1.7, non-AMD example.
Here is an example how to use that module with AMD:
var str = "<strong>some text</strong>"
require(['dojox/html/entities'], function(entities) {
var escaped = entities.encode(str)
console.log(escaped)
})
Output:
&lt;strong&gt;some text&lt;/strong&gt;
As of Dojo 1.10, the escape function is still part of the string module.
http://dojotoolkit.org/api/?qs=1.10/dojo/string
Here's how you can use it as a simple template system.
require([
'dojo/string'
], function(
string
){
var template = '<h1>${title}</h1>';
var message = {title: 'Hello World!<script>alert("Doing something naughty here...")</script>'}
var html = string.substitute(
template
, message
, string.escape
);
});
I tried to find out how other libraries implement this function and I stole the idea of the following from MooTools:
var property = (document.createElement('div').textContent == null) ? 'innerText': 'textContent';
elem[property] = "<" + "script" + ">" + "alert('a');" + "</" + "script" + ">";
So according to MooTools there is either the innerText or the textContent property which can escape HTML.
Check this example of dojo.replace:
require(["dojo/_base/lang"], function(lang){
  /**
   * Like lang.replace, but HTML-escapes every substituted value unless the
   * placeholder name starts with "!" (e.g. "{!raw}").
   *
   * @param {string} tmpl - Template containing {name} placeholders.
   * @param {Object|Array|Function} dict - Value source, or a function (match, name) returning the value.
   * @returns {string} The template with placeholders substituted.
   */
  function safeReplace(tmpl, dict){
    // convert dict to a function, if needed
    var fn = lang.isFunction(dict) ? dict : function(_, name){
      return lang.getObject(name, false, dict);
    };
    // perform the substitution
    return lang.replace(tmpl, function(_, name){
      if(name.charAt(0) == '!'){
        // no escaping
        return fn(_, name.slice(1));
      }
      // escape: "&" must be handled first so the entities added below are not escaped again
      return fn(_, name).
        replace(/&/g, "&amp;").
        replace(/</g, "&lt;").
        replace(/>/g, "&gt;").
        replace(/"/g, "&quot;");
    });
  }
  // that is how we use it:
  var output = safeReplace("<div>{0}</div>",
    ["<script>alert('Let\' break stuff!');</script>"]
  );
});
Source: http://dojotoolkit.org/reference-guide/1.7/dojo/replace.html#escaping-substitutions

Check if HTML snippet is valid with JavaScript

I need a reliable JavaScript library / function to check if an HTML snippet is valid that I can call from my code. For example, it should check that opened tags and quotation marks are closed, nesting is correct, etc.
I don't want the validation to fail because something is not 100% standard (but would work anyway).
Update: this answer is limited - please see the edit below.
Expanding on #kolink's answer, I use:
/**
 * Report whether the browser leaves an HTML string untouched when it is parsed
 * into a detached <div> and read back.
 *
 * @param {string} html - Markup to check.
 * @returns {boolean} true when the serialized result is identical to the input.
 */
var checkHTML = function(html) {
  var container = document.createElement('div');
  container.innerHTML = html;
  var roundTripped = container.innerHTML;
  return roundTripped === html;
}
I.e., we create a temporary div with the HTML. In order to do this, the browser will create a DOM tree based on the HTML string, which may involve closing tags etc.
Comparing the div's HTML contents with the original HTML will tell us if the browser needed to change anything.
checkHTML('<a>hell<b>o</b>')
Returns false.
checkHTML('<a>hell<b>o</b></a>')
Returns true.
Edit: As #Quentin notes below, this is excessively strict for a variety of reasons: browsers will often fix omitted closing tags, even if closing tags are optional for that tag. Eg:
<p>one para
<p>second para
...is considered valid (since Ps are allowed to omit closing tags) but checkHTML will return false. Browsers will also normalise tag cases, and alter white space. You should be aware of these limits when deciding to use this approach.
Well, this code:
/**
 * Let the browser repair a piece of markup: parse it into a detached <div>
 * and return what the browser serializes back.
 *
 * @param {string} html - Possibly malformed markup.
 * @returns {string} The markup as re-serialized by the browser.
 */
function tidy(html) {
  var scratch = document.createElement('div');
  scratch.innerHTML = html;
  var repaired = scratch.innerHTML;
  return repaired;
}
This will "correct" malformed HTML to the best of the browser's ability. If that's helpful to you, it's a lot easier than trying to validate HTML.
None of the solutions presented so far is doing a good job in answering the original question, especially when it comes to
I don't want the validation to fail because something is not 100%
standard (but would work anyways).
tldr >> check the JSFiddle
So I used the input of the answers and comments on this topic and created a method that does the following:
checks html string tag by tag if valid
tries to render html string
compares theoretically to be created tag count with actually rendered html dom tag count
if checked 'strict', <br/> and empty attribute normalizations ="" are not ignored
compares rendered innerHTML with given html string (while ignoring whitespaces and quotes)
Returns
true if rendered html is same as given html string
false if one of the checks fails
normalized html string if rendered html seems valid but is not equal to given html string
normalized means, that on rendering, the browser ignores or repairs sometimes specific parts of the input (like adding missing closing-tags for <p> and converts others (like single to double quotes or encoding of ampersands).
Making a distinction between "failed" and "normalized" allows to flag the content to the user as "this will not be rendered as you might expect it".
Most times normalized gives back an only slightly altered version of the original html string - still, sometimes the result is quite different. So this should be used e.g. to flag user-input for further review before saving it to a db or rendering it blindly. (see JSFiddle for examples of normalization)
The checks take the following exceptions into consideration
ignoring of normalization of single quotes to double quotes
image and other tags with a src attribute are 'disarmed' during rendering
(if non strict) ignoring of <br/> >> <br> conversion
(if non strict) ignoring of normalization of empty attributes (<p disabled> >> <p disabled="">)
encoding of initially un-encoded ampersands when reading .innerHTML, e.g. in attribute values
.
/**
 * Loosely validate an HTML snippet by checking each start tag, rendering the snippet
 * into a detached <div> and comparing the rendered result with the input.
 *
 * @param {*} htmlStr - Snippet to check; anything that is not a string yields false.
 * @param {boolean} [strictBoolean] - When truthy, <br/> -> <br> and attr -> attr=""
 *   normalizations count as differences.
 * @returns {boolean|string} true when the rendered HTML equals the input (ignoring case,
 *   quotes and whitespace runs), false when a check fails, otherwise the normalized
 *   HTML string as rendered by the browser.
 */
function simpleValidateHtmlStr(htmlStr, strictBoolean) {
  if (typeof htmlStr !== "string")
    return false;

  // A start tag is a run of quoted strings and characters other than quotes and ">".
  // Written as a regex literal (so escapes are not eaten by string parsing) and with
  // non-overlapping alternatives to avoid heavy backtracking on malformed tags.
  var validateHtmlTag = /<[a-z]+(?:"[^"]*"|'[^']*'|[^'">])*>/i,
    sdom = document.createElement('div'),
    noSrcNoAmpHtmlStr = htmlStr
      .replace(/ src=/g, " svhs___src=") // disarm every src attribute so nothing is loaded while rendering
      .replace(/&amp;/ig, "#svhs#amp##"), // 'save' already-encoded ampersands
    noSrcNoAmpIgnoreScriptContentHtmlStr = noSrcNoAmpHtmlStr
      .replace(/\n\r?/g, "#svhs#nl##") // temporarily remove line breaks so "." spans script bodies
      .replace(/(<script[^>]*>)(.*?)(<\/script>)/ig, "$1$3") // ignore script contents
      .replace(/#svhs#nl##/ig, "\n\r"), // re-add line breaks
    htmlTags = noSrcNoAmpIgnoreScriptContentHtmlStr.match(/<[a-z]+[^>]*>/ig), // get all start-tags
    htmlTagsCount = htmlTags ? htmlTags.length : 0,
    tagsAreValid, resHtmlStr, simpleIn, simpleOut;

  if (!strictBoolean) {
    // ignore <br/> conversions (all of them, not just the first)
    noSrcNoAmpHtmlStr = noSrcNoAmpHtmlStr.replace(/<br\s*\/>/g, "<br>");
  }

  if (htmlTagsCount) {
    tagsAreValid = htmlTags.every(function(tagStr) {
      return validateHtmlTag.test(tagStr);
    });
    if (!tagsAreValid) {
      return false;
    }
  }

  try {
    sdom.innerHTML = noSrcNoAmpHtmlStr;
  } catch (err) {
    return false;
  }

  // compare rendered tag-count with expected tag-count
  if (sdom.querySelectorAll("*").length !== htmlTagsCount) {
    return false;
  }

  // The browser encodes raw "&" as "&amp;" when serializing; undo that encoding.
  resHtmlStr = sdom.innerHTML.replace(/&amp;/ig, "&");

  if (!strictBoolean) {
    // ignore empty attribute normalizations (all of them)
    resHtmlStr = resHtmlStr.replace(/=""/g, "");
  }

  // compare html strings while ignoring case, quote-changes, trailing spaces
  simpleIn = noSrcNoAmpHtmlStr.replace(/["']/g, "").replace(/\s+/g, " ").toLowerCase().trim();
  simpleOut = resHtmlStr.replace(/["']/g, "").replace(/\s+/g, " ").toLowerCase().trim();
  if (simpleIn === simpleOut)
    return true;

  // Hand back the normalized markup with the placeholders restored.
  return resHtmlStr.replace(/ svhs___src=/ig, " src=").replace(/#svhs#amp##/g, "&amp;");
}
Here you can find it in a JSFiddle https://jsfiddle.net/abernh/twgj8bev/ , together with different test-cases, including
"<a href='blue.html id='green'>missing attribute quotes</a>" // FAIL
"<a>hell<B>o</B></a>" // PASS
'hell<b>o</b>' // PASS
'<a href=test.html>hell<b>o</b></a>', // PASS
"<a href='test.html'>hell<b>o</b></a>", // PASS
'<ul><li>hell</li><li>hell</li></ul>', // PASS
'<ul><li>hell<li>hell</ul>', // PASS
'<div ng-if="true && valid">ampersands in attributes</div>' // PASS
.
9 years later, how about using DOMParser?
It accepts string as parameter and returns Document type, just like HTML.
Thus, when it has an error, the returned document object has <parsererror> element in it.
If you parse your html as xml, at least you can check your html is xhtml compliant.
Example
> const parser = new DOMParser();
> const doc = parser.parseFromString('<div>Input: <input /></div>', 'text/xml');
> (doc.documentElement.querySelector('parsererror') || {}).innerText; // undefined
To wrap this as a function
/**
 * Check whether a snippet parses as well-formed XML (i.e. is XHTML-compliant).
 *
 * @param {string} html - Markup to parse with the 'text/xml' parser.
 * @returns {true|string} true when no parse error is reported, otherwise the text
 *   of the <parsererror> element.
 */
function isValidHTML(html) {
  const parser = new DOMParser();
  const doc = parser.parseFromString(html, 'text/xml');
  // Search from the document, not from documentElement: some browsers make
  // <parsererror> the root element itself, where a descendant search cannot find it.
  const errorNode = doc.querySelector('parsererror');
  if (!errorNode) {
    return true;
  }
  // innerText only exists on HTML elements; fall back to textContent otherwise.
  return errorNode.innerText == null ? errorNode.textContent : errorNode.innerText;
}
Testing the above function
// Sample calls; each trailing comment is the result reported by the answer's author.
isValidHTML('<a>hell<B>o</B></a>') // true
isValidHTML('hell') // true
isValidHTML("<a href='test.html'>hell</a>") // true
isValidHTML('<a href=test.html>hell</a>') // This page contains the following err..
isValidHTML('<ul><li>a</li><li>b</li></ul>') // true
isValidHTML('<ul><li>a<li>b</ul>') // This page contains the following err..
isValidHTML('<div><input /></div>') // true
isValidHTML('<div><input></div>') // This page contains the following err..
The above works for very simple html.
However if your html has some code-like texts; <script>, <style>, etc, you need to manipulate just for XML validation although it's valid HTML
The following updates code-like html to a valid XML syntax.
/**
 * Parse an HTML snippet as XML after neutralizing content that is valid HTML but
 * not valid XML, and report the parse error, if any.
 *
 * @param {string} html - Markup to check.
 * @returns {Element|undefined} The <parsererror> element when parsing failed
 *   (the numbered source is also logged via console.error), otherwise undefined.
 */
export function getHtmlError(html) {
  const parser = new DOMParser();
  const htmlForParser = `<xml>${html}</xml>`
    .replace(/(src|href)=".*?&.*?"/g, '$1="OMITTED"') // raw "&" in URLs is not well-formed XML
    .replace(/<script[\s\S]+?<\/script>/gm, '<script>OMITTED</script>')
    .replace(/<style[\s\S]+?<\/style>/gm, '<style>OMITTED</style>')
    .replace(/<pre[\s\S]+?<\/pre>/gm, '<pre>OMITTED</pre>')
    .replace(/&nbsp;/g, ' '); // &nbsp; is not a predefined XML entity

  const doc = parser.parseFromString(htmlForParser, 'text/xml');
  // Search from the document: some browsers make <parsererror> the root element,
  // where documentElement.querySelector cannot find it.
  const errorNode = doc.querySelector('parsererror');
  if (errorNode) {
    console.error(htmlForParser.split(/\n/).map( (el, ndx) => `${ndx+1}: ${el}`).join('\n'));
    return errorNode;
  }
}
/**
 * Rough well-formedness check: after discarding self-closing tags and
 * <br>/<hr>/<img> tags, the number of opening tags must equal the number of closing tags.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean} true when opening and closing tag counts match.
 */
function validHTML(html) {
  var withoutVoidTags = html
    .replace(/<[^>]*\/\s?>/g, '') // Remove all self closing tags
    .replace(/<(br|hr|img).*?>/g, ''); // Remove all <br>, <hr>, and <img> tags
  var openCount = (withoutVoidTags.match(/<[^\/].*?>/g) || []).length; // remaining opening tags
  var closeCount = (withoutVoidTags.match(/<\/.+?>/g) || []).length; // remaining closing tags
  return openCount === closeCount;
}
var htmlContent = "<p>your html content goes here</p>" // Note: String without any html tag will consider as valid html snippet. If it’s not valid in your case, in that case you can check opening tag count first.
if(validHTML(htmlContent)) {
alert('Valid HTML')
}
else {
alert('Invalid HTML');
}
Using pure JavaScript you may check if an element exists using the following function:
if (typeof(element) != 'undefined' && element != null)
Using the following code we can test this in action:
HTML:
<input type="button" value="Toggle .not-undefined" onclick="toggleNotUndefined()">
<input type="button" value="Check if .not-undefined exists" onclick="checkNotUndefined()">
<p class=".not-undefined"></p>
CSS:
p:after {
content: "Is 'undefined'";
color: blue;
}
p.not-undefined:after {
content: "Is not 'undefined'";
color: red;
}
JavaScript:
/**
 * Look up the first element matching '.not-undefined' and alert whether it exists.
 */
function checkNotUndefined(){
  var element = document.querySelector('.not-undefined');
  var found = typeof(element) != 'undefined' && element != null;
  var phrase = found ? "" : "not ";
  alert("Element of class '.not-undefined' does "+phrase+"exist!");
  // $(".thisClass").length checks to see if our elem exists in jQuery
}
/**
 * Flip the 'not-undefined' class on the first <p> element of the document.
 */
function toggleNotUndefined(){
  var paragraph = document.querySelector('p');
  paragraph.classList.toggle('not-undefined');
}
It can be found on JSFiddle.
/**
 * Tell whether a string produces at least one element node when parsed as HTML.
 *
 * @param {string} str - Text to test.
 * @returns {boolean} true when the parsed fragment has an element (nodeType 1)
 *   among its top-level nodes.
 */
function isHTML(str)
{
  var a = document.createElement('div');
  a.innerHTML = str;
  // The property is `childNodes` (lower-case c); the empty third clause is required
  // because the decrement happens in the loop condition.
  for (var c = a.childNodes, i = c.length; i--;)
  {
    if (c[i].nodeType == 1) return true;
  }
  return false;
}
Good Luck!
It depends on js-library which you use.
HTML validator for Node.js: https://www.npmjs.com/package/html-validator
Html validator for jQuery https://api.jquery.com/jquery.parsehtml/
But, as mentioned before, using the browser to validate broken HTML is a great idea:
/**
 * Round-trip markup through a detached <div> so the browser fixes what it can.
 *
 * @param {string} html - Possibly broken markup.
 * @returns {string} The browser's serialization of the parsed markup.
 */
function tidy(html) {
  var holder = document.createElement('div');
  holder.innerHTML = html;
  var corrected = holder.innerHTML;
  return corrected;
}
Expanding on #Tarun's answer from above:
/**
 * Check the validity of an HTML string without a DOM: tags must be properly nested
 * and closed, and every attribute must parse. Requires all tags and property names
 * to use only alphanumeric characters (plus hyphens and underscores).
 * Diagnostics are written with console.log.
 *
 * @param {string} html - Markup to validate.
 * @returns {boolean} true when every tag is opened, closed and nested correctly.
 */
function validHTML(html) {
  // Declared locally: `tags` was previously an implicit global, which throws a
  // ReferenceError in strict mode / ES modules and leaks state otherwise.
  var tags, tagSearch, tagno = 0;
  html = html.toLowerCase().replace(/(?<=<[^>]+?=\s*"[^"]*)[<>]/g,"").replace(/(?<=<[^>]+?=\s*'[^']*)[<>]/g,""); // remove all angle brackets from tag properties
  // The `s` flag lets "." cross line breaks so multi-line script/style bodies are removed too.
  html = html.replace(/<script.*?<\/script>/gs, ''); // Remove all script-elements
  html = html.replace(/<style.*?<\/style>/gs, ''); // Remove all style elements tags
  html = html.toLowerCase().replace(/<[^>]*\/\s?>/g, ''); // Remove all self closing tags
  html = html.replace(/<(\!|br|hr|img).*?>/g, ''); // Remove all <br>, <hr>, and <img> tags
  //var tags=[...str.matchAll(/<.*?>/g)]; this would allow for unclosed initial and final tag to pass parsing
  html = html.replace(/^[^<>]+|[^<>]+$|(?<=>)[^<>]+(?=<)/gs,""); // remove all clean text nodes, note that < or > in text nodes will result in artefacts for which we check and return false
  tags = html.split(/(?<=>)(?=<)/);
  if (tags.length%2==1) {
    console.log("uneven number of tags in "+html)
    return false;
  }
  // `tags` doubles as a stack: entries before `tagno` are the names of currently open tags.
  while (tags.length>0) {
    if (tagno==tags.length) {
      console.log("these tags are not closed: "+tags.slice(0,tagno).join());
      return false;
    }
    if (tags[tagno].slice(0,2)=="</") {
      if (tagno==0) {
        console.log("this tag has not been opened: "+tags[0]);
        return false;
      }
      tagSearch=tags[tagno].match(/<\/\s*([\w\-\_]+)\s*>/);
      if (tagSearch===null) {
        console.log("could not identify closing tag "+tags[tagno]+" after "+tags.slice(0,tagno).join());
        return false;
      } else tags[tagno]=tagSearch[1];
      if (tags[tagno]==tags[tagno-1]) {
        // Matching pair: drop both the open tag and this closing tag.
        tags.splice(tagno-1,2);
        tagno--;
      } else {
        console.log("tag '"+tags[tagno]+"' trying to close these tags: "+tags.slice(0,tagno).join());
        return false;
      }
    } else {
      tags[tagno]=tags[tagno].replace(/(?<=<\s*[\w_\-]+)(\s+[\w\_\-]+(\s*=\s*(".*?"|'.*?'|[^\s\="'<>`]+))?)*/g,""); // remove all correct properties from tag
      tagSearch=tags[tagno].match(/<(\s*[\w\-\_]+)/);
      if ((tagSearch===null) || (tags[tagno]!="<"+tagSearch[1]+">")) {
        console.log("fragmented tag with the following remains: "+tags[tagno]);
        return false;
      }
      tagSearch=tags[tagno].match(/<\s*([\w\-\_]+)/);
      if (tagSearch===null) {
        console.log("could not identify opening tag "+tags[tagno]+" after "+tags.slice(0,tagno).join());
        return false;
      } else tags[tagno]=tagSearch[1];
      tagno++;
    }
  }
  return true;
}
This performs a few additional checks, such as testing whether tags match and whether properties would parse. As it does not depend on an existing DOM, it can be used in a server environment, but beware: it is slow. Also, in theory, tags can be names much more laxly, as you can basically use any unicode (with a few exceptions) in tag- and property-names. This would not pass my own sanity-check, however.

DOM navigation: eliminating the text nodes

I have a js script that reads and parses XML.
It obtains the XML from an XMLHttpRequest request (which contacts with a php script which returns XML).
The script is supposed to receive 2 or more nodes under the first parentNode.
The 2 nodes it requires have the name well defined, the other ones can be any name.
The output from the php may be:
<?xml version='1.0'?>
<things>
<carpet>
<id>1</id>
<name>1</name>
<desc>1.5</desc>
</carpet>
<carpet>
<id>2</id>
<name>2</name>
<height>unknown</height>
</carpet>
</things>
Here all carpets have 7 nodes.
but it also may be:
<?xml version='1.0'?>
<things>
<carpet>
<id>1</id>
<name>1</name>
<desc>1.5</desc>
</carpet>
<carpet><id>2</id><name>2</name><height>unknown</height></carpet>
</things>
Here the first carpet has 7 nodes, the 2nd carpet has 3 nodes.
I want my javascript code to treat both exactly the same way in a quick and clean way.
If possible, I'd like to remove all the text nodes between each tag. So a code like the one above would always be treated as:
<?xml version='1.0'?>
<things><carpet><id>1</id><name>1</name><desc>1.5</desc></carpet><carpet><id>2</id><name>2</name><height>unknown</height></carpet></things>
Is that possible in a quick and efficient way? I'd like not to use any get function (getElementsByTagName(), getElementById, ...), if possible and if more efficient.
It's pretty straightforward to walk the DOM and remove the nodes you consider empty (containing only whitespace).
This is untested (tested and fixed, live copy here), but it would look something like this (replace those magic numbers with symbols, obviously):
// A text node counts as blank when it is empty or holds only whitespace.
var reBlank = /^\s*$/;
/**
 * Depth-first traversal that removes every blank text node below `node`.
 * Text nodes (3) are tested and possibly removed; element (1) and document (9)
 * nodes are descended into; all other node types are ignored.
 *
 * @param {Node} node - Node to start from.
 */
function walk(node) {
  var type = node.nodeType;
  if (type === 3) { // Text node
    if (reBlank.test(node.nodeValue)) {
      node.parentNode.removeChild(node);
    }
    return;
  }
  if (type !== 1 && type !== 9) { // neither Element nor Document node
    return;
  }
  // Remember the following sibling before recursing: the current child may be removed.
  for (var current = node.firstChild, following; current; current = following) {
    following = current.nextSibling;
    walk(current);
  }
}
walk(xmlDoc); // Where xmlDoc is your XML document instance
There my definition of "blank" is anything which only has whitespace according to the JavaScript interpreter's understanding of the \s (whitespace) RegExp class. Note that some implementations have issues with \s not being inclusive enough (several Unicode "blank" characters outside the ASCII range not being matched, etc.), so be sure to test with your sample data.
I would just try a very crude string replace: assuming you store this in a variable called xml:
var rex = /(\<(\/)?[A-Za-z0-9]+\>)(\s)+/gi;
var a = xml.replace( rex, "$1" );
here's the complete test I put together:
<html><head></head>
<body>
<script type="text/javascript">
var xml = "<?xml version='1.0'?>\n" +
"<things>\n" +
" <carpet>\n" +
" <id>1</id>\n" +
" <name>1</name>\n" +
" <desc>1.5</desc>\n" +
" </carpet>\n" +
" <carpet>\n" +
" <id>2</id>\n" +
" <name>2</name>\n" +
" <height>unknown</height>\n" +
" </carpet>\n" +
"</things>";
var rex = /(\<(\/)?[A-Za-z0-9]+\>)(\s)+/gi;
var a = xml.replace( rex, "$1" );
alert( a );
</script>
</body></html>

Categories

Resources