DOM navigation: eliminating the text nodes - javascript

I have a js script that reads and parses XML.
It obtains the XML from an XMLHttpRequest request (which contacts with a php script which returns XML).
The script is supposed to receive 2 or more nodes under the first parentNode.
The 2 nodes it requires have the name well defined, the other ones can be any name.
The output from the php may be:
<?xml version='1.0'?>
<things>
<carpet>
<id>1</id>
<name>1</name>
<desc>1.5</desc>
</carpet>
<carpet>
<id>2</id>
<name>2</name>
<height>unknown</height>
</carpet>
</things>
Here all carpets have 7 nodes.
but it also may be:
<?xml version='1.0'?>
<things>
<carpet>
<id>1</id>
<name>1</name>
<desc>1.5</desc>
</carpet>
<carpet><id>2</id><name>2</name><height>unknown</height></carpet>
</things>
Here the first carpet has 7 nodes, the 2nd carpet has 3 nodes.
I want my javascript code to treat both exactly the same way in a quick and clean way.
If possible, I'd like to remove all the text nodes between each tag. So a code like the one above would always be treated as:
<?xml version='1.0'?>
<things><carpet><id>1</id><name>1</name><desc>1.5</desc></carpet><carpet><id>2</id><name>2</name><height>unknown</height></carpet></things>
Is that possible in a quick and efficient way? I'd like not to use any get function (getElementsByTagName(), getElementById, ...), if possible and if more efficient.

It's pretty straightforward to walk the DOM and remove the nodes you consider empty (containing only whitespace).
This is untested (tested and fixed, live copy here), but it would look something like this (replace those magic numbers with symbols, obviously):
var reBlank = /^\s*$/;
function walk(node) {
var child, next;
switch (node.nodeType) {
case 3: // Text node
if (reBlank.test(node.nodeValue)) {
node.parentNode.removeChild(node);
}
break;
case 1: // Element node
case 9: // Document node
child = node.firstChild;
while (child) {
next = child.nextSibling;
walk(child);
child = next;
}
break;
}
}
walk(xmlDoc); // Where xmlDoc is your XML document instance
There my definition of "blank" is anything which only has whitespace according to the JavaScript interpreter's understanding of the \s (whitespace) RegExp class. Note that some implementations have issues with \s not being inclusive enough (several Unicode "blank" characters outside the ASCII range not being matched, etc.), so be sure to test with your sample data.

I would just try a very crude string replace: assuming you store this in a variable called xml:
var rex = /(\<(\/)?[A-Za-z0-9]+\>)(\s)+/gi;
var a = xml.replace( rex, "$1" );
here's the complete test I put together:
<html><head></head>
<body>
<script type="text/javascript">
var xml = "<?xml version='1.0'?>\n" +
"<things>\n" +
" <carpet>\n" +
" <id>1</id>\n" +
" <name>1</name>\n" +
" <desc>1.5</desc>\n" +
" </carpet>\n" +
" <carpet>\n" +
" <id>2</id>\n" +
" <name>2</name>\n" +
" <height>unknown</height>\n" +
" </carpet>\n" +
"</things>";
var rex = /(\<(\/)?[A-Za-z0-9]+\>)(\s)+/gi;
var a = xml.replace( rex, "$1" );
alert( a );
</script>
</body></html>

Related

regexp looping and logic in javascript

Not certain if this can be done in regexp under javascript, but thought it would be interesting to see if it is possible.
So thought I would clean up a piece of html to remove most tags, literally just dropping them, so <H1><img><a href ....>. And that would be relatively simple (well, stole the basis from another post, thanks karim79 Remove HTML Tags in Javascript with Regex).
function(inString, maxlength, callback){
console.log("Sting is " + inString)
console.log("Its " + inString.length)
var regex = /(<([^>]+)>)/ig
var outString = inString.replace(regex, "");
console.log("No HTML sting " + outString);
if ( outString.length < maxlength){
callback(outString)
} else {
console.log("Lets cut first bit")
}
}
But then I started thinking, is there a way where I can control regex execution. So lets say that I want to keep certain tabs, like b,br,i and maybe change H1-6 to b. So in pseudo code, something like:
for ( var i in inString.regex.hits ) {
if ( hits[i] == H1 ) {
hits[i] = b;
}
}
The issue is that I want the text thats not HTML tags to stay as it is, and I want it to just cut out by default. One option would of course be to change the ones I want to keep. Say change <b> to [[b]], once that is done to all the ones of interest. Then put them back to <b> once all unknown have been removed. So like this (only for b, and not certain the code below would work):
function(inString, maxlength, callback){
console.log("Sting is " + inString)
console.log("Its " + inString.length)
var regex-remHTML = /(<([^>]+)>)/ig
var regex-hideB = /(<b>)/ig
var regex-showB = /([b])/ig
var outString = inString.replace(regex-hideB, "[b]");
outString = outString.replace(regex-remHTML, "");
outString = outString.replace(regex-showB, "<b>");
console.log("No HTML sting " + outString);
if ( outString.length < maxlength){
callback(outString)
} else {
console.log("Lets cut first bit")
}
}
But would it be possible to be smarter, writing cod ethat says here is a peice of HTML tag, run this code against the match.
As Tim Biegeleisen sai in its comment, maybe a better solution could be using a parser instead of a Regex...
By the way, if you want to control what is going to be changed by the regex you can pass a callback to the String.prototype.replace:
var input = "<div><h1>CIAO Bello</h1></div>";
var output = input.replace(/(<([^>]+)>)/gi, (val) => {
if(val.indexOf("div") > -1) {
return "";
}
return val;
})
;
console.log("output", output);

Is there any generic function for subscripting?

I have a web page in which contents are loaded dynamically from json. Now i need to find the texts like so2,co2,h2o after the page gets loaded and have to apply subscript for those texts. Is it possible to do this?? If yes please let me know the more efficient way of achieving it.
for example :
var json = { chemA: "value of CO2 is", chemB: "value of H2O is" , chemC: "value in CTUe is"};
in the above json i need to change CO2,H2O and e in CTUe as subscript. how to achieve this??
Take a look at this JSfiddle which shows two approaches:
HTML-based using the <sub> tag
Pure Javascript-based by replacing the matched number with the subscript equivalent in unicode:
http://jsfiddle.net/7gzbjxz3/
var json = { chemA: "CO2", chemB: "H2O" };
var jsonTxt = JSON.stringify(json).replace(/(\d)+/g, function (x){
return String.fromCharCode(8320 + parseInt(x));
});
Option 2 has the advantage of being more portable since you're actually replacing the character. I.e., you can copy and paste the text into say notepad and still see the subscripts there.
The JSFiddle shows both approaches. Not sure why the magic number is 8320 when I was expecting it to be 2080...
So you are generating DOM element as per JSON data you are getting. So before displaying it to DOM you can check if that JSON data contains so2,co2,h2o and if it is then replace that with <sub> tag.
For ex:
var text = 'CO2';
text.replace(/(\d+)/g, "<sub>" + "$1" + "</sub>") ;
And this will returns something like this: "CO2".
As per JSON provided by you:
// Only working for integer right now
var json = { chemA: "value of CO2 is", chemB: "value of H2O is" , chemC: "value in CTUe is"};
$.each(json, function(index, value) {
json[index] = value.replace(/(\d+)/g, "<sub>" + "$1" + "</sub>");
});
console.log(json);
Hope this will helps!
To do this, I would create a prototype function extending String and name it .toSub(). Then, when you create your html from your json, call .toSub() on any value that might contain text that should be in subscript:
// here is the main function
String.prototype.toSub = function() {
var str=this;
var subs = [
['CO2','CO<sub>2</sub>'],
['H2O','H<sub>2O</sub>'],
['CTUe','CO<sub>e</sub>'] // add more here as needed.
];
for(var i=0;i<subs.length;i++){
var chk = subs[i][0];
var rep = subs[i][1];
var pattern = new RegExp('^'+chk+'([ .?!])|( )'+chk+'([ .?!])|( )'+chk+'[ .?!]?$','ig'); // makes a regex like this: /^CO2([ .?!])|( )CO2([ .?!])|( )CO2[ .?!]?$/gi using the surrent sub
// the "empty" capture groups above may seem pointless but they are not
// they allow you to capture the spaces easily so you dont have to deal with them some other way
rep = '$2$4'+rep+'$1$3'; // the $1 etc here are accessing the capture groups from the regex above
str = str.replace(pattern,rep);
}
return str;
};
// below is just for the demo
var json = { chemA: "value of CO2 is", chemB: "value of H2O is" , chemC: "value in CTUe is", chemD: "CO2 is awesome", chemE: "I like H2O!", chemF: "what is H2O?", chemG: "I have H2O. Do you?"};
$.each(json, function(k, v) {
$('#result').append('Key '+k+' = '+v.toSub()+'<br>');
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<div id="result"></div>
Note:
Anytime you do something like this with regex, you run the chance of unintentionally matching and converting some unwanted bit of text. However, this approach will have far fewer edge cases than searching and replacing text in your whole document as it is much more targeted.

Replace array-mapped variables with the actual variable name/string?

I am trying to edit a Greasemonkey/jQuery script. I can't post the link here.
The code is obfuscated and compressed with minify.
It starts like this:
var _0x21e9 = ["\x67\x65\x74\x4D\x6F\x6E\x74\x68", "\x67\x65\x74\x55\x54\x43\x44\x61\x74\x65", ...
After "decoding" it, I got this:
var _0x21e9=["getMonth","getUTCDate","getFullYear", ...
It is a huge list (500+ ). Then, it has some variables like this:
month = date[_0x21e9[0]](), day = date[_0x21e9[1]](), ...
_0x21e9[0] is getMonth, _0x21e9[1] is getUTCDate, etc.
Is it possible to replace the square brackets with the actual variable name? How?
I have little knowledge in javascript/jQuery and can not "read" the code the way it is right now.
I just want to use some functions from this huge script and remove the others I do not need.
Update: I tried using jsbeautifier.org as suggested here and in the duplicated question but nothing changed, except the "indent".
It did not replace the array variables with the decoded names.
For example:
jsbeautifier still gives: month = date[_0x21e9[0]]().
But I need: month = date["getMonth"]().
None of the online deobfuscators seem to do this, How can I?
Is there a way for me to share the code with someone, at least part of it? I read I can not post pastebin, or similar here. I can not post it the full code here.
Here is another part of the code:
$(_0x21e9[8] + vid)[_0x21e9[18]]();
[8] is "." and [18] is "remove". Manually replacing it gives a strange result.
I haven't seen any online deobfuscator that does this yet, but the principle is simple.
Construct a text filter that parses the "key" array and then replaces each instance that that array is referenced, with the appropriate array value.
For example, suppose you have a file, evil.js that looks like this (AFTER you have run it though jsbeautifier.org with the Detect packers and obfuscators? and the Unescape printable chars... options set):
var _0xf17f = ["(", ")", 'div', "createElement", "id", "log", "console"];
var _0x41dcx3 = eval(_0xf17f[0] + '{id: 3}' + _0xf17f[1]);
var _0x41dcx4 = document[_0xf17f[3]](_0xf17f[2]);
var _0x41dcx5 = _0x41dcx3[_0xf17f[4]];
window[_0xf17f[6]][_0xf17f[5]](_0x41dcx5);
In that case, the "key" variable would be _0xf17f and the "key" array would be ["(", ")", ...].
The filter process would look like this:
Extract the key name using text processing on the js file. Result: _0xf17f
Extract the string src of the key array. Result:
keyArrayStr = '["(", ")", \'div\', "createElement", "id", "log", "console"]';
In javascript, we can then use .replace() to parse the rest of the JS src. Like so:
var keyArrayStr = '["(", ")", \'div\', "createElement", "id", "log", "console"]';
var restOfSrc = "var _0x41dcx3 = eval(_0xf17f[0] + '{id: 3}' + _0xf17f[1]);\n"
+ "var _0x41dcx4 = document[_0xf17f[3]](_0xf17f[2]);\n"
+ "var _0x41dcx5 = _0x41dcx3[_0xf17f[4]];\n"
+ "window[_0xf17f[6]][_0xf17f[5]](_0x41dcx5);\n"
;
var keyArray = eval (keyArrayStr);
//-- Note that `_0xf17f` is the key name we already determined.
var keyRegExp = /_0xf17f\s*\[\s*(\d+)\s*\]/g;
var deObsTxt = restOfSrc.replace (keyRegExp, function (matchStr, p1Str) {
return '"' + keyArray[ parseInt(p1Str, 10) ] + '"';
} );
console.log (deObsTxt);
if you run that code, you get:
var _0x41dcx3 = eval("(" + '{id: 3}' + ")");
var _0x41dcx4 = document["createElement"]("div");
var _0x41dcx5 = _0x41dcx3["id"];
window["console"]["log"](_0x41dcx5);
-- which is a bit easier to read/understand.
I've also created an online page that takes JS source and does all 3 remapping steps in a slightly more automated and robust manner. You can see it at:
jsbin.com/hazevo
(Note that that tool expects the source to start with the "key" variable declaration, like your code samples do)
#Brock Adams solution is brilliant, but there is a small bug: it doesn't take into account simple quoted vars.
Example:
var _0xbd34 = ["hello ", '"my" world'];
(function($) {
alert(_0xbd34[0] + _0xbd34[1])
});
If you try to decipher this example, it will result on this:
alert("hello " + ""my" world")
To resolve this, just edit the replacedSrc.replace into #Brock code:
replacedSrc = replacedSrc.replace (nameRegex, function (matchStr, p1Str) {
var quote = keyArry[parseInt (p1Str, 10)].indexOf('"')==-1? '"' : "'";
return quote + keyArry[ parseInt (p1Str, 10) ] + quote;
} );
Here you have a patched version.
for (var i = 0; i < _0x21e9.length; i++) {
var funcName = _0x21e9[i];
_0x21e9[funcName] = funcName;
}
this will add all the function names as keys to the array. allowing you to do
date[_0x21e9["getMonth"]]()

eval javascript function IE6 taking long time

I have the below chunk of code. I've debugged through and located the snippet that is causing a long delay in IE6.
Basically the code loops through a document converting it to XML and sending to a PDF. On Ubuntu and Firefox 4 it takes 3 seconds. On IE it can take up to 40 seconds regularly.
/**
* This function builds up the XML to be saved to the DM.
*/
function getXMLToSave(){
var text="<workbook><sheet><name>Adv4New</name>";
//show_props(document.adv4.row10col1, "document.adv4.row10col1");
for(i=1;i<157;i++){
text = text + "<row number='" + i + "'>";
for(j=1;j<=7;j++){
text = text + "<col ";
//alert(eval('document.adv4.row'+i+'col'+j+'.readonly'));
try{
text = text + "number='" + j + "' label='" + eval('document.adv4.row'+i+'col'+j+'.className')+ "'";
}
catch (e) {
text = text + "number='" + j + "' label=''";
}
try {
if(eval('document.adv4.row'+i+'col'+j).readOnly)
text = text + " type='readonly'";
else
text = text + " type=''";
}
catch (e) {
text = text + " type=''";
}
try {
text = text + " color='" + eval('document.adv4.row'+i+'col'+j+'.style.color') + "'";
}
catch (e) {
text = text + " color=''";
}
text = text + ">";
try {
// don't wrap in a CDATA (like previously), but run cleanNode
// this fixes html entities
var content = eval('document.adv4.row'+i+'col'+j+'.value');
text = text + cleanNode(content);
}
catch (e) {
text = text + "0";
}
text = text + "</col>";
}
text = text + "</row>";
}
text = text + "</sheet></workbook>";
return text;
}
I believe its the eval function causing the delay in IE6. Is there a neat solution to fix this. Thanks very much
Why are you using eval in the firts place?
eval('document.adv4.row'+i+'col'+j+'.style.color')
Use bracket notation!
document.adv4["row"+i+"col"+j].style.color
You don't need eval() at all:
text = text + "number='" + j + "' label='" + document.adv4['row' + i + 'col' + j].className + "'";
Also, in IE6 (but not in newer browsers), building up large strings by repeatedly adding more content is really, really slow. It was way faster in that browser to build up strings by creating an array of substrings and then joining them all together when finished with all the pieces.
Don't use eval EVAL is EVIL. Having said that, you really shouldn't care about IE6: Even MS doesn't support it any longer, why should you bother?
Anyhow, change all eval calls like:
eval('document.adv4.row'+i+'col'+j+'.value');
to
document.adv4['row' + i + 'col' + j].value;
To access the elements directly. Remember that Nodes are objects, so their properties can be accessed either using the dot-notation (foo.bar) or the "associative array" notation: foo['bar'], the latter being very useful when you need the value of a variable to access properties
Don't use eval - period. The eval() should be renamed to evil(). There is almost no situation where you really need to use the eval function.
In this case you can use document.getElementById() to find a DOM node with a specific id.
It's likely that it's all the string concatentation that makes it slow. Each time you add something to the text, it will copy all the previous text into a new string.
Newer browsers have optimised code for this special case, so for them the impact is less.
Instead of concatenating strings like this:
text = text + "something";
use an array instead:
var text = [];
then add items to the array using the push method:
text.push("<workbook><sheet><name>Adv4New</name>");
Finally just join the strings together:
return text.join('');
One solution could be generating a color array (or maybe an object if you need it) and then using it.
But then, ask yourself the question "Should I really support IE6?"

Javascript to add cdata section on the fly?

I'm having trouble with special characters that exist in an xml node attribute. To combat this, I'm trying to render the attributes as child nodes and, where necessary, using cdata sections to get around the special characters. The problem is, I can't seem to get the cdata section appended to the node correctly.
I'm iterating over the source xml node's attributes and creating new nodes. If the attribute.name = "description" I want to put the attribute.text() in a cdata section and append the new node. That's where I jump the track.
// newXMLData is the new xml document that I've created in memory
for (var ctr =0;ctr< this.attributes.length;ctr++){ // iterate over the attributes
if( this.attributes[ctr].name =="Description"){ // if the attribute name is "Description" add a CDATA section
var thisNodeName = this.attributes[ctr].name;
newXMLDataNode.append("<"+thisNodeName +"></"+ thisNodeName +">" );
var cdata = newXMLData.createCDATASection('test'); // here's where it breaks.
} else {
// It's not "Description" so just append the new node.
newXMLDataNode.append("<"+ this.attributes[ctr].name +">" + $(this.attributes[ctr]).text() + "</"+ this.attributes[ctr].name +">" );
}
}
Any ideas? Is there another way to add a cdata section?
Here's a sample snippet of the source...
<row
pSiteID="4"
pSiteTile="Test Site Name "
pSiteURL="http://www.cnn.com"
ID="1"
Description="<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div>"
CreatedDate="2010-09-20 14:46:18"
Comments="Comments example.
" >
here's what I'm trying to create...
<Site>
<PSITEID>4</PSITEID>
<PSITETILE>Test Site Name</PSITETILE>
<PSITEURL>http://www.cnn.com</PSITEURL>
<ID>1</ID>
<DESCRIPTION><![CDATA[<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div ]]></DESCRIPTION>
<CREATEDDATE>2010-09-20 14:46:18</CREATEDDATE>
<COMMENTS><![CDATA[ Comments example.
]]></COMMENTS>
</Site>
I had the same issue. i was trying to append CDATA to xml nodes, so i thought its as easy as adding like so:
valueNode[0].text = "<![CDATA["+ tmpVal +"]]>";
//valueNode[0] represents "<value></value>"
This does not work because the whole thing will get interpreted as text therefore <(less than) and > (great than) will be replaced automatically.
what you need to do is use createCDATASection by doing the following:
var tmpCdata = $xmlDoc[0].createCDATASection(escape("muzi test 002"));
//i'm also escaping special charactures as well
valueNode[0].appendChild(tmpCdata);
results will be:
<value><![CDATA[muzi%20test%20002]]></value>
Brettz9 (in previous answer) explains how to do this but quite complex, therefore i just wanted to add my solution which is much simpler.
thanks,
Not sure of browser support for document.implementation.createDocument or createCDataSection, but this works in Mozilla at least:
<script>
// Define some helpers (not available IE < 9)
function parse (str) {
return new DOMParser().parseFromString(str, 'text/xml').documentElement;
}
function ser (dom) {
return new XMLSerializer().serializeToString(dom);
}
// Simulate your XML retrieval
var row = '<row pSiteID="4" pSiteTile="Test Site Name " pSiteURL="http://www.cnn.com" ID="1" Description="<div>blah blah blah since June 2007.&nbsp; T<br>&nbsp;<br>blah blah blah blah&nbsp; </div>" CreatedDate="2010-09-20 14:46:18" Comments="Comments example.
" />';
// Hack to convert source to well-formed XML, or otherwise you can't use DOM methods on it which
// depend on well-formed XML
row = row.replace(/(=\s*")([\s\S]*?)(")/g, function (n0, n1, n2, n3) {
return n1+ // Add back equal sign and opening quote
n2.replace(/</g, '<'). // Create well-formed XML by avoiding less-than signs inside attributes
replace(/&nbsp;/g, '&#160;')+ // HTML entities (except for gt, lt, amp, quot) must be either converted to numeric character references or your XML must define the same entities
n3; // Add back closing quote
});
// Simulate your retrieval of DOM attributes, though in this context, we're just making attributes into a global
this.attributes = parse(row).attributes;
// Simulate your creation of an XML document
var newXMLData = document.implementation.createDocument(null, 'Site', null);
// Modify your code to avoid jQuery dependency for easier testing and to
// avoid confusion (error?) of having two variables, newXMLData and newXMLDataNode
for (var ctr =0;ctr< this.attributes.length;ctr++){ // iterate over the attributes
if (this.attributes[ctr].name =="Description") { // if the attribute name is "Description" add a CDATA section
var thisNodeName = this.attributes[ctr].name;
var str = "<"+thisNodeName +"></"+ thisNodeName +">";
var node = parse(str);
var cdata = newXMLData.createCDATASection(this.attributes[ctr].textContent);
node.appendChild(cdata);
newXMLData.documentElement.appendChild(node);
}
else {
// It's not "Description" so just append the new node.
var str= "<"+ this.attributes[ctr].name +">" + this.attributes[ctr].textContent + "</"+ this.attributes[ctr].name +">";
newXMLData.documentElement.appendChild(parse(str));
}
}
// Prove its working (though you may wish to use toUpperCase() if you need the element names upper-cased);
// if you need CDATA for Comments, you can follow the pattern above to add support for that too
alert(ser(newXMLData));
</script>

Categories

Resources