syntax highlighting design - javascript

I'm writing my own syntax highlighter in javascript for fun and see a couple of approaches but they both have pros and some pretty serious cons that I can't get around. What do you guys think about these approaches and are there better methods that I'm missing?
Assumption
Code to highlight exists in a single string.
Approaches
Treat code in it's string form and use regular expressions to find patterns.
Pros
Simple to define and search for patterns
Cons
Hard to disregard keywords inside of quotes or comments
Split the string by spaces and linebreaks and loop over the array.
Pros
Easy to keep track of scope
Cons
Hard to keep track of spaces and linebreaks after the split
EDIT: Lexical Analysis
So, if I understand it, using Lexical Analysis you break the string into tokens. This somehow sounds a lot like approach number 2? How do you approach reassembling the tokens into the original string?

Note: This uses jQuery. It can pretty well be rewritten to work with straight javascript if you want.
I actually wrote a little plugin for fun that does this:
(function($) {
$.fn.codeBlock = function(blockComment) {
// Setup keyword regex
var keywords = /(abstract|boolean|break|byte|case|catch|char|class|const|continue|debugger|default|delete|do|double|else|enum|export|extends|final|finally|float|for|function|goto|if|implements|import|in|instanceof|int|interface|long|native|new|package|private|protected|public|return|short|static|super|switch|synchronized|this|throw|throws|transient|try|typeof|var|void|volatile|while|with|true|false|prototype)(?!\w|=)/gi;
// Booleans to toggle comment, regex, quote exclusions
var comment = false;
var quote = false;
var regex = false;
/* Array used to store values of regular expressions, quotes, etc.
so they can be used to ID locations to be skipped durring keyword
regexing.
*/
var locator = new Array();
var locatorIndex = 0;
if (blockComment) locator[locatorIndex++] = 0;
var text = $(this).html();
var continuation;
var numerals = /[0-9]/;
var arr = ($(this).html()).split("");
var outhtml = "";
for (key in arr) {
// Assign three variables common 'lookup' values for faster aquisition
var keyd = key;
var val = arr[keyd];
var nVal = arr[keyd - 1];
var pVal = arr[++keyd];
if ((val == "\"" || val == "'") && nVal != "\\") {
if (quote == false) {
quote = true;
outhtml += val;
}
else {
outhtml += val;
quote = false;
}
locator[locatorIndex++] = parseInt(key);
}
else if (numerals.test(val) && quote == false && blockComment == false && regex == false) {
outhtml += '<span class="num">' + val + '</span>';
}
else if (val == "/" && nVal != "<") {
var keys = key;
if (pVal == "/") {
comment = true;
continuation = key;
break;
}
else if (pVal == "*") {
outhtml += "/";
blockComment = true;
locator[locatorIndex++] = parseInt(key);
}
else if (nVal == "*") {
outhtml += "/";
blockComment = false;
locator[locatorIndex++] = parseInt(key);
}
else if (pVal == "[" && regex == false) {
outhtml += "<span class='res'>/";
regex = true;
}
else {
outhtml += "/";
}
}
else if (val == "," || val == ";" && regex == true) {
outhtml += "</span>" + val;
regex = false;
}
else {
outhtml += val;
}
}
if (comment == true) {
outhtml = outhtml.replace(keywords, "<span class='res'>$1</span>");
outhtml += '<span class="com">';
outhtml += text.substring(continuation, text.length);
outhtml += '</span>';
}
else {
if ((locator.length % 2) != 0) locator[locator.length] = (text.length - 1);
if (locator.length != 0) {
text = outhtml;
outhtml = text.substring(0, locator[0]).replace(keywords, "<span class=\"res\">$1</span>");
for (var i = 0; i < locator.length;) {
qTest = text.substring(locator[i], locator[i] + 1);
if (qTest == "'" || qTest == "\"") outhtml += "<span class=\"quo\">";
else outhtml += "<span class=\"com\">";
outhtml += text.substring(locator[i], locator[++i] + 1) + "</span>";
outhtml += text.substring(locator[i] + 1, locator[++i]).replace(keywords, "<span class=\"res\">$1</span>");
}
}
else {
outhtml = outhtml.replace(keywords, "<span class=\"res\">$1</span>");
}
}
text = outhtml;
$(this).html(text);
return blockComment;
}
})(jQuery);
I'm not going to claim it is the most efficient way of doing this or the best but it does work. There are still probably a few bugs in there I haven't ID'd yet (and 1 I know about but haven't gotten around to fixing) but this should give you an idea of how you could go about this if you like.
My suggested implementation of this is to create a textarea or something and have the plugin run when you click a button or something (as far as testing it goes that is a decent idea) and of course you can set the text in the textarea to some starting code to make sure it works (Tip: You can put tags in between the the <textarea> tag and it will render as text, not HTML).
Also, blockComment is a boolean, make sure to pass false because true will trigger the block quoting. If you decided to parse something line by line, like:
<a>code</a>
<a>some more code</a>
Do something like:
blockComment = false;
$("a").each(function() {
blockComment = $(this).codeBlock(blockComment);
});

Related

Compare two string in js and find difference

how to compare two strings purely, and provide specific result such as highlight extra word, wrong word & skip word in 2nd string. for eg.
var x = "This is the first original string in javascript language." </br>
var y = "This is not the first org string in language."
diff = wrong word ="org"<br>
Extra word ="not"<br>
skip word ="javascript"
//here is slice of my code but in some case my program fails
var x = "here is some value of string";
var y = "here is the some val string";
var k=0;
var SkipWrd="";
for(var i=0; i<y.length;i++){
var sktmp="";
var swtmp=0;
for(var j=0; j<=2;j++) {
if(x[k]!="undefined"){
if(y[i]==x[k+j]){
SkipWrd+=sktmp;
skip+=swtmp;
H_wrd += typ_wrd[i]+" ";
org_para+= sktmp+x[k+j]+" ";
k+=j+1;
break;
}
else{
sktmp+= "<mark>"+ x[k+j]+" "+ "</mark>";
swtmp++;
if(j==2 && y[i]!=x[k+j]){
err++;
Err_Wrd+=y[i]+" ";
H_wrd += "<span id='H-Err'>" + typ_wrd[i] + "</span> ";
org_para+="<span id='O-Err'>" +x[k]+" "+ "</span> ";
k++;
}
}
}
}
}
I have used a word by word comparison approach rather than the character by character approach you used. The overall logic is similar.
The below code will work for your example, but there are many more cases that might go wrong.
var x = "This is the first original string in javascript language.";
var y = "This is not the first org string in language.";
x=x.split(' ');
y=y.split(' ');
var i=0,j=0;
while (1) {
if (!x[i] || !y[j]) break;
if (x[i] == y[j]) {
i++;
j++;
continue;
}
if (x[i] == y[j+1]) {
console.log('Extra word : ', y[j]);
i++;
j+=2;
continue;
}
if (x[i+1] == y[j]) {
console.log('Skip word: ', x[i]);
i+=2;
j++;
continue;
}
if (x[i+1] == y[j+1]) {
console.log('Wrong word: ', y[j]);
i++;
j++;
continue;
}
}

I want to remove double quotes inside a JSON string through regular expressions in javascript

I have a JSON something like this
{
"DocumentID": 28663,
"DocumentName": " Have a "HAPPY" and safe journey ",
}
I am getting error
Unexpected token H in JSON at position 315784
So I have to remove double quotes surrounding HAPPY.
How do I do this?
Since you are using C# to encode. From the MSDN website:
Json.NET should be used serialization and deserialization. Provides
serialization and deserialization functionality for AJAX-enabled
applications.
http://www.newtonsoft.com/json
Using that library should guarantee you getting a valid JSON string instead of an invalid JavaScript serialization.
" Have a "HAPPY" and safe journey " is not a correct string, you should write something like this:
" Have a 'HAPPY' and safe journey " or ' Have a "HAPPY" and safe journey '
Obviously you can solve this without reg ex. Most probably you're parsing the JSON wrong way.
But still anyway here's the code to replace double quote with single in json string
function passJSON_ItWillReplaceDoubleQuoteWithSingle(str){
var firstDoubleQuoteFound = false;
var escapeNextDouble = false;
var newStr = "";
for (var i = 0; i<str.length; i++){
var s = str.charAt(i);
if(s == '"' && !firstDoubleQuoteFound){
firstDoubleQuoteFound = true;
newStr += '"';
} else if(s == '"' && firstDoubleQuoteFound && !escapeNextDouble){
if( isNextNonSpaceAcceptable(str, i+1) ){
newStr += s;
firstDoubleQuoteFound = false;
} else {
escapeNextDouble = true;
newStr += '\'';
}
} else if(s == '"' && firstDoubleQuoteFound && escapeNextDouble){
newStr += '\'';
escapeNextDouble = false;
} else{
newStr += s;
}
}
return newStr;
}
function doubleQuoteReplaced(str){
return str.replace(/"/g, "'");
}
function isNextNonSpaceAcceptable(str, i){
for(var j=i ; j<str.length ; j++){
var s = str.charAt(j);
if( s==' ' || s=='\t')
continue;
if( s == '}' || s==',' || s == ':')
return true;
else
return false;
}
}
and here's how I checked
var str = "{\"some_key\" : \"some \"Happy\" people\", \"next_key\":\"Z\"}";
console.log(passJSON_ItWillReplaceDoubleQuoteWithSingle(str));
It will fail when there's comma or '}' or ':' just after double quote

How to remove space from RiTa.js random word generator?

I'm working on a word generator inspired by Daniel Shiffman's demo of the RiTa library. Right now, the code adds a space between all words and punctuation using the line:
output += " ";
I have been trying to figure out how to change the code so that space does not appear between punctuation (such as periods) and words. I think the simplest way to do this would be to use an if/else statement that leaves punctuation unaltered but adds space to words, but I am having a hard time figuring out what functions from the Rita library to use for this, as well as the syntax.
Any ideas? Here's my code right now:
var input;
var button;
var lexicon;
function setup() {
noCanvas();
lexicon = new RiLexicon();
input = createInput('As I say a noun is the name of a thing.');
button = createButton('submit');
input.changed(processRita);
button.mousePressed(processRita);
input.size(400);
}
function processRita() {
var s = input.value();
var rs = new RiString(s);
var words = rs.words();
var pos = rs.pos();
console.log(words);
console.log(pos);
var output = '';
for (var i = 0; i < words.length; i++) {
if (/nn.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
if(alliterations.length == 0){
output+=words[i];
}else{
output += alliterations[Math.floor(Math.random() * alliterations.length)];
}
//console.log("noun");
//console.log(alliterations.length);
} else if (/jj.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("adjective");
} else if (/vb/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("verbs");
}
else {
//console.log(words[i]);
output += words[i];
} {
output += " ";
}
}
createP(output);
}
Why do you need a library for this? Can't you just use the regular String functions to test whether a String is a punctuation mark?
You could just use a regular expression to test whether a String matches a punctuation character. Or just use a series of equality checks against each punctuation mark you care about.
You might also check out the startsWith() function and the endsWith() function.
after much trial and error, I had help from a coding professor who helped me to solve this problem, which was more complicated than I originally anticipated. In order to get this code to work, we added this bit toward the beginning of the for loop:
if(words[i] == "." || words[i] == "," || words[i] == "?" || words[i] == "!"){
output += words[i];
}else{
output += " ";
So the entire code now looks like this:
for (var i = 0; i < words.length; i++) {
if(words[i] == "." || words[i] == "," || words[i] == "?" || words[i] == "!"){
output += words[i];
}else{
output += " ";
if (/nn.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
if(alliterations.length == 0){
output+=words[i];
}else{
output += alliterations[Math.floor(Math.random() * alliterations.length)];
}
//console.log("noun");
//console.log(alliterations.length);
} else if (/jj.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("adjective");
} else if (/vb/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("verbs");
}
else {
//console.log(words[i]);
output += words[i];
}
}
}
createP(output);
}
Its much simpler if you use the RiTa library functions:
function processRita() {
var all, output = [],
words = RiTa.tokenize(input.value()),
pos = RiTa.getPosTags(words);
for (var i = 0; i < words.length; i++) {
if (/[nn|kk|vb|jj].*/.test(pos[i]) && (all = lexicon.alliterations(words[i])).length) {
output.push(RiTa.randomItem(all));
} else {
output.push(words[i]);
}
}
createP(RiTa.untokenize(output));
}

Editable iframe without allowing JavaScript to run

I have an iframe:
<iframe id="msgContainer" sandbox="allow-same-origin"></iframe>
and I'd like to insert HTML into it, but not let any JavaScript that may be contained in the HTML run (this includes both <script> tags and on* attributes. I know how to insert HTML (just use document.getElementById('msgContainer').contentDocument.body.innerHTML=myHTML but I'd like to prevent any JS in myHTML from running. The way I've tried to do this is by using the sandbox attribute and only allowing same-origin, but JS still runs. Is there any way to do this?
Thanks
I couldn't find any answer other than to parse out the JS from an html string inserted into the iframe. Here's my code (if it helps anyone else):
/** Removes javascript from html string
* html: the string to be cleaned
*/
function clean(html) {
function stripHTML(){
html = html.slice(0, strip) + html.slice(j);
j = strip;
strip = false;
}
var strip = false,
lastQuote = false,
tag = false;
const prefix = "ANYTHING",
sandbox = " sandbox=''";
for(var i=0; i<html.length; i++){
if(html[i] === "<" && html[i+1] && isValidTagChar(html[i+1])) {
i++;
tag = false;
/* Enter element */
for(var j=i; j<html.length; j++){
if(!lastQuote && html[j] === ">"){
if(strip) {
stripHTML();
}
/* sandbox iframes */
if(tag === "iframe"){
var index = html.slice(i, j).toLowerCase().indexOf("sandbox");
if(index > 0) {
html = html.slice(0, i+index) + prefix + html.slice(i+index);
j += prefix.length;
}
html = html.slice(0, j) + sandbox + html.slice(j);
j += sandbox.length;
}
i = j;
break;
}
if(!tag && html[j] === " "){
tag = html.slice(i, j).toLowerCase();
}
if(lastQuote === html[j]){
lastQuote = false;
continue;
}
if(!lastQuote && html[j-1] === "=" && (html[j] === "'" || html[j] === '"')){
lastQuote = html[j];
}
/* Find on statements */
if(!lastQuote && html[j-2] === " " && html[j-1] === "o" && html[j] === "n"){
strip = j-2;
}
if(strip && html[j] === " " && !lastQuote){
stripHTML();
}
}
}
}
html = stripScripts(html);
return html;
}
/** Returns whether or not the character is a valid first character in a tag
* str: the first character
*/
function isValidTagChar(str) {
return str.match(/[a-z?\\\/!]/i);
}
/** Strips scripts from a string of html
* html: the string of html to be stripped
*/
// NOTE: <script> tags won't run in this context
function stripScripts(html) {
var div = document.createElement('div');
div.innerHTML = html;
var scripts = div.getElementsByTagName('script');
var i = scripts.length;
while (i--) {
scripts[i].parentNode.removeChild(scripts[i]);
}
return div.innerHTML;
}

Differences between two regex JavaScript bookmarklets

I'm having some weird issues "improving" a bookmarklet.
I took this example from here - it takes a regular expression and highlights text on the page matching the expression - I've reformatted it for easy reading using JSMin for Notepad++:
javascript : (function () {
var count = 0,
text,
regexp;
text = prompt("Search regexp:", "");
if (text == null || text.length == 0)
return;
try {
regexp = new RegExp("(" + text + ")", "i");
} catch (er) {
alert("Unable to create regular expression using text '" + text + "'.\n\n" + er);
return;
}
function searchWithinNode(node, re) {
var pos,
skip,
spannode,
middlebit,
endbit,
middleclone;
skip = 0;
if (node.nodeType == 3) {
pos = node.data.search(re);
if (pos >= 0) {
spannode = document.createElement("SPAN");
spannode.style.backgroundColor = "yellow";
middlebit = node.splitText(pos);
endbit = middlebit.splitText(RegExp.$1.length);
middleclone = middlebit.cloneNode(true);
spannode.appendChild(middleclone);
middlebit.parentNode.replaceChild(spannode, middlebit);
++count;
skip = 1;
}
} else if (node.nodeType == 1 && node.childNodes && node.tagName.toUpperCase() != "SCRIPT" && node.tagName.toUpperCase != "STYLE") {
for (var child = 0; child < node.childNodes.length; ++child) {
child = child + searchWithinNode(node.childNodes[child], re);
}
}
return skip;
}
window.status = "Searching for " + regexp + "...";
searchWithinNode(document.body, regexp);
window.status = "Found " + count + " match" + (count == 1 ? "" : "es") + " for " + regexp + ".";})();
Here's my bespoke improvement to the first 10 lines for single-click highlighting for the :
javascript : (function () {
var count = 0,
regexp;
try {
regexp = /\bwho\b|\bwhom\b|\blay\b|\blie\b|\bmoot\b|\bcontinual\b|\bcontinuous\b|\benvy\b|\bjealousy\b|\benvious\b|\bjealous\b|\bnor\b|\bmay\b|\bmight\b|\bwhether\b|\bfewer\b|\bless\b|\bdisinterested\b|\buninterested\b|\bdifferent than\b|\bimpactful\b|\baffect\b|\beffect\b|\birony\b|\bironic\b|\bnauseous\b/i;
} catch (er) {
alert("Unable to create regular expression\n\n" + er);
return;
}
...
The first works, the second doesn't. The first even works when copying the expression from the second into the prompt.
When the second runs, the browser consumes CPU for a while, then highlights squat. The first is near-instant. Behaviour doesn't seem to differ between IE9/Chrome17/FF10. Using new Regex(...) in the second doesn't help - I'm using the slash notation to save having to double slash the rest, making it less readable.
Would anyone be willing to point me towards my mistake?
you left out the "(" and ")" in your expression.
This works: regexp = /(\bwho\b|\bwhom\b|\blay\b|\blie\b|\bmoot\b|\bcontinual\b|\bcontinuous\b|\benvy\b|\bjealousy\b|\benvious\b|\bjealous\b|\bnor\b|\bmay\b|\bmight\b|\bwhether\b|\bfewer\b|\bless\b|\bdisinterested\b|\buninterested\b|\bdifferent than\b|\bimpactful\b|\baffect\b|\beffect\b|\birony\b|\bironic\b|\bnauseous\b)/i;
If you ask me why the parenthesis are necessary, I don't know. Related to code further downstream is my educated guess. All I did was compare what was different between the original code and your code; given the fact that the expression worked when entered into the input box.

Categories

Resources