Differences between two regex JavaScript bookmarklets

Differences between two regex JavaScript bookmarklets - javascript

I'm having some weird issues "improving" a bookmarklet.
I took this example from here - it takes a regular expression and highlights text on the page matching the expression - I've reformatted it for easy reading using JSMin for Notepad++:
javascript : (function () {
var count = 0,
text,
regexp;
text = prompt("Search regexp:", "");
if (text == null || text.length == 0)
return;
try {
regexp = new RegExp("(" + text + ")", "i");
} catch (er) {
alert("Unable to create regular expression using text '" + text + "'.\n\n" + er);
return;
}
function searchWithinNode(node, re) {
var pos,
skip,
spannode,
middlebit,
endbit,
middleclone;
skip = 0;
if (node.nodeType == 3) {
pos = node.data.search(re);
if (pos >= 0) {
spannode = document.createElement("SPAN");
spannode.style.backgroundColor = "yellow";
middlebit = node.splitText(pos);
endbit = middlebit.splitText(RegExp.$1.length);
middleclone = middlebit.cloneNode(true);
spannode.appendChild(middleclone);
middlebit.parentNode.replaceChild(spannode, middlebit);
++count;
skip = 1;
}
} else if (node.nodeType == 1 && node.childNodes && node.tagName.toUpperCase() != "SCRIPT" && node.tagName.toUpperCase != "STYLE") {
for (var child = 0; child < node.childNodes.length; ++child) {
child = child + searchWithinNode(node.childNodes[child], re);
}
}
return skip;
}
window.status = "Searching for " + regexp + "...";
searchWithinNode(document.body, regexp);
window.status = "Found " + count + " match" + (count == 1 ? "" : "es") + " for " + regexp + ".";})();
Here's my bespoke improvement to the first 10 lines for single-click highlighting for the :
javascript : (function () {
var count = 0,
regexp;
try {
regexp = /\bwho\b|\bwhom\b|\blay\b|\blie\b|\bmoot\b|\bcontinual\b|\bcontinuous\b|\benvy\b|\bjealousy\b|\benvious\b|\bjealous\b|\bnor\b|\bmay\b|\bmight\b|\bwhether\b|\bfewer\b|\bless\b|\bdisinterested\b|\buninterested\b|\bdifferent than\b|\bimpactful\b|\baffect\b|\beffect\b|\birony\b|\bironic\b|\bnauseous\b/i;
} catch (er) {
alert("Unable to create regular expression\n\n" + er);
return;
}
...
The first works, the second doesn't. The first even works when copying the expression from the second into the prompt.
When the second runs, the browser consumes CPU for a while, then highlights squat. The first is near-instant. Behaviour doesn't seem to differ between IE9/Chrome17/FF10. Using new Regex(...) in the second doesn't help - I'm using the slash notation to save having to double slash the rest, making it less readable.
Would anyone be willing to point me towards my mistake?

you left out the "(" and ")" in your expression.
This works: regexp = /(\bwho\b|\bwhom\b|\blay\b|\blie\b|\bmoot\b|\bcontinual\b|\bcontinuous\b|\benvy\b|\bjealousy\b|\benvious\b|\bjealous\b|\bnor\b|\bmay\b|\bmight\b|\bwhether\b|\bfewer\b|\bless\b|\bdisinterested\b|\buninterested\b|\bdifferent than\b|\bimpactful\b|\baffect\b|\beffect\b|\birony\b|\bironic\b|\bnauseous\b)/i;
If you ask me why the parenthesis are necessary, I don't know. Related to code further downstream is my educated guess. All I did was compare what was different between the original code and your code; given the fact that the expression worked when entered into the input box.

Related

Creating, setting styles on a range

I am trying to auto-detect addresses on a page and add the class "address" where found.
var rangyPatternApplier = function(element, pattern, style) {
var innerText = element.innerText;
var matches = innerText.match(pattern);
if (matches) {
for (var i = 0; i < matches.length; i++) {
console.log("Match: " + matches[i]);
var start = innerText.indexOf(matches[i]);
var end = start + matches[i].length;
let range = document.createRange();
var start = innerText.indexOf(matches[i]);
console.log('inner text: ' + innerText);
console.log('start: ' + start);
console.log('starts with: ' + innerText.substring(start));
var end = start + matches[i].length;
var startNode = element.childNodes[0];
var endNode = startNode;
while (startNode.nodeValue.length < start) {
start -= startNode.nodeValue.length;
end -= startNode.nodeValue.length;
startNode = startNode.nextSibling;
endNode = startNode;
if (startNode == null) {
error.reportError("Just wrong in Sections.rangyPatternApplier");
return;
}
}
while (endNode.nodeValue.length < end) {
end -= endNode.nodeValue.length;
if (endNode.nextSibling) endNode = endNode.nextSibling;
while (!endNode.nodeValue) {
endNode = endNode.childNodes[0];
}
if (endNode == null) {
error.reportError("Just wrong in Sections.rangyPatternApplier");
}
}
range.setStart(startNode, start);
console.log("starts with: " + startNode.nodeValue.substring(start));
range.setEnd(endNode, end);
var applier = rangy.createClassApplier(style, {
elementTagName: "span",
elementProperties: {
},
});
window.getSelection().addRange(range);
applier.toggleSelection();
}
}
}
Called via:
$("P").each(function () {
rangyPatternApplier(this, new RegExp("\\d+\\s[A-z]+\\s[A-z0-9]+\\s(Street|St|Avenue|Av|Ave|Road|Rd)", "mgi"), "Address");
});
On text in a paragraph:
If the income renders the household ineligible for CA/CILOCA, the case will be systemically referred to the Administration for Children s Services Transitional Child Care Unit at 109 East 16th Street 3rd floor for evaluation of Transitional Child Care (TCC) benefits. The TCC Worker determines eligibility for up to 12 months of TCC benefits.
The regex is working, the address class is being applied. I am applying the range to the window selection because there appears to be a bug in rangy when applied just on the Range (I'm getting an error message). But somehow, when I create the range, the span appears 5 characters before the start of the address and ends 9 characters early. The early ending part could be due to the tag around the "th" in 16th street. But why is the range 5 characters earlier than what I'm finding in innerText?

Sheesh this was a pain but I got it working. Adding my solution here so hopefully at least a few people don't have to go through doing something that should be much more "built-in", in my opinion
//nextTextNode is for getting the next text node from the DOM
function nextTextNode(node) {
if (node.nodeType == 1) { //element
while (node.nodeType != 3) {
node = node.firstChild;
}
return node;
}
if (node.nodeType == 3) { //text node
if (node.nextSibling) {
if (node.nextSibling.nodeType == 3) {
return node.nextSibling;
} else {
return nextTextNode(node.nextSibling);
}
} else {
while (!node.nextSibling) {
node = node.parentNode;
if (!node) return null;
}
if (node.nextSibling.nodeType == 3) {
return node.nextSibling;
} else {
return nextTextNode(node.nextSibling);
}
}
} else {
throw new Error("nextTextNode: Node is either null, not connected to the DOM, or is not of node type 1 or 3");
}
}
And then create range. Text nodes have extra newline and space characters compared to element.innerText . In the function below I track both the number of extra characters and the total characters to keep track of the inconsistancies between innerText and node.nodeValue and how many characters "in" it is.
function createRangeForString(startElement, text) {
var extras = 0;
var innerText = startElement.innerText;
var start = innerText.indexOf(text);
if (start === -1) throw new Error ("createRangeForString. text: " + text + " not found in startElement");
var textNode = nextTextNode(startElement);
var totalCharsSeen = 0;
var range = document.createRange();
for (var i = 0; i < start; i++) { // I don't think I have to add extras in limit for i. Is already included
if ((i + extras) - totalCharsSeen >= textNode.nodeValue.length) { //check if textNode is long enough
totalCharsSeen += textNode.nodeValue.length;
textNode = nextTextNode(textNode);
}
while (textNode.nodeValue.charAt(i + extras - totalCharsSeen) == "\n") {
extras++;
}
while (textNode.nodeValue.charAt(i + extras - totalCharsSeen) == " " && innerText.charAt(i) != " ") {
extras++;
}
}
range.setStart(textNode, i + extras - totalCharsSeen);
var end = start + text.length;
for (var i = start + 1; i < end; i++) { // I don't think I have to add extras in limit for i. Is already included
if ((i + extras) - totalCharsSeen >= textNode.nodeValue.length) { //check if textNode is long enough
totalCharsSeen += textNode.nodeValue.length;
textNode = nextTextNode(textNode);
}
while (textNode.nodeValue.charAt(i + extras - totalCharsSeen) == "\n") {
extras++;
}
while (textNode.nodeValue.charAt(i + extras - totalCharsSeen) == " " && innerText.charAt(i) != " ") {
extras++;
}
}
range.setEnd(textNode, i + extras - totalCharsSeen);
return range;
}

Compare two string in js and find difference

how to compare two strings purely, and provide specific result such as highlight extra word, wrong word & skip word in 2nd string. for eg.
var x = "This is the first original string in javascript language." </br>
var y = "This is not the first org string in language."
diff = wrong word ="org"<br>
Extra word ="not"<br>
skip word ="javascript"
//here is slice of my code but in some case my program fails
var x = "here is some value of string";
var y = "here is the some val string";
var k=0;
var SkipWrd="";
for(var i=0; i<y.length;i++){
var sktmp="";
var swtmp=0;
for(var j=0; j<=2;j++) {
if(x[k]!="undefined"){
if(y[i]==x[k+j]){
SkipWrd+=sktmp;
skip+=swtmp;
H_wrd += typ_wrd[i]+" ";
org_para+= sktmp+x[k+j]+" ";
k+=j+1;
break;
}
else{
sktmp+= "<mark>"+ x[k+j]+" "+ "</mark>";
swtmp++;
if(j==2 && y[i]!=x[k+j]){
err++;
Err_Wrd+=y[i]+" ";
H_wrd += "<span id='H-Err'>" + typ_wrd[i] + "</span> ";
org_para+="<span id='O-Err'>" +x[k]+" "+ "</span> ";
k++;
}
}
}
}
}

I have used a word by word comparison approach rather than the character by character approach you used. The overall logic is similar.
The below code will work for your example, but there are many more cases that might go wrong.
var x = "This is the first original string in javascript language.";
var y = "This is not the first org string in language.";
x=x.split(' ');
y=y.split(' ');
var i=0,j=0;
while (1) {
if (!x[i] || !y[j]) break;
if (x[i] == y[j]) {
i++;
j++;
continue;
}
if (x[i] == y[j+1]) {
console.log('Extra word : ', y[j]);
i++;
j+=2;
continue;
}
if (x[i+1] == y[j]) {
console.log('Skip word: ', x[i]);
i+=2;
j++;
continue;
}
if (x[i+1] == y[j+1]) {
console.log('Wrong word: ', y[j]);
i++;
j++;
continue;
}
}

How to remove space from RiTa.js random word generator?

I'm working on a word generator inspired by Daniel Shiffman's demo of the RiTa library. Right now, the code adds a space between all words and punctuation using the line:
output += " ";
I have been trying to figure out how to change the code so that space does not appear between punctuation (such as periods) and words. I think the simplest way to do this would be to use an if/else statement that leaves punctuation unaltered but adds space to words, but I am having a hard time figuring out what functions from the Rita library to use for this, as well as the syntax.
Any ideas? Here's my code right now:
var input;
var button;
var lexicon;
function setup() {
noCanvas();
lexicon = new RiLexicon();
input = createInput('As I say a noun is the name of a thing.');
button = createButton('submit');
input.changed(processRita);
button.mousePressed(processRita);
input.size(400);
}
function processRita() {
var s = input.value();
var rs = new RiString(s);
var words = rs.words();
var pos = rs.pos();
console.log(words);
console.log(pos);
var output = '';
for (var i = 0; i < words.length; i++) {
if (/nn.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
if(alliterations.length == 0){
output+=words[i];
}else{
output += alliterations[Math.floor(Math.random() * alliterations.length)];
}
//console.log("noun");
//console.log(alliterations.length);
} else if (/jj.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("adjective");
} else if (/vb/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("verbs");
}
else {
//console.log(words[i]);
output += words[i];
} {
output += " ";
}
}
createP(output);
}

Why do you need a library for this? Can't you just use the regular String functions to test whether a String is a punctuation mark?
You could just use a regular expression to test whether a String matches a punctuation character. Or just use a series of equality checks against each punctuation mark you care about.
You might also check out the startsWith() function and the endsWith() function.

after much trial and error, I had help from a coding professor who helped me to solve this problem, which was more complicated than I originally anticipated. In order to get this code to work, we added this bit toward the beginning of the for loop:
if(words[i] == "." || words[i] == "," || words[i] == "?" || words[i] == "!"){
output += words[i];
}else{
output += " ";
So the entire code now looks like this:
for (var i = 0; i < words.length; i++) {
if(words[i] == "." || words[i] == "," || words[i] == "?" || words[i] == "!"){
output += words[i];
}else{
output += " ";
if (/nn.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
if(alliterations.length == 0){
output+=words[i];
}else{
output += alliterations[Math.floor(Math.random() * alliterations.length)];
}
//console.log("noun");
//console.log(alliterations.length);
} else if (/jj.*/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("adjective");
} else if (/vb/.test(pos[i])) {
var alliterations = lexicon.alliterations(words[i]);
output += alliterations[Math.floor(Math.random() * alliterations.length)];
//console.log("verbs");
}
else {
//console.log(words[i]);
output += words[i];
}
}
}
createP(output);
}

Its much simpler if you use the RiTa library functions:
function processRita() {
var all, output = [],
words = RiTa.tokenize(input.value()),
pos = RiTa.getPosTags(words);
for (var i = 0; i < words.length; i++) {
if (/[nn|kk|vb|jj].*/.test(pos[i]) && (all = lexicon.alliterations(words[i])).length) {
output.push(RiTa.randomItem(all));
} else {
output.push(words[i]);
}
}
createP(RiTa.untokenize(output));
}

How can I slice a block of text so that it doesn't end on a split word?

I've written code that takes a large block of text, splits it into 995 character blocks and pushes each block into an array. However, this often results splits words when they fall at the 995-character line- how could I edit my code so that each block of text is as close as possible to 995 characters long (must be under) but ends at the last available space?
function cutUp() {
var PAname = prompt("Determine PA type and PA Name (e.g. 5=mexican food=");
var chunks = [];
var OGstring = document.getElementById('PATypes').value;
var my_long_string = OGstring.split('1').join('').split('2').join('').split('3').join('').split('4').join('').split('5').join('').split('6').join('').split('7').join('').split('8').join('').split('9').join('').split('0').join('').split('[edit]').join('').split('[citation needed]').join('').split('[').join('').split(']').join('').split('(').join('').split(')').join('');
var i = 0;
var n = 0;
while (n < my_long_string.length) {
chunks.push(my_long_string.slice(n, n += 995));
}
if (chunks[0] != null) {
$('PAType=Name=Value8').innerHTML = PAname + chunks[0];
}
if (chunks[1] != null) {
$('PAType=Name=Value9').innerHTML = PAname + chunks[1];
}
if (chunks[2] != null) {
$('PAType=Name=Value10').innerHTML = PAname + chunks[2];
}
if (chunks[3] != null) {
$('PAType=Name=Value11').innerHTML = PAname + chunks[3];
}
if (chunks[4] != null) {
$('PAType=Name=Value12').innerHTML = PAname + chunks[4];
}
if (chunks[5] != null) {
$('PAType=Name=Value13').innerHTML = PAname + chunks[5];
}
if (chunks[6] != null) {
$('PAType=Name=Value14').innerHTML = PAname + chunks[6];
}
if (chunks[7] != null) {
$('PAType=Name=Value15').innerHTML = PAname + chunks[7];
}
if (chunks[8] != null) {
$('PAType=Name=Value16').innerHTML = PAname + chunks[8];
}
if (chunks[9] != null) {
$('PAType=Name=Value17').innerHTML = PAname + chunks[9];
}
////this is to create new exportable table
$('exportTable').innerHTML += $('tableContents').innerHTML;
$("exportTable").removeClass('hidden');
///this resets to default
defaultReset();
}

Without any specific Javascript knowledge, I'd say to look ahead 995 characters, and if that character isn't itself a space, to start looking back towards the start until you find a space, and truncate there.
In C-like pseudocode, with chars being your big array of characters:
for(truncOffset = 995; truncOffset > 0; truncOffset--):
{
if chars[currentLoc + truncOffset] = ' ': /* a space character */ break;
}
Rinse, repeat to taste. Remember to move currentLoc to currentLoc + truncOffset after finding that space.

syntax highlighting design

I'm writing my own syntax highlighter in javascript for fun and see a couple of approaches but they both have pros and some pretty serious cons that I can't get around. What do you guys think about these approaches and are there better methods that I'm missing?
Assumption
Code to highlight exists in a single string.
Approaches
Treat code in it's string form and use regular expressions to find patterns.
Pros
Simple to define and search for patterns
Cons
Hard to disregard keywords inside of quotes or comments
Split the string by spaces and linebreaks and loop over the array.
Pros
Easy to keep track of scope
Cons
Hard to keep track of spaces and linebreaks after the split
EDIT: Lexical Analysis
So, if I understand it, using Lexical Analysis you break the string into tokens. This somehow sounds a lot like approach number 2? How do you approach reassembling the tokens into the original string?

Note: This uses jQuery. It can pretty well be rewritten to work with straight javascript if you want.
I actually wrote a little plugin for fun that does this:
(function($) {
$.fn.codeBlock = function(blockComment) {
// Setup keyword regex
var keywords = /(abstract|boolean|break|byte|case|catch|char|class|const|continue|debugger|default|delete|do|double|else|enum|export|extends|final|finally|float|for|function|goto|if|implements|import|in|instanceof|int|interface|long|native|new|package|private|protected|public|return|short|static|super|switch|synchronized|this|throw|throws|transient|try|typeof|var|void|volatile|while|with|true|false|prototype)(?!\w|=)/gi;
// Booleans to toggle comment, regex, quote exclusions
var comment = false;
var quote = false;
var regex = false;
/* Array used to store values of regular expressions, quotes, etc.
so they can be used to ID locations to be skipped durring keyword
regexing.
*/
var locator = new Array();
var locatorIndex = 0;
if (blockComment) locator[locatorIndex++] = 0;
var text = $(this).html();
var continuation;
var numerals = /[0-9]/;
var arr = ($(this).html()).split("");
var outhtml = "";
for (key in arr) {
// Assign three variables common 'lookup' values for faster aquisition
var keyd = key;
var val = arr[keyd];
var nVal = arr[keyd - 1];
var pVal = arr[++keyd];
if ((val == "\"" || val == "'") && nVal != "\\") {
if (quote == false) {
quote = true;
outhtml += val;
}
else {
outhtml += val;
quote = false;
}
locator[locatorIndex++] = parseInt(key);
}
else if (numerals.test(val) && quote == false && blockComment == false && regex == false) {
outhtml += '<span class="num">' + val + '</span>';
}
else if (val == "/" && nVal != "<") {
var keys = key;
if (pVal == "/") {
comment = true;
continuation = key;
break;
}
else if (pVal == "*") {
outhtml += "/";
blockComment = true;
locator[locatorIndex++] = parseInt(key);
}
else if (nVal == "*") {
outhtml += "/";
blockComment = false;
locator[locatorIndex++] = parseInt(key);
}
else if (pVal == "[" && regex == false) {
outhtml += "<span class='res'>/";
regex = true;
}
else {
outhtml += "/";
}
}
else if (val == "," || val == ";" && regex == true) {
outhtml += "</span>" + val;
regex = false;
}
else {
outhtml += val;
}
}
if (comment == true) {
outhtml = outhtml.replace(keywords, "<span class='res'>$1</span>");
outhtml += '<span class="com">';
outhtml += text.substring(continuation, text.length);
outhtml += '</span>';
}
else {
if ((locator.length % 2) != 0) locator[locator.length] = (text.length - 1);
if (locator.length != 0) {
text = outhtml;
outhtml = text.substring(0, locator[0]).replace(keywords, "<span class=\"res\">$1</span>");
for (var i = 0; i < locator.length;) {
qTest = text.substring(locator[i], locator[i] + 1);
if (qTest == "'" || qTest == "\"") outhtml += "<span class=\"quo\">";
else outhtml += "<span class=\"com\">";
outhtml += text.substring(locator[i], locator[++i] + 1) + "</span>";
outhtml += text.substring(locator[i] + 1, locator[++i]).replace(keywords, "<span class=\"res\">$1</span>");
}
}
else {
outhtml = outhtml.replace(keywords, "<span class=\"res\">$1</span>");
}
}
text = outhtml;
$(this).html(text);
return blockComment;
}
})(jQuery);
I'm not going to claim it is the most efficient way of doing this or the best but it does work. There are still probably a few bugs in there I haven't ID'd yet (and 1 I know about but haven't gotten around to fixing) but this should give you an idea of how you could go about this if you like.
My suggested implementation of this is to create a textarea or something and have the plugin run when you click a button or something (as far as testing it goes that is a decent idea) and of course you can set the text in the textarea to some starting code to make sure it works (Tip: You can put tags in between the the <textarea> tag and it will render as text, not HTML).
Also, blockComment is a boolean, make sure to pass false because true will trigger the block quoting. If you decided to parse something line by line, like:
<a>code</a>
<a>some more code</a>
Do something like:
blockComment = false;
$("a").each(function() {
blockComment = $(this).codeBlock(blockComment);
});

Develop Reference

JavaScript is the programming language of the Web.

Differences between two regex JavaScript bookmarklets - javascript

Related

Creating, setting styles on a range

Compare two string in js and find difference

How to remove space from RiTa.js random word generator?

How can I slice a block of text so that it doesn't end on a split word?

syntax highlighting design

Categories

Resources