Regex not correctly matching - javascript

Please do not be alarmed at the huge walls of text. This is not a very expert question and does not require much reading.
Problem:
I have a group of regexes in an array that are ran through and go through a string, matching and wrapping text around the match.
I am having a lot of problems with this and I have no clue why. Most of my regexes are not matching correctly with this string:
var changelog = `+ = Added
- = Removed
~ = Changed
[line]
v1 - 4chan Enhancer is released
[line]
v2 - Minor update
Added support for new settings in the future
[line]
v3 - Major update
+ Infinite scrolling
+ Remove ads completely
+ Fetch replies automatically
+ Remove comic at top
+ Random Pepe
+ Blocked keywords
+ Changelog
+ Version
[line]
If you would like to request more features, please email me at billy#billyvenner.co.uk`;
If I run the regexes, the function returns this:
<span style='color: #009<span style='color: #009110'>1</span>10'>+</span> = Added
<span style='color: #FF0000'>-</span> = Remo<span style='color: #009110'>v</span>ed
~ = Changed
[line]
v1 - 4chan Enhancer is released
[line]
v2 - Minor update
Added support for new settings in the future
[line]
v3 - Major update
+ Infinite scrolling
+ Remove ads completely
+ Fetch replies automatically
+ Remove comic at top
+ Random Pepe
+ Blocked keywords
+ Changelog
+ Version
[line]
If you would like to request more features, please email me at billy#billyvenner.co.uk
4chan Enhancer v3
Observing the new changelog using regex101 and running my regexes through it, I am returned with exactly what I'm trying to match.
For some odd reason, all of them only get matches once or zero times. Here's the array that the regexes are in:
var formats = [
["^(\\+).*","<span style='color: #009110'>","</span>"],
["^\[line\]","<hr>","",false],
["^(\\~).*","<span style='color: #0086E0'>","</span>"],
["^(\\-).*","<span style='color: #FF0000'>","</span>"],
["^(v[0-9]+).*","<span style='color: #009110'>","</span>"],
["(http[s]{0,1}:\\/\\/www\\.[a-zA-Z0-9]*\\.[a-zA-Z0-9]*\\.[a-zA-Z0-9]*(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*)",linkify],
["((?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*#(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*\\..*\\.(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*)",function() {linkify("mailto")}],
];
If you try running these regexes through my changelog, it works fine.
Please note with the above regexes: you can see the double backslashes because of the escaping character in JavaScript (so they are actually just one backslash in the actual regex) and the two regexes at the bottom are being implemented later and currently do nothing in my code.
Here is the actual code to run through these regexes:
function linkify(before) {
}
function colorChangelog() {
var newChangelog = "";
newChangelog = changelog;
var formats = [
["^(\\+).*","<span style='color: #009110'>","</span>"],
["^\[line\]","<hr>","",false],
["^(\\~).*","<span style='color: #0086E0'>","</span>"],
["^(\\-).*","<span style='color: #FF0000'>","</span>"],
["^(v[0-9]+).*","<span style='color: #009110'>","</span>"],
["^(http[s]{0,1}:\\/\\/www\\.[a-zA-Z0-9]*\\.[a-zA-Z0-9]*\\.[a-zA-Z0-9]*(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*)",linkify],
["^((?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*#(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*\\..*\\.(?:\\/|-|_|=|\\?|&|[a-zA-Z0-9])*)",function() {linkify("mailto")}],
];
for (y = 0; y < formats.length; y++) {
console.log(y);
var leregex = new RegExp(formats[y][0],"g")
var executed2 = leregex.exec(newChangelog);
if (!!leregex.exec(newChangelog)) {
if (!!leregex.exec(newChangelog)[1]) {
var executed = executed2[1];
for (match = 0; match < executed.length; match++) {
if (typeof(formats[y][1]) == "string") {
if (formats[y][formats[y].length-1] != false) {
var newstr = formats[y][1] + executed[match] + formats[y][2];
newChangelog = newChangelog.replace(executed[match],newstr);
} else {
var newstr = formats[y][1] + formats[y][2];
newChangelog = newChangelog.replace(executed[match],newstr);
}
} else {
if (typeof(formats[y][1]) == "function") {
} else {
console.log("Invalid 2nd argument: " + formats[y][1]);
}
}
}
}
}
console.log(newChangelog);
}
changelog = newChangelog + `
4chan Enhancer ` + version;
};
colorChangelog();
console.log(changelog);
I am using new RegExp to run my regexes with the flags "gm" with the g meaning it will match as much as possible and the m meaning it will start ^s and $s at the start/end of the line.
Thank you for reading this huge daunting block of text, I hope you can help.

Your issue is with your usage of exec. .exec() will return the first matched instance not yet returned. Drop the exec into a while loop and you should get what you need. You need to make sure you dont have anything inside the loop that will turn this into an infinite issue. changing your for loop to the following appears to work:
for (y = 0; y < formats.length; y++) {
//console.log(y);
var leregex = new RegExp(formats[y][0], "gm")
while (!!(executed2 = leregex.exec(newChangelog))) {
//console.log(executed2)
if (!!executed2[1]) {
if (typeof(formats[y][1]) == "string") {
if (formats[y][formats[y].length - 1] != false) {
var newstr = formats[y][1] + executed2[0] + formats[y][2];
newChangelog = newChangelog.replace(executed2[0], newstr);
} else {
var newstr = formats[y][1] + formats[y][2];
newChangelog = newChangelog.replace(executed2[0], newstr);
}
} else {
if (typeof(formats[y][1]) == "function") {
} else {
console.log("Invalid 2nd argument: " + formats[y][1]);
}
}
}
}
//console.log(newChangelog);
}
I instantiated executed2 at the top of the function:
var newChangelog = "", executed2;
Uncomment the console.logs to see the states scroll by.
You might want to rename some of the vars to make more sense (specifically the executed2 var, as executed no longer exists
I used this reference from developer.mozilla.org for info on .exec

Related

Problems with IF condition

I'm trying to make a website that gathers information from APIs. The following code always evaluates to 'Beep Boop Beep! I can\t find the Wikipedia page with the API! :-( \n Anyways here is more info on...'! Anyone have any ideas why?
var geoNamesWiki = result.geoNamesWiki;
for (let j = 0; j < 30; j++) {
if (geoNamesWiki.geonames[j].feature == 'country' &&
(geoNamesWiki.geonames[j].countryCode == openCage.results[0].components["ISO_3166-1_alpha-2"] ||
geoNamesWiki.geonames[j].title.includes(openCage.results[0].components.country))) {
$('#summary').html(geoNamesWiki.geonames[j].summary);
$('#wikiLink').html(geoNamesWiki.geonames[j].wikipediaUrl).attr("href", "https://" + geoNamesWiki.geonames[j].wikipediaUrl);
} else {
$('#summary').html('Beep Boop Beep! I can\t find the wikipedia page with the API! :-( \n Anyways here is more info on' + openCage.results[0].components.country + ':');
$('#wikiLink').html('https://en.wikipedia.org/wiki/' + encodeURI(openCage.results[0].components.country)).attr("href", 'https://en.wikipedia.org/wiki/' + encodeURI(openCage.results[0].components.country));
}
}
Is suspect you have a string there at var geoNamesWiki = result.geoNamesWiki;
Try parsing it to a JSON object first var geoNamesWiki = JSON.parse( result.geoNamesWiki );
I found the answer thanks to #Bekim Bacaj! I was overwriting what I had already done, so just needed to add a break on the final line of the IF part.

regexp looping and logic in javascript

Not certain if this can be done in regexp under javascript, but thought it would be interesting to see if it is possible.
So thought I would clean up a piece of html to remove most tags, literally just dropping them, so <H1><img><a href ....>. And that would be relatively simple (well, stole the basis from another post, thanks karim79 Remove HTML Tags in Javascript with Regex).
function(inString, maxlength, callback){
console.log("Sting is " + inString)
console.log("Its " + inString.length)
var regex = /(<([^>]+)>)/ig
var outString = inString.replace(regex, "");
console.log("No HTML sting " + outString);
if ( outString.length < maxlength){
callback(outString)
} else {
console.log("Lets cut first bit")
}
}
But then I started thinking, is there a way where I can control regex execution. So lets say that I want to keep certain tabs, like b,br,i and maybe change H1-6 to b. So in pseudo code, something like:
for ( var i in inString.regex.hits ) {
if ( hits[i] == H1 ) {
hits[i] = b;
}
}
The issue is that I want the text thats not HTML tags to stay as it is, and I want it to just cut out by default. One option would of course be to change the ones I want to keep. Say change <b> to [[b]], once that is done to all the ones of interest. Then put them back to <b> once all unknown have been removed. So like this (only for b, and not certain the code below would work):
function(inString, maxlength, callback){
console.log("Sting is " + inString)
console.log("Its " + inString.length)
var regex-remHTML = /(<([^>]+)>)/ig
var regex-hideB = /(<b>)/ig
var regex-showB = /([b])/ig
var outString = inString.replace(regex-hideB, "[b]");
outString = outString.replace(regex-remHTML, "");
outString = outString.replace(regex-showB, "<b>");
console.log("No HTML sting " + outString);
if ( outString.length < maxlength){
callback(outString)
} else {
console.log("Lets cut first bit")
}
}
But would it be possible to be smarter, writing cod ethat says here is a peice of HTML tag, run this code against the match.
As Tim Biegeleisen sai in its comment, maybe a better solution could be using a parser instead of a Regex...
By the way, if you want to control what is going to be changed by the regex you can pass a callback to the String.prototype.replace:
var input = "<div><h1>CIAO Bello</h1></div>";
var output = input.replace(/(<([^>]+)>)/gi, (val) => {
if(val.indexOf("div") > -1) {
return "";
}
return val;
})
;
console.log("output", output);

How do I break up a string without creating malformed HTML tags?

What I am doing:
In NodeJS I am creating an email template by using MustacheJS, using data from an array of JSON objects.
The text/message that goes in the template can contain text along with basic html tags (such as b p & a).
Due to limitation of space I need to only show an excerpt of the message. For that I do a word count, and after lets say 20 words (checked by spaces) I truncate the string and append View more anchor tag. This links it to the website's post page, that contains the complete post. Something like:
Hey this is a sample post text <b>message</b>. Lorem ipsum dolor sit
amit... View more
The problem:
During word count and truncation, it is possible that I truncate the string in between an html tag as I am simply calculating words on basis of space. Something like:
I am sharing a link with you. <a style="color:... View more
Now this will break the html.
Possible solution:
Before truncating string, run a regex on it to find all the html tags in it.
Use indexOf() (or some other method) to find starting and ending indices of each tag.
After word count, get the index where I need to truncate it.
Now see that if the index intersects with any of the tags region.
If it does intersect, simply move the truncate index to the start or end of the html tag.
Question:
Is there a better way to do this. I don't know what search terms I should be searching on google, to get help with this.
P.S. The code is flexible and I can change the flow if there is a significantly better solution. Also, I am not good with post titles. If you can, please modify it to something that reflects the question.
EDIT:
This is what I came up with after Alex's answer. Hope it helps someone else:
/**
* Counter: Takes a string and returns words and characters count
* #param value
* #returns obj: {
* 'wordCount': (int),
* 'totalChars': (int),
* 'charCount': (int),
* 'charCountNoSpace': (int)
* }
*/
var counter = function(value){
var regex = /\s+/gi;
if (!value.length) {
return {
wordCount: 0,
totalChars: 0,
charCount: 0,
charCountNoSpace: 0
};
}
else {
return {
wordCount: value.trim().replace(regex, ' ').split(' ').length,
totalChars: value.length,
charCount: value.trim().length,
charCountNoSpace: value.replace(regex, '').length
};
}
}
/**
* htmlSubString - Creates excerpt from markup(or even plain text) without creating malformed HTML tags
* #param markup {string} - Markup/text to take excerpt out of
* #param limit {int} - Total word count of excerpt. Note that only text (not the html tag) counts as a valid word.
* #returns {string} - Excerpt
*/
var htmlSubString = function(markup, limit){
var htmlParser = require("htmlparser2");
var tagCount = 0;
var wordCount = 0;
var excerpt = '';
function addToExcerpt(type, text, attribs) {
if ((wordCount >= limit && tagCount == 0) || (tagCount === 1 && type === 'tagOpen' && wordCount >= limit)) {
return false;
}
else if (wordCount < limit || tagCount) {
if (type === 'text') {
var wordCountSubString = $scope.counter(text).wordCount;
if (wordCountSubString + wordCount > limit && tagCount === 0) {
var length = limit - wordCount;
var wordList = text.trim().split(' ');
for (var i = 0; i < length; i++) {
excerpt += ' ' + wordList[i];
wordCount++;
}
} else {
wordCount += wordCountSubString;
excerpt += text;
}
} else if (type === 'tagOpen') {
excerpt += '<' + text;
for (var prop in attribs) {
excerpt += ' ' + prop + '="' + attribs[prop] + '"';
}
excerpt += '>';
} else if (type === 'tagClose') {
excerpt += '</' + text + '>';
}
}
return true;
}
var parser = new htmlParser.Parser({
onopentag: function (name, attribs) {
if(wordCount < limit){
++tagCount;
addToExcerpt('tagOpen', name, attribs);
}
},
ontext: function (text) {
if(wordCount < limit){
addToExcerpt('text', text);
}
},
onclosetag: function (tagName) {
if(wordCount < limit || tagCount > 0){
addToExcerpt('tagClose', tagName);
--tagCount;
}
}
});
parser.write(markup);
parser.end();
return excerpt;
}
Usage:
var wordCountLimit = 20;
var markup = "/* some markup/text */";
var excerpt = htmlSubString(markup, wordCountLimit);
Now, you'll definitely be able to find some HTML tag matching regular expressions. That said, I don't recommend it. At first you'll be all happy and everything will work just fine. Then tomorrow you'll find a small edge-case. "No worries!" You'll say, as you modify the expression to account for the discrepancy. Then the next day, a new tweak, and a new one, and yet another, etc etc until you can't take it anymore.
I highly recommend you find an already established HTML parsing library. There appears to be quite a few on npm. This one seems to be fairly popular.
PS - You did fine with your question. I wish more questions took as much time and provided as much detail :)

encodeURIComponent throws an exception

I am programmatically building a URI with the help of the encodeURIComponent function using user provided input. However, when the user enters invalid unicode characters (such as U+DFFF), the function throws an exception with the following message:
The URI to be encoded contains an invalid character
I looked this up on MSDN, but that didn't tell me anything I didn't already know.
To correct this error
Ensure the string to be encoded contains only valid Unicode sequences.
My question is, is there a way to sanitize the user provided input to remove all invalid Unicode sequences before I pass it on to the encodeURIComponent function?
Taking the programmatic approach to discover the answer, the only range that turned up any problems was \ud800-\udfff, the range for high and low surrogates:
for (var regex = '/[', firstI = null, lastI = null, i = 0; i <= 65535; i++) {
try {
encodeURIComponent(String.fromCharCode(i));
}
catch(e) {
if (firstI !== null) {
if (i === lastI + 1) {
lastI++;
}
else if (firstI === lastI) {
regex += '\\u' + firstI.toString(16);
firstI = lastI = i;
}
else {
regex += '\\u' + firstI.toString(16) + '-' + '\\u' + lastI.toString(16);
firstI = lastI = i;
}
}
else {
firstI = i;
lastI = i;
}
}
}
if (firstI === lastI) {
regex += '\\u' + firstI.toString(16);
}
else {
regex += '\\u' + firstI.toString(16) + '-' + '\\u' + lastI.toString(16);
}
regex += ']/';
alert(regex); // /[\ud800-\udfff]/
I then confirmed this with a simpler example:
for (var i = 0; i <= 65535 && (i <0xD800 || i >0xDFFF ) ; i++) {
try {
encodeURIComponent(String.fromCharCode(i));
}
catch(e) {
alert(e); // Doesn't alert
}
}
alert('ok!');
And this fits with what MSDN says because indeed all those Unicode characters (even valid Unicode "non-characters") besides surrogates are all valid Unicode sequences.
You can indeed filter out high and low surrogates, but when used in a high-low pair, they become legitimate (as they are meant to be used in this way to allow for Unicode to expand (drastically) beyond its original maximum number of characters):
alert(encodeURIComponent('\uD800\uDC00')); // ok
alert(encodeURIComponent('\uD800')); // not ok
alert(encodeURIComponent('\uDC00')); // not ok either
So, if you want to take the easy route and block surrogates, it is just a matter of:
urlPart = urlPart.replace(/[\ud800-\udfff]/g, '');
If you want to strip out unmatched (invalid) surrogates while allowing surrogate pairs (which are legitimate sequences but the characters are rarely ever needed), you can do the following:
function stripUnmatchedSurrogates (str) {
return str.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, '').split('').reverse().join('').replace(/[\uDC00-\uDFFF](?![\uD800-\uDBFF])/g, '').split('').reverse().join('');
}
var urlPart = '\uD801 \uD801\uDC00 \uDC01'
alert(stripUnmatchedSurrogates(urlPart)); // Leaves one valid sequence (representing a single non-BMP character)
If JavaScript had negative lookbehind the function would be a lot less ugly...

problem in fetching a particular cookie

This is the script that i am using to fetch a particular cookie lastvisit :
AFTER THE EDIT
// This document writes a cookie
// called from index.php
window.onload = makeLastVisitCookie;
function makeLastVisitCookie() {
var now = new Date();
var last = new Date();
now.setFullYear(2020);
// set the cookie
document.cookie = "lastvisit=" + last.toDateString() + ";path=/;expires=" + now.toGMTString();
var allCookies = document.cookie.split(";");
for( var i=0 ; i < allCookies.length ; i++ ) {
if(allCookies[i].split("=")[0]== "lastvisit") {
document.getElementById("last_visit").innerHTML = "You visited this site on" + allCookies[i].split("=")[1];
} else {
alert("testing..testing..");
}
}
}
From this script the if part never works though there are 5 cookies stored from my website. (including the cookie that i am saving from this script) What is the mistake that i am making while fetching the cookie named lastvisit ?
You're splitting the cookie by ; an comparing those tokens with lastvisit. You need to split such a token by = first. allCookies[i] looks like key=val and will never equal lastvisit. Een if allCookies[i] == "lastvisit" is true, the result will still not be as expected since you're showing the value of allCookies[i + 1] which would be this=the_cookie_after_lastvisit.
if(allCookies[i].split("=") == "lastvisit") { should be:
var pair = allCookies[i].split("=", 2);
if (pair[0].replace(/^ +/, "") == "lastvisit") {
"You visited this site on" + allCookies[i+1]; should be:
"You visited this site on" + pair[1];
The 2 argument of split makes cookies like sum=1+1=2 be read correctly. When splitting cookies by ;, the key may contain a leading space which much be removed before comparing. (/^ +/ is a regular expression where ^ matches the beginning of a string and + one or more spaces.)
Alternatively, compare it directly against a RE for matching the optional spaces as well (* matches zero or more occurences of a space character, $ matches the end of a string):
if (/^ *lastvisit$/.test(pair[0])) {
I've tested several ways to get a cookie including using regular expressions and the below was the most correct one with best performance:
function getCookie(name) {
var cookie = "; " + document.cookie + ";";
var search = "; " + encodeURIComponent(name) + "=";
var value_start = cookie.indexOf(search);
if (value_start == -1) return "";
value_start += search.length;
var value_end = cookie.indexOf(';', value_start);
return decodeURIComponent(cookie.substring(value_start, value_end))
}
You need to remove possible white space around the cookie key before comparing to the string "lastvisit". This is done conveniently using regular expressions. /^\s+/ matches all white space at the beginning, /\s+$/ matches all white space at the end. The matches are replaced by the empty string, i.e. removed:
for( var i = 0 ; i < allCookies.length ; i++ ) {
var c = allCookies[i].split("="); // split only once
var key = c[0].replace(/^\s+/, '').replace (/\s+$/, ''); // remove blanks around key
if (key == "lastvisit") {
document.getElementById("last_visit").innerHTML = "You visited on " + c[1];
}
//...
}

Categories

Resources