So I managed to extract some text, and then I saved it as a variable for later use, how can I test for certain keywords within said text?
Here's an example, checkTitle is the text I extracted, and I want to search it for certain keywords, in this example, the words delimited by commas within compareTitle. I want to search for the strings '5' and 'monkeys'.
var checkTitle = "5 monkeys jumping on the bed";
var compareTitle = "5 , monkeys";
if (checkTitle === compareTitle) {
// ...
}
You can use regular expressions to search for strings, and the test() function to return true/false if the string contains the words.
/(?:^|\s)(?:(?:monkeys)|(?:5))\s+/gi.test("5 monkeys jumping on the bed");
will return true, because the string contains either (in this case, both) the words 5 and monkeys.
See my example here: http://regexr.com/39ti7 to use the site's tools to analyse each aspect of the regular expression.
If you need to change the words you are testing for each time, then you can use this code:
function checkForMatches(){
var words = checkTitle.split(" , ");
var patt;
for (var i = 0; i < words.length; i++){
patt = patt + "(" + words[i] + ")|";
}
patt[patt.length-1] = "";
patt = new RegExp(patt, "i");
return patt.test(checkTitle);
}
will return true, and allow for any words you want to be checked against. They must be separated like this:
compareText = "word1 , word2 , word3 , so on , ...";
// ^^^notice, space-comma-space.
And you can use it in an if statement like this:
var checkTitle = "5 monkeys jumping on the bed";
var compareTitle = "5 , monkeys";
if (checkForMatches()){
//a match was found!
}
Using the String.prototype.contains() method you can search for substrings within strings. For example,
"Hi there everyone!".contains("Hi"); //true
"Hi there everyone!".contains("there e"); //true
"Hi there everyone!".contains("goodbye"); //false
You can also use Array.prototype.indexOf() to a similar effect:
("Hi there everyone!".indexOf("Hi") > -1) //true
("Hi there everyone!".indexOf("Hmmn") > -1) //false
However, in this method, we don't even need to use any of the above methods, as the .match() method will do it all for us:
To return an array of the matched keywords, use .match(). It accepts a regular expression as an argument, and returns an array of substrings which match the regular expression.
var checkTitle = "5 monkeys jumping on the bed";
var compareTitle = "5 , monkeys";
/* ^^^
It is very important to separate each word with space-comma-space */
function getWords(str){ //returns an array containing key words
return str.split(" , ");
}
function matches(str, words){
var patt = "";
for (var i = 0; i < words.length; i++){
patt = (i > 0)? patt + "|" + "(" + words[i] + ")" : patt + "(" + words[i] + ")";
}
patt[patt.length-1] = "";
patt = new RegExp(patt, 'gi');
// ^ i makes it case-insensitive
return str.match(patt);
}
matches(compareTitle, getWords(checkTitle)); // should return ['5','monkeys']
you can check if a match exists by doing this:
if (matches(compareTitle, getWords(checkTitle))){
// matches found
}
If this answer is better than the first, accept this one instead. Ask for any more help, or if it doesn't work.
Related
I am trying to capture the counts associated with the keywords in the string txt. All the keywords are loaded into an array ahead of time.
This code is in jquery/javascript. I cannot hard code string keywords so that is why they are stored in an array. Please assist me in finding what goes in place of "Reg Expression" before and/or after the keyword variable within the loop.
The html br can be used to end that regexmatch in that iteration of the loop.
Trying to end up with keywordCount = "2, 5, 11"
//String I need to search through
var txt = "Edit Req'd2<br>Errors5<br>Pndg App11<br>";
//array of keywords I can use to find associated counts to keywords
var keyword = ["Edit Req'd", "Errors", "Pndg App"];
//empty string declared before loop
var keywordCount = '';
for (i = 0; i < keyword.length; i++) {
// takes the comma off end of first entry in array
// might not be needed or another method might be better?
keyword[i] = $.trim(keyword[i]);
//regex expression generated using keyword and unknown expression
var regexmatch = RegExp("Reg Expression" + keyword + "Reg Expression")
//use regex expression to generate string containing counts
keywordCount += (txt.match(regexmatch)) + ",";
}
Here is the example which may helps you in achieving your required output.
//String I need to search through
var txt = "Edit Req'd2<br>Errors5<br>Pndg App11<br>";
//array of keywords I can use to find associated counts to keywords
var keyword = ["Edit Req'd", "Errors", "Pndg App"];
//empty string declared before loop
var keywordCount = '';
var i = 0;
var keywordCount = "";
var splittedValue = "";
while (i < keyword.length) {
if (txt.indexOf(keyword[i]/i)) {
splittedValue = txt.split(keyword[i]);
if (keywordCount === "") {
keywordCount = splittedValue[1].split("<br>")[0];
} else {
keywordCount += ", " + splittedValue[1].split("<br>")[0];
}
}
i += 1;
}
console.log(keywordCount);
I would use a digit/numeric range to match the number ([0-9]; this is pretty basic regex stuff), and use a group to match any of the keywords:
Any number of digits: [0-9]+
Any of your keywords: (something|somethingelse|other)
You can use capture-groups as well to have match() return them separately:
var keyword = ["Edit Req'd", "Errors", "Pndg App"];
var regexmatch = RegExp('(\b' + keyword.join('|') + ')([0-9]+)', 'g')
(note that we use the word-boundary \b to make sure keyword is not part of a longer word, and the 'g' flag for global to signal we want multiple results)
Now, match() will only match one result, and we want matches for every keyword, so we’ll use exec() instead:
var txt = "Edit Req'd2<br>Errors5<br>Pndg App11<br>";
var keyword = ["Edit Req'd", "Errors", "Pndg App"];
var regex = RegExp('(\b' + keyword.join('|') + ')([0-9]+)', 'g')
var match = regex.exec( txt ); // ...find the first match
var numbers = [];
while ( match !== null ) {
numbers.push( match[ 2 ]); // #0 of exec() its return-value is the result as a whole: "Edit Req'd2"
// #1 is the value of the first capture-group: "Edit Req'd"
// #2 is the value of the second capture-group: "2"
match = regex.exec( txt ); // ...find the next match
}
console.log("the numbers are:", numbers);
Lastly, do note that regular expressions may look cool, they are not always the fastest (performance-wise). If performance matters a lot, you could use (for example) indexOf() instead.
From your question it seems like you could brush up on your knowledge of regular expressions a little bit. There’s a ton of articles around (just search for “regular expressions basics” or “regex 101”) – like this one:
https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285*
I have made this very simplified version of a translation tool similar to Google Translate. The idea is to build this simple tool for a minority language in sweden called "jamska". The app is built up with a function that takes the string from a textarea with the ID #svenska and replaces words in the string using RegExp.
I've made an array called arr that's used in a for loop of the function as a dictionary. Each array item looks like this: var arr = [["eldröd", "eillrau"], ["oväder", "over"] ...]. The first word in each array item is in swedish, and the second word is in jamska. If the RegExp finds a matching word in the loop it replaces that word using this code:
function translate() {
var str = $("#svenska").val();
var newStr = "";
for (var i = 0; i < arr.length; i++) {
var replace = arr[i][0];
var replaceWith = arr[i][1];
var re = new RegExp('(^|[^a-z0-9åäö])' + replace + '([^a-z0-9åäö]|$)', 'ig');
str = str.replace(re, "$1" + replaceWith + '$2');
}
$("#jamska").val(str);
}
The translate() is then called in an event handler for when the #svenska textarea gets a keyup, like this: $("#svenska").keyup(function() { translate(); });
The translated string is then assigned as the value of another textarea with the ID #jamska. So far, so good.
I have a problem though: if the translated word in jamska also is a word in swedish, the function translates that word too. This problem is occurring because I'm assigning the variable str to the translated version of the same variable, using: str = str.replace(re, "$1" + replaceWith + '$2');. The function is using the same variable over and over again to perform the translation.
Example:
The swedish word "brydd" is "fel" in jamska. "Fel" is also a word in swedish, so the word that I get after the translation is "felht", since the swedish word "fel" is "felht" in jamska.
Does anyone have any idea for how to work around this problem?
Instead of looking for each Jamska word in the input and replacing them with the respective translation, I would recommend to find any word ([a-z0-9åäö]+) in your text and replace this word either with its translation if one is found in the dictionary or with itself otherwise:
//var arr = [["eldröd", "eillrau"], ["oväder", "over"] ...]
// I'd better use dictionary instead of array to define your dictionary
var dict = {
eldröd: "oväder",
eillrau: "over"
// ...
};
var str = "eldröd test eillrau eillrau oväder over";
var translated = str.replace(/[a-z0-9åäö]+/ig, function(m) {
var word = m.toLowerCase();
var trans = dict[word];
return trans === undefined ? word : trans;
});
console.log(translated);
Update:
If dictionary keys may be represented by phrases (i.e. technically appear as strings with spaces), the regex should be extended to include all these phrases explicitly. So the final regex would look like
(?:phrase 1|phrase 2|etc...)(?![a-z0-9åäö])|[a-z0-9åäö]+
It will try to match one of the phrases explicitly first and only then single words. The (?![a-z0-9åäö]) lookbehind helps to filter out phrases immediately followed by letters (e.g. varken bättre eller sämreåäö).
Phrases immediately preceded by letters are implicitly filtered out by the fact that a match is either the fist one (and therefore is not preceded by any letter) or it's not the first and therefore the previous one is separated from the current by some spaces.
//var arr = [["eldröd", "eillrau"], ["oväder", "over"] ...]
// I'd better use dictionary instead of array to define your dictionary
var dict = {
eldröd: "oväder",
eillrau: "over",
bättre: "better",
"varken bättre eller sämre": "vär å int viller",
"test test": "double test"
// ...
};
var str = "eldröd test eillrau eillrau oväder over test test ";
str += "varken bättre eller sämre ";
str += "don't trans: varken bättre eller sämreåäö";
str += "don't trans again: åäövarken bättre eller sämre";
var phrases = Object.keys(dict)
.filter(function(k) { return /\s/.test(k); })
.sort(function(a, b) { return b.length - a.length; })
.join('|');
var re = new RegExp('(?:' + phrases + ')(?![a-z0-9åäö])|[a-z0-9åäö]+', 'ig');
var translated = str.replace(re, function(m) {
var word = m.toLowerCase();
var trans = dict[word];
return trans === undefined ? word : trans;
});
console.log(translated);
I'm learning how to use regular expressions and am a bit confused by something hopefully some one can clarify, if I use the following string and expression I get the expected results with match but the exact opposite if I use split. Beating my head against the wall I don't understand why?
var a = "212,0,,456,,0,67889";
var patt = /,\d{1,5},/gmi;
pos=a.match(patt);
alert(pos);// returns ,0, ,456, and ,0,
pos=a.split(patt);
alert(pos); //returns 212, and ,67889
Split means, look for a match of the pattern on the string and split that string every time you see a match. Also Remove each match you find.
This link has some good examples:
http://www.tizag.com/javascriptT/javascript-string-split.php
"~ a delimiter is used by the split function as a way of breaking up the string. Every time it sees the delimiter we specified, it will create a new element in an array. The first argument of the split function is the delimiter." (The delimiter is the pattern)
Example one:
<script type="text/javascript">
var myString = "123456789";
var mySplitResult = myString.split("5");
document.write("The first element is " + mySplitResult[0]);
document.write("<br /> The second element is " + mySplitResult[1]);
</script>
Output:
The first element is 1234
The second element is 6789
"Make sure you realize that because we chose the 5 to be our delimiter, it is not in our result. This is because the delimiter is removed from the string and the remaining characters are separated by the chasm of space that the 5 used to occupy."
Example Two:
<script type="text/javascript">
var myString = "zero one two three four";
var mySplitResult = myString.split(" ");
for(i = 0; i < mySplitResult.length; i++){
document.write("<br /> Element " + i + " = " + mySplitResult[i]);
}
</script>
Output:
Element 0 = zero
Element 1 = one
Element 2 = two
Element 3 = three
Element 4 = four
Consecutive splits are ignored, so you are just getting the two strings that don't match the regex.
You can use split or match to achieve the same, but you need different regex. For instance, you can use for match:
\d+
Working demo
Code
var re = /\d+/g;
var str = '212,0,,456,,0,67889';
var m;
while ((m = re.exec(str)) !== null) {
if (m.index === re.lastIndex) {
re.lastIndex++;
}
// View your result using the m-variable.
// eg m[0] etc.
}
Or you can use this regex to split:
,+
Working demo
I'm writing a code for live replacement of specific words in a text field as the user types.
I'm using regex and javascript:
The first array has the regular expressions to be found, and the second array has the words that should replace any them.
source = new Array(/\srsrs\s/,/\sñ\s/,/\snaum\s/,/\svc\s/,/\scd\s/,/\sOq\s/,/\soke\s/,/\so\sq\s/,
/\soque\s/,/\soqe\s/,/\spq\s/,/\sq\s/,/\sp\/\s/g,/\spra\s/,/\sp\s/,/\stbm\s/,
/\stb\s/,/\std\s/,/\sblz\s/,/\saki\s/,/\svlw\s/,/\smara\s/,/\sqlq\s/,/\sqq\s/,
/\srpz\s/,/\smsm\s/,/\smto\s/,/\smtu\s/,/\sqro\s/,/\sqdo\s/,/\sqd\s/,/\sqnd\s/,
/\sqto\s/,/\sqm\s/,/\sjah\s/, /\sc\/\s/,/\scmg\s/,/\s\+\sou\s\-\s/,/\sflw\s/,
/\sxau\s/,/\sto\s/,/\sta\s/);
after = new Array("risos","não","não","você","cadê","o que","o que","o que","o que","o que","porque",
"que","para","para","para","também","também","tudo","beleza","aqui","valeu","maravilhoso",
"qualquer","qualquer","rapaz","mesmo","muito","muito","quero","quando","quando","quando",
"quanto","quem","Já","com","comego","mais ou menos","falow","tchau","estou","está");
This is the function that does the replacement:
function replacement(){
for(i=0; i<source.length; i++){
newtext = " "+document.getElementById("translation").value+" ";
console.log(newtext);
if(myregex = newtext.match(source[i])){
newafter = after[i];
rafael = myregex+" ";
document.getElementById("translation").value = document.getElementById("translation").value.replace(rafael, newafter);
}
}
}
My problem is every time the function is called to replace an expression with only one letter, the replacement is being made on the first occurrence of that letter, even within a word. I thought looking for that letter with \s before and after would solve it, but it didn't.
If you're looking only to match a word, you should put \b before and after (word boundary). This will ensure that you don't match parts of words. Also note that you are corrupting your regex by concatenating a string. Try this instead:
var in = document.getElementById("translation").value;
if( in.charAt(in.length-1) == " ") { // user has just finished typing a word
// this avoids interrupting the word being typed
var l = source.length, i;
for( i=0; i<l; i++) in = in.replace(source[i],after[i]);
document.getElementById("translation").value = in;
}
You need to add a g (global) modified to regexes so that it will replace all occurrences and use \b instead of \s to mark word boundaries.
source = new Array(/\brsrs\b/g,/\bñ\b/g, etc
On a side note, since all your regexes follow the same pattern it might be easier to just do:
source = new Array( 'rsr', 'ñ', 'naum', etc );
if( myregex = newtext.match( new Regexp( "\b"+source[i]+"\b", 'g' ) ) ) {
...
If by "live replacement" you mean calling function replacement at each keystroke then \b at the end will not help you, you should indeed use \s. However in your replacement function your are adding a space to the text field value so your single character words are triggering the replacement.
Here is my refactoring of your code :
(function () { // wrap in immediate function to hide local variables
source = [ [/\brsrs\s$/, "risos"], // place reg exp and replacement next to each other
[/\b(ñ|naum)\s$/, "não"], // note combined regexps
[/\bvc\s$/, "você"]
// ...
]; // not also use of array literals in place of new Array
document.getElementById ("translation").addEventListener ('keyup', function (ev) {
var t = this.value // fetch text area value
, m
, i = source.length;
while (i--) // for each possible match
if ((m = t.match(source[i][0]))) { // does this one match ?
// replace match : first remove the match string (m[0]) from the end of
// the text string, then add the replacement word followed by a space
this.value = t.slice (0, -m[0].length) + source[i][1] + ' ';
return; // done
}
}, false);
}) ();
And the fiddle is : http://jsfiddle.net/jFYuV
In a somewhat different style, you could create a function that encapsulated the list of substitutions:
var substitutions = {
"rsrs": "risos",
"ñ": "não",
"naum": "não",
"vc": "você",
// ...
};
var createSubstitutionFunction = function(subs) {
var keys = [];
for (var key in subs) {
if (subs.hasOwnProperty(key)) {
keys[keys.length] = key;
}
}
var regex = new RegExp("\\b" + keys.join("\\b|\\b") + "\\b", "g");
return function(text) {
return text.replace(regex, function(match) {
return subs[match];
});
};
};
var replacer = createSubstitutionFunction(substitutions);
You would use it like this:
replacer("Some text with rsrs and naum and more rsrs and vc")
// ==> "Some text with risos and não and more risos and você"
I have attempted to create a function that will replace multiple regular expression values from an array. This works if the array does not contain quotation marks of any kind, this is problematic when I want to use a comma in my pattern. So I've been trying to find an alternative way of serving the pattern with no luck. Any ideas?
function removeCharacters(str){
//ucpa = unwanted character pattern array
//var ucpa = [/{/g,/}/g,/--/g,/---/g,/-/g,/^.\s/];
var ucpa = ["/{/","/}/","/--/","/---/","/-/","/^.\s/","/^,\s/"];
for (var i = 0; i < ucpa.length; i++){
//does not work
var pattern = new RegExp(ucpa[i],"g");
var str = str.replace(pattern, " ");
}
return str;
}
WORKING:
function removeCharacters(str){
//ucpa = unwanted character pattern array
var ucpa = [/{/g,/}/g,/--/g,/---/g,/-/g,/^.\s/,/^,\s/];
for (var i = 0; i < ucpa.length; i++){
var str = str.replace(ucpa[i], " ");
}
return str;
}
REFINED:
function removeCharacters(str){
var pattern = /[{}]|-{1,3}|^[.,]\s/g;
str = str.replace(pattern, " ");
return str;
}
The RegExp constructor takes raw expressions, not wrapped in / characters.
Therefore, all of your regexes contain two /s each, which isn't what you want.
Instead, you should make an array of actual regex literals:
var ucpa = [ /.../g, /",\/.../g, ... ];
You can also wrap all that into a single regex:
var str = str.replace(/[{}]|-{1,3}|^[.,]\s/g, " ")
although I'm not sure if that's exactly what you want since some of your regexes are nonsensical, for example ,^\s could never match.