JavaScript Regex to find UOM in a string - javascript

I have a list of products that contains UOM in the product title. It needs automatically detect the UOM in the title by using Regex.
Expectations
Banana Yogurt 70ml returns ml
Fish Nuggets 200G returns g
Potato Wedges 200 G returns g
I have this function below
detectMetricUnit = (title) => {
let unit,
regex = new RegExp(/(?:\d)/mg),
measurement = title.match(regex) && title.match(regex)[0],
matches = measurement && title.split(measurement)[1];
if(matches) {
if(/millilitre|milliliter|ml/.test(matches.toLowerCase())){
unit = 'ml';
} else if(/litre|liter|l/.test(matches.toLowerCase())){
unit = 'l';
} else if (/kilogram|kg/.test(matches.toLowerCase())) {
unit = 'kg';
} else if (/gram|g/.test(matches.toLowerCase())) {
unit = 'g';
}
}
return unit;
}
However I have some problematic strings such as
Chocolate Drink 330ML X 24 matches 3 and return null UOM
which I am expecting to get ml.
Appreciate if someone could point out my mistake in my regex. How do I actually get the full integers and find the UOM attached next to it even with a space?

You may define a dictionary of possible UOMs you want to detect and then build a regex similar to
/(\d+(?:\.\d+)?)\s?(millilitre|milliliter|ml|litre|liter|l|kilogram|kg|gram|g)\b/i
See the regex demo. The (\d+(?:\.\d+)?) part will capture an integer or float value into Group 1, then \s? match an optional whitespace (change to \s* to match 0 or more whitespaces), and then (millilitre|milliliter|ml|litre|liter|l|kilogram|kg|gram|g)\b will capture UOM unit into Group 2 as a whole word (due to \b word boundary).
Here is the JS implementation to get the first UOM from string:
let strs = ['Banana Yogurt 70ml', 'Fish Nuggets 200G', 'Potato Wedges 200 G', 'Chocolate Drink 330ML X 24']
let dct = {millilitre: 'ml', milliliter: 'ml', ml: 'ml', litre:'l', liter: 'l', l: 'l', kilogram: 'kg', kg: 'kg', gram: 'g', g: 'g'}
detectMetricUnit = (title) => {
let unit, match, val,
regex = new RegExp("(\\d+(?:\\.\\d+)?)\\s?(" + Object.keys(dct).join("|") + ")\\b", "i");
match = title.match(regex);
if (match) {
val = match[1];
unit = dct[match[2].toLowerCase()]
}
return [val, unit];
}
strs.forEach(x => console.log(detectMetricUnit(x)) )
To get all of them, multiple occurrences:
let strs = ['Banana Yogurt 70ml and Fish Nuggets 200G', 'Potato Wedges 200 G and Chocolate Drink 330ML X 24']
let dct = {millilitre: 'ml', milliliter: 'ml', ml: 'ml', litre:'l', liter: 'l', l: 'l', kilogram: 'kg', kg: 'kg', gram: 'g', g: 'g'}
detectMetricUnit = (title) => {
let match, results = [],
regex = new RegExp("(\\d+(?:\\.\\d+)?)\\s?(" + Object.keys(dct).join("|") + ")\\b", "ig");
while (match=regex.exec(title)) {
results.push([ match[1], dct[match[2].toLowerCase()] ]);
}
return results;
}
strs.forEach(x => console.log(x, detectMetricUnit(x)) )

Related

How to divide a DocumentFragment based on character offset

I have a string that (potentially) contains HTML tags.
I want to split it into smaller valid HTML strings based on (text) character length. The use case is essentially pagination. I know the length of text that can fit on a single page. So I want to divide the target string into "chunks" or pages based on that character length. But I need each of the resulting pages to contain valid HTML without unclosed tags, etc.
So for example:
const pageCharacterSize = 10
const testString = 'some <strong>text with HTML</strong> tags
function paginate(string, pageSize) { //#TODO }
const pages = paginate(testString, pageCharacterSize)
console.log(pages)
// ['some <strong>text </strong>', '<strong>with HTML</strong> ', 'tags']
I think this is possible to do with a DocumentFragment or Range but I can't figure out how slice the pages based on character offsets.
This MDN page has a demo that does something close to what I need. But it uses caretPositionFromPoint() which takes X, Y coordinates as arguments.
Update
For the purposes of clarity, here are the tests I'm working with:
import { expect, test } from 'vitest'
import paginate from './paginate'
// 1
test('it should chunk plain text', () => {
// a
const testString = 'aa bb cc dd ee';
const expected = ['aa', 'bb', 'cc', 'dd', 'ee']
expect(paginate(testString, 2)).toStrictEqual(expected)
// b
const testString2 = 'a a b b c c';
const expected2 = ['a a', 'b b', 'c c']
expect(paginate(testString2, 3)).toStrictEqual(expected2)
// c
const testString3 = 'aa aa bb bb cc cc';
const expected3 = ['aa aa', 'bb bb', 'cc cc']
expect(paginate(testString3, 5)).toStrictEqual(expected3)
// d
const testString4 = 'aa bb cc';
const expected4 = ['aa', 'bb', 'cc']
expect(paginate(testString4, 4)).toStrictEqual(expected4)
// e
const testString5 = 'a b c d e f g';
const expected5 = ['a b c', 'd e f', 'g']
expect(paginate(testString5, 5)).toStrictEqual(expected5)
// f
const testString6 = 'aa bb cc';
const expected6 = ['aa bb', 'cc']
expect(paginate(testString6, 7)).toStrictEqual(expected6)
})
// 2
test('it should chunk an HTML string without stranding tags', () => {
const testString = 'aa <strong>bb</strong> <em>cc dd</em>';
const expected = ['aa', '<strong>bb</strong>', '<em>cc</em>', '<em>dd</em>']
expect(paginate(testString, 3)).toStrictEqual(expected)
})
// 3
test('it should handle tags that straddle pages', () => {
const testString = '<strong>aa bb cc</strong>';
const expected = ['<strong>aa</strong>', '<strong>bb</strong>', '<strong>cc</strong>']
expect(paginate(testString, 2)).toStrictEqual(expected)
})
Here is a solution that assumes and supports the following:
tags without attributes (you could tweak the regex to support that)
well formed tags assumed, e.g. not: <b><i>wrong nesting</b></i>, missing <b>end tag, missing start</b> tag
tags may be nested
tags are removed & later restored for proper characters per page count
page split is done by looking backwards for first space
function paginate(html, pageSize) {
let splitRegex = new RegExp('\\s*[\\s\\S]{1,' + pageSize + '}(?!\\S)', 'g');
let tagsInfo = []; // saved tags
let tagOffset = 0; // running offset of tag in plain text
let pageOffset = 0; // page offset in plain text
let openTags = []; // open tags carried over to next page
let pages = html.replace(/<\/?[a-z][a-z0-9]*>/gi, (tag, pos) => {
let obj = { tag: tag, pos: pos - tagOffset };
tagsInfo.push(obj);
tagOffset += tag.length;
return '';
}).match(splitRegex).map(page => {
let nextOffset = pageOffset + page.length;
let prefix = openTags.join('');
tagsInfo.slice().reverse().forEach(obj => {
if(obj.pos >= pageOffset && obj.pos < nextOffset) {
// restore tags in reverse order to maintain proper position
page = page.substring(0, obj.pos - pageOffset) + obj.tag + page.substring(obj.pos - pageOffset);
}
});
tagsInfo.forEach(obj => {
let tag = obj.tag;
if(obj.pos >= pageOffset && obj.pos < nextOffset) {
if(tag.match(/<\//)) {
// remove tag from openTags list
tag = tag.replace(/<\//, '<');
let index = openTags.indexOf(tag);
if(index >= 0) {
openTags.splice(index, 1);
}
} else {
// add tag to openTags list
openTags.push(tag);
}
}
});
pageOffset = nextOffset;
let postfix = openTags.slice().reverse().map(tag => tag.replace(/</, '</')).join('');
page = prefix + page.trim() + postfix;
return page.replace(/<(\w+)><\/\1>/g, ''); // remove tags with empty content
});
return pages;
}
[
{ str: 'some <strong>text <i>with</i> HTML</strong> tags, and <i>some <b>nested tags</b> sould be <b>supported</b> as well</i>.', size: 16 },
{ str: 'a a b b c c', size: 3 },
{ str: 'aa aa bb bb cc cc', size: 5 },
{ str: 'aa bb cc', size: 4 },
{ str: 'aa <strong>bb</strong> <em>cc dd</em>', size: 3 },
{ str: '<strong>aa bb cc</strong>', size: 2 }
].forEach(o => {
let pages = paginate(o.str, o.size);
console.log(pages);
});
Output:
[
"some <strong>text <i>with</i></strong>",
"<strong> HTML</strong> tags, and",
"<i>some <b>nested tags</b></i>",
"<i> sould be</i>",
"<i><b>supported</b> as</i>",
"<i>well</i>."
]
[
"a a",
"b b",
"c c"
]
[
"aa aa",
"bb bb",
"cc cc"
]
[
"aa",
"bb",
"cc"
]
[
"aa",
"<strong>bb</strong>",
" <em>cc</em>",
"<em>dd</em>"
]
[
"<strong>aa</strong>",
"<strong>bb</strong>",
"<strong>cc</strong>"
]
Update
Based on new request in comment I fixed the split regex from '[\\s\\S]{1,' + pageSize + '}(?!\\S)' to '\\s*[\\s\\S]{1,' + pageSize + '}(?!\\S)', e.g. added \\s* to catch leading spaces. I also added a page.trim() to remove leading spaces. Finally I added a few of the OP examples.

Find the longest anagram with array javascript

I try to find the longest anagram in Javascript. For this, I have an array with 10 letters and a dictionary that contains every words.
I would like that the program test every combination possible.
We started from 10 (the array length of letters) and we check if it's an anagram
If not, we remove the char at the very end, and we check, if not, we shift the removed char by one to the left... When the entire combinations with 9 letters is tested, we test for 8, 7, 6, 5, 4, 3, 2 letters.
var wordFound = '' // The longest word found
var copyArr = [] // I don't manipulate the lettersChosen array, so I save a copy in copyArr
var savedWord = [] // A copy of copyArr but i'm not sure about this
var lengthLetters = 0 // The length of the numbers left
var lettersChosen = ['A', 'S', 'V', 'T', 'S', 'E', 'A', 'M', 'N'] //This the the array of letters
function isAnagram(stringA, stringB) {
stringA = stringA.toLowerCase().replace(/[\W_]+/g, "");
stringB = stringB.toLowerCase().replace(/[\W_]+/g, "");
const stringASorted = stringA.split("").sort().join("");
const stringBSorted = stringB.split("").sort().join("");
return stringASorted === stringBSorted;
}
function checkForEachWord(arr) {
strLetters = ''
for (i in arr)
strLetters = strLetters + arr[i]
for (var i in file)
if (isAnagram(strLetters, file[i])) {
wordFound = file[i]
return true
}
return false
}
function getOneOfTheLongestWord() {
lettersChosen.forEach(letter => {
copyArr.push(letter) // I copy the array
})
var index = 1 // The index of the letter to remove
var countLetter = 1 // How much letters I have to remove
var position = copyArr.length - index // The actual position to remove
var savedArray = [] // The copy of CopyArr but i'm not sure about that
var iteration = 0 // The total of combination possible
var test = checkForEachWord(copyArr) // I try with 10 letters
if (test == true)
return true // I found the longest word
while (test == false) {
copyArr.splice(position, 1) // I remove the char at current position
index++ // Change letter to remove
if (index > copyArr.length + 1) { // If I hit the first character, then restart from the end
index = 1
countLetter++ // Remove one more letter
}
console.log(copyArr + ' | ' + position)
position = copyArr.length - index // Get the position based on the actual size of the array letters
test = checkForEachWord(copyArr) // Test the anagram
copyArr = [] // Reset array
lettersChosen.forEach(letter => { // Recreate the array
copyArr.push(letter)
})
}
return true // Word found
}
getOneOfTheLongestWord()
My code is not optimal there is so many way to improve it.
Actually my output is good with 9 letters.
copyArr | position
A,S,V,T,S,E,A,M | 8
A,S,V,T,S,E,M,N | 6
A,S,V,T,S,A,M,N | 5
A,S,V,T,E,A,M,N | 4
A,S,V,S,E,A,M,N | 3
A,S,T,S,E,A,M,N | 2
A,V,T,S,E,A,M,N | 1
S,V,T,S,E,A,M,N | 0
But not with 8 letters, I don't see how I can use my countLetter to test all combinations...
Thank you very much.
Short answer, put the sorted versions of dictionary words into a trie, then do an A* search.
Longer answer because you probably haven't encountered those things.
A trie is a data structure which at each point gives you a lookup by character of the next level of the trie. You can just use a blank object as a trie. Here is some simple code to add a word to one.
function add_to_trie (trie, word) {
let letters = word.split('').sort();
for (let i in letters) {
let letter = letters[i];
if (! trie[letter]) {
trie[letter] = {};
}
trie = trie[letter];
}
trie['final'] = word;
}
An A* search simply means that we have a priority queue that gives us the best option to look at next. Rather than implement my own priority queue I will simply use an existing one at flatqueue. It returns the lowest priority possible. So I'll use as a priority one that puts the longest possible word first, and if there is a tie then goes with whatever word we are farthest along on. Here is an implementation.
import FlatQueue from "flatqueue";
function longest_word_from (trie, letters) {
let sorted_letters = letters.sort();
let queue = new FlatQueue();
// Entries will be [position, current_length, this_trie]
// We prioritize the longest word length first, then the
// number of characters. Since we get the minimum first,
// we make priorities negative numbers.
queue.push([0, 0, trie], - (letters.length ** 2));
while (0 < queue.length) {
let entry = queue.pop();
let position = entry[0];
let word_length = entry[1];
let this_trie = entry[2];
if (position == letters.length) {
if ('final' in this_trie) {
return this_trie['final'];
}
}
else {
if (letters[position] in this_trie) {
queue.push(
[
position + 1, // Advance the position
word_length + 1, // We added a letter
this_trie[letters[position]] // And the sub-trie after that letter
],
- letters.length * (
letters.length + position - word_length
) - word_length - 1
);
}
queue.push(
[
position + 1, // Advance the position
word_length, // We didn't add a a letter
this_trie // And stayed at the same position.
],
- letters.length * (
letters.length + position - word_length - 1
) - word_length
);
}
}
return null;
}
If the import doesn't work for you, you can simply replace that line with the code from index.js. Simply remove the leading export default and the rest will work.
And with that, here is sample code that demonstrates it in action.
let file = ['foo', 'bar', 'baz', 'floop'];
let letters = 'fleaopo'.split('')
let this_trie = {};
for (var i in file) {
add_to_trie(this_trie, file[i]);
}
console.log(longest_word_from(this_trie, letters));
If you have a long dictionary, loading the dictionary into the trie is most of your time. But once you've done that you can call it over and over again with different letters, and get answers quite quickly.

Bra size validation with RegExp (US, EU, Japan, Australia)

I want to check if an input is a valid bra measurement. In the US, bra sizes are written with an even number 28-48 and a letter A-I, AAA, AA, DD, DDD, HH or HHH. The EU, Japan and Australia use different numbers and patterns, ex. 90C C90 and DD6.
-I want to split the letters and digits, check that the letter is between A - I or AA, AAA, DD, DDD, HH or HHH, and that the number is 28 - 48 (even numbers only), 60-115 (increments of 5, so 65, 70, 75, etc.) or 6-28 even numbers only.
var input = $("#form_input").val("");
var bust = input.match(/[\d\.]+|\D+/g);
var vol = bust[0];
var band = bust[1];
I can write a long test condition:
if ((vol > 28 && vol < 48) && band == "AAA" || band == "AA" || band == "A" || band == "B" || etc.) { //some code
} else { error message" }```
How do I shorten this and do the above things using regex?
It is a bit of a long pattern with the alternatives, but you can easily adjust the ranges if something is missing or matches too much.
You can first check if the pattern matches using test. To get the band and the vol matches, one option is to extract either the digits or the uppercase chars from the match as there are matches for example for 90C and C90
^(?:(?:28|3[02468]|4[02468])(?:AA?|[BC]|D{1,4}|[E-I])|(?:[6-9][05]|1[01][05])(?:AA?|[BC]|DD?|[E-I])|[A-I](?:[6-9][05]|1[01][05])|(?:[68]|1[02468]|2[0246])(?:AA?|[BC]|DD?|[E-I]))$
Explanation
^ Start of string
(?: Non capture group for the alternatives
(?:28|3[02468]|4[02468]) Match from 28 - 48 in steps of 2
(?:AA?|[BC]|D{1,4}|[E-I]) Match AA, A, B, C, 1-4 times a D or a range E-I
| Or
(?:[6-9][05]|1[01][05]) Match from 60 - 115 insteps of 5
(?:AA?|[BC]|DD?|[E-I]) Match AA, A, B, C DD, D or a range E-I
| Or
[A-I](?:[6-9][05]|1[01][05]) Match a range A-I and a number 60 - 115 in steps of 5
| Or
(?:[68]|1[02468]|2[0246]) Match from 6 - 26 in steps of 2
(?:AA?|[BC]|DD?|[E-I]) Match AA, A, B, C, DD, D or a range E-I
) Close alternation
$ End of string
Regex demo
const pattern = /^(?:(?:28|3[02468]|4[02468])(?:AA?|[BC]|D{1,4}|[E-I])|(?:[6-9][05]|1[01][05])(?:AA?|[BC]|DD?|[E-I])|[A-I](?:[6-9][05]|1[01][05])|(?:[68]|1[02468]|2[0246])(?:AA?|[BC]|DD?|[E-I]))$/;
const str = `28A
28AA
30B
34AA
36DDDD
D70
I115
A70
H80
6AA
26I
`;
str.split('\n').forEach(s => {
if (pattern.test(s)) {
console.log(`Match: ${s}`);
let vol = s.match(/\d+/)[0];
let band = s.match(/[A-Z]+/)[0];
console.log(`vol: ${vol}`);
console.log(`band: ${band}`);
console.log("---------------------------------------");
}
})
^(((([0-4])(0|2|4|6|8))|(6|8))|(((6|7|8|9)(0|5))|(1[01][05])))((AAA)|(AA)|(DD)|(DDD)|(HH)|(HHH)|[A-I])$
Proof that all valid sizes match, while all 100_464 sample invalid sizes do not:
const validNumbers = Array
.from({ length: 22 }, (_, i) => 6 + (i * 2))
.concat(Array.from({ length: 12 }, (_, i) => 60 + (i * 5)));
const validLetters = [
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
'AAA', 'AA', 'DD', 'DDD', 'HH', 'HHH'
];
const validSizes = validNumbers.map((number) => validLetters
.map((letter) => number + letter))
.flat();
const invalidNumbers = Array
.from({ length: 1_000 }, (_, i) => i)
.filter((n) => !validNumbers.includes(n))
const invalidLetters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.split('')
.map((letter) => Array.from({ length: 4 }, (_, i) => letter.repeat(i + 1)))
.flat();
const invalidSizes = invalidNumbers.map((number) => invalidLetters
.map((letter) => number + letter))
.flat();
const regex = /^(((([0-4])(0|2|4|6|8))|(6|8))|(((6|7|8|9)(0|5))|(1[01][05])))((AAA)|(AA)|(DD)|(DDD)|(HH)|(HHH)|[A-I])$/;
const falsePositives = invalidSizes.filter((size) => regex.test(size));
console.log({ falsePositives });
console.log({ validSizes: validSizes.map((size) => ({ size, isValid: regex.test(size) })) });

Reorder Data in Log Files - Javascript

I'm trying to solve the Reorder Data in Log Files algorithm.
You have an array of logs. Each log is a space delimited string of words.
For each log, the first word in each log is an alphanumeric identifier. Then, either:
Each word after the identifier will consist only of lowercase letters, or;
Each word after the identifier will consist only of digits.
We will call these two varieties of logs letter-logs and digit-logs. It is guaranteed that each log has at least one word after its identifier.
Reorder the logs so that all of the letter-logs come before any digit-log. The letter-logs are ordered lexicographically ignoring identifier, with the identifier used in case of ties. The digit-logs should be put in their original order.
Return the final order of the logs.
Example:
Input: logs = ["dig1 8 1 5 1","let1 art can","dig2 3 6","let2 own kit dig","let3 art zero"]
Output: ["let1 art can","let3 art zero","let2 own kit dig","dig1 8 1 5 1","dig2 3 6"]
My idea is having a map for the digits and one for the letters. I have done it. Then, I would need to sort the digits and letters and add all the sorted letters to my answer array and all the sorted digits to my answer array.
var reorderLogFiles = function(logs) {
if(!logs || logs.length === 0)
return [];
let numbers = {
'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6,
'7': 7, '8': 8, '9': 9
};
let digits = new Map();
let letters = new Map();
for(let i=0; i<logs.length; i++) {
const log = logs[i].split(" ");
if(numbers[log[1]] !== undefined)
digits.set(log[0], log.splice(1, log.length));
else
letters.set(log[0], log.splice(1, log.length));
}
// How can I sort letter and digits?
let ans = [];
for(const [key, value] of sortedLetters) {
const temp = key + " " + value.join(" ");
ans.push(temp);
}
for(const [key, value] of sortedDigits) {
const temp = key + " " + value.join(" ");
ans.push(temp);
}
return ans;
};
I think you can simplify your code somewhat. First, create the digits and letters groups by filtering the original logs; this can be made easier by first splitting all the values in logs. Next, sort the letters based on the second value in the array and add the digits to the end of the sorted array. Finally, join the strings back together:
const reorderLogFiles = logs => {
// split values on first space
logs = logs.map(v => v.split(/\s+(.*)/).filter(Boolean));
// filter into digits and letters
let digits = logs.filter(v => v[1].match(/^[\s\d]+$/));
let letters = logs.filter(v => v[1].match(/^[a-z\s]+$/));
// sort the letters
letters.sort((a, b) => (c = a[1].localeCompare(b[1])) ? c : a[0].localeCompare(b[0]));
// reassemble the list
result = letters.concat(digits);
// and convert back to strings
result = result.map(a => a.join(' '));
return result;
}
let logs = ["dig1 8 1 5 1", "let1 art can", "dig2 3 6", "let2 own kit dig", "let3 art zero"];
console.log(reorderLogFiles(logs));
logs = ["a1 9 2 3 1", "g1 act car", "zo4 4 7", "ab1 off key dog", "a8 act zoo", "a2 act car"];
console.log(reorderLogFiles(logs));
Note this code can be written more compactly by chaining operations but I've written it out more fully to make it easier to follow.
If you don't want to use regex, you can test the first character of each substring to see if it's a digit or letter. For example:
let digits = logs.filter(v => v[1][0] >= '0' && v[1][0] <= '9');
let letters = logs.filter(v => v[1][0] >= 'a' && v[1][0] <= 'z');

How to convert human readable memory size into bytes?

I'm trying to convert strings that match /(\d)+(\.\d+)?(m|g|t)?b?/i into bytes.
For example, 1KB would return 1024. 1.2mb would return 1258291.
If you reorganize the capturing group in your regex like so: /(\d+(?:\.\d+)?)\s?(k|m|g|t)?b?/i
you can do something like:
function unhumanize(text) {
var powers = {'k': 1, 'm': 2, 'g': 3, 't': 4};
var regex = /(\d+(?:\.\d+)?)\s?(k|m|g|t)?b?/i;
var res = regex.exec(text);
return res[1] * Math.pow(1024, powers[res[2].toLowerCase()]);
}
unhumanize('1 Kb')
# 1024
unhumanize('1 Mb')
# 1048576
unhumanize('1 Gb')
# 1073741824
unhumanize('1 Tb')
# 1099511627776
You've already got a capturing group for the unit prefix, now all you need is a lookup table:
{ 'k', 1L<<10 },
{ 'M', 1L<<20 },
{ 'G', 1L<<30 },
{ 'T', 1L<<40 },
{ 'P', 1L<<50 },
{ 'E', 1L<<60 }
Demo: http://ideone.com/5O7Vp
Although 1258291 is clearly far too many significant digits to get from 1.2MB.
oops, I gave a C# example. The method is still good though.
One liner solution:
"1.5 MB".replace(/(\d+)+(\.(\d+))?\s?(k|m|g|t)?b?/i, function(value, p1, p2, p3, p4) { return parseFloat(p1 + (p2 || ""))*({ 'K' : 1<<10, 'M' : 1<<20, 'G' : 1<<30, 'T' : 1<<40 }[p4] || 1); })
# 1572864

Categories

Resources