Ignoring punctuation in JavaScript - javascript

I am working on a Caeser cipher encryptor with JavaScript for a university project and have so far managed to get it working fine.
The only problem is, I need to retain the punctuation ( ,!, ? etc) after encryption and I'm having difficulties. Here is my current code:
function encryptor() {
var alphabet = "abcdefghijklmnopqrstuvwxyz";
var letter = "";
var encWord = "";
var userInput = document.getElementById("message").value;
var key = parseInt(window.prompt("Enter a key:"));
for (i = 0; i < userInput.length; i++) {
letter = userInput[i];
var shift = alphabet.indexOf(letter);
if (letter.match("/[^A-Za-z0-9_]/")) {
continue;
} else if (shift < 23) {
var encLetter = alphabet[shift + key];
} else {
var encLetter = alphabet[shift - (26 - key)];
}
encWord += encLetter;
}
document.getElementById("encWord").innerHTML = encWord;
}
Could anyone please point me in the right direction please?
Thanks.

Before the continue you should add letter to encWord
if (letter.match(/[^A-Za-z0-9_]/)) {
encWord += letter;
continue;
}
Otherwise it will just skip it.

For keepint the punctuation, you could check, if the character is in the alphabe string and if not, then use the punctuation for this item.
function encryptor() {
var alphabet = "abcdefghijklmnopqrstuvwxyz",
letter,
encWord = "",
userInput = document.getElementById('message').value,
key = parseInt(window.prompt("Enter a key:")),
shift,
i;
for (i = 0; i < userInput.length; i++) {
letter = userInput[i];
shift = alphabet.indexOf(letter);
if (shift === -1) {
encWord += letter;
continue;
}
encWord += alphabet[(shift + key) % alphabet.length];
}
document.getElementById("encWord").innerHTML = encWord;
}
<input id="message" type="text" onchange="encryptor()"><br>
<div id="encWord"></div>

The easiest method would be to maintain an "alphabet" and simply shift along it, using indexOf to get each symbols location.
Here is an implementation:
var caesarCipher = (function() {
var alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#¤%&/()=?`½§#£$€{[]}|\'*';
return function caesarCipher(message, offset) {
if (offset === void 0) {
offset = 1;
}
return message.replace(/./igm, function(letter) {
if (alphabet.indexOf(letter) >= 0) {
return alphabet[((alphabet.indexOf(letter) + offset) + alphabet.length) % alphabet.length];
}
return letter;
});
};
})();
//TEST
var chipherOffset = 7;
var message = "Hello World.";
caesarCipher(message, chipherOffset);
console.log(caesarCipher(message, chipherOffset));
console.log(caesarCipher(caesarCipher(message, chipherOffset), 0 - chipherOffset));
Notice how i "forgot" to add . to my alphabet. That is the risk you run when maintaining your own alphabet.
Another solution would be to use charCodeAt and String.fromCharCode and simply use the build-in alphabet already in the browser:
var caesarCipher = (function () {
var alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#¤%&/()=?`½§#£$€{[]}|\'*';
return function caesarCipher(message, offset) {
if (offset === void 0) { offset = 1; }
return message.replace(/./igm, function (letter) {
return String.fromCharCode(letter.charCodeAt(0) + offset);
});
};
})();
//TEST
var chipherOffset = 7;
var message = "Hello World.";
caesarCipher(message, chipherOffset);
console.log(caesarCipher(message, chipherOffset));
console.log(caesarCipher(caesarCipher(message, chipherOffset), 0 - chipherOffset));
This solution fixes the "forgot-to-add-symbol" issue, but you risk getting some very weird symbols like "\n", which can really mess your strings up!

Related

How can I extract all contained characters in a String? [duplicate]

I have a string with repeated letters. I want letters that are repeated more than once to show only once.
Example input: aaabbbccc
Expected output: abc
I've tried to create the code myself, but so far my function has the following problems:
if the letter doesn't repeat, it's not shown (it should be)
if it's repeated once, it's show only once (i.e. aa shows a - correct)
if it's repeated twice, shows all (i.e. aaa shows aaa - should be a)
if it's repeated 3 times, it shows 6 (if aaaa it shows aaaaaa - should be a)
function unique_char(string) {
var unique = '';
var count = 0;
for (var i = 0; i < string.length; i++) {
for (var j = i+1; j < string.length; j++) {
if (string[i] == string[j]) {
count++;
unique += string[i];
}
}
}
return unique;
}
document.write(unique_char('aaabbbccc'));
The function must be with loop inside a loop; that's why the second for is inside the first.
Fill a Set with the characters and concatenate its unique entries:
function unique(str) {
return String.prototype.concat.call(...new Set(str));
}
console.log(unique('abc')); // "abc"
console.log(unique('abcabc')); // "abc"
Convert it to an array first, then use Josh Mc’s answer at How to get unique values in an array, and rejoin, like so:
var nonUnique = "ababdefegg";
var unique = Array.from(nonUnique).filter(function(item, i, ar){ return ar.indexOf(item) === i; }).join('');
All in one line. :-)
Too late may be but still my version of answer to this post:
function extractUniqCharacters(str){
var temp = {};
for(var oindex=0;oindex<str.length;oindex++){
temp[str.charAt(oindex)] = 0; //Assign any value
}
return Object.keys(temp).join("");
}
You can use a regular expression with a custom replacement function:
function unique_char(string) {
return string.replace(/(.)\1*/g, function(sequence, char) {
if (sequence.length == 1) // if the letter doesn't repeat
return ""; // its not shown
if (sequence.length == 2) // if its repeated once
return char; // its show only once (if aa shows a)
if (sequence.length == 3) // if its repeated twice
return sequence; // shows all(if aaa shows aaa)
if (sequence.length == 4) // if its repeated 3 times
return Array(7).join(char); // it shows 6( if aaaa shows aaaaaa)
// else ???
return sequence;
});
}
Using lodash:
_.uniq('aaabbbccc').join(''); // gives 'abc'
Per the actual question: "if the letter doesn't repeat its not shown"
function unique_char(str)
{
var obj = new Object();
for (var i = 0; i < str.length; i++)
{
var chr = str[i];
if (chr in obj)
{
obj[chr] += 1;
}
else
{
obj[chr] = 1;
}
}
var multiples = [];
for (key in obj)
{
// Remove this test if you just want unique chars
// But still keep the multiples.push(key)
if (obj[key] > 1)
{
multiples.push(key);
}
}
return multiples.join("");
}
var str = "aaabbbccc";
document.write(unique_char(str));
Your problem is that you are adding to unique every time you find the character in string. Really you should probably do something like this (since you specified the answer must be a nested for loop):
function unique_char(string){
var str_length=string.length;
var unique='';
for(var i=0; i<str_length; i++){
var foundIt = false;
for(var j=0; j<unique.length; j++){
if(string[i]==unique[j]){
foundIt = true;
break;
}
}
if(!foundIt){
unique+=string[i];
}
}
return unique;
}
document.write( unique_char('aaabbbccc'))
In this we only add the character found in string to unique if it isn't already there. This is really not an efficient way to do this at all ... but based on your requirements it should work.
I can't run this since I don't have anything handy to run JavaScript in ... but the theory in this method should work.
Try this if duplicate characters have to be displayed once, i.e.,
for i/p: aaabbbccc o/p: abc
var str="aaabbbccc";
Array.prototype.map.call(str,
(obj,i)=>{
if(str.indexOf(obj,i+1)==-1 ){
return obj;
}
}
).join("");
//output: "abc"
And try this if only unique characters(String Bombarding Algo) have to be displayed, add another "and" condition to remove the characters which came more than once and display only unique characters, i.e.,
for i/p: aabbbkaha o/p: kh
var str="aabbbkaha";
Array.prototype.map.call(str,
(obj,i)=>{
if(str.indexOf(obj,i+1)==-1 && str.lastIndexOf(obj,i-1)==-1){ // another and condition
return obj;
}
}
).join("");
//output: "kh"
<script>
uniqueString = "";
alert("Displays the number of a specific character in user entered string and then finds the number of unique characters:");
function countChar(testString, lookFor) {
var charCounter = 0;
document.write("Looking at this string:<br>");
for (pos = 0; pos < testString.length; pos++) {
if (testString.charAt(pos) == lookFor) {
charCounter += 1;
document.write("<B>" + lookFor + "</B>");
} else
document.write(testString.charAt(pos));
}
document.write("<br><br>");
return charCounter;
}
function findNumberOfUniqueChar(testString) {
var numChar = 0,
uniqueChar = 0;
for (pos = 0; pos < testString.length; pos++) {
var newLookFor = "";
for (pos2 = 0; pos2 <= pos; pos2++) {
if (testString.charAt(pos) == testString.charAt(pos2)) {
numChar += 1;
}
}
if (numChar == 1) {
uniqueChar += 1;
uniqueString = uniqueString + " " + testString.charAt(pos)
}
numChar = 0;
}
return uniqueChar;
}
var testString = prompt("Give me a string of characters to check", "");
var lookFor = "startvalue";
while (lookFor.length > 1) {
if (lookFor != "startvalue")
alert("Please select only one character");
lookFor = prompt(testString + "\n\nWhat should character should I look for?", "");
}
document.write("I found " + countChar(testString, lookFor) + " of the<b> " + lookFor + "</B> character");
document.write("<br><br>I counted the following " + findNumberOfUniqueChar(testString) + " unique character(s):");
document.write("<br>" + uniqueString)
</script>
Here is the simplest function to do that
function remove(text)
{
var unique= "";
for(var i = 0; i < text.length; i++)
{
if(unique.indexOf(text.charAt(i)) < 0)
{
unique += text.charAt(i);
}
}
return unique;
}
The one line solution will be to use Set. const chars = [...new Set(s.split(''))];
If you want to return values in an array, you can use this function below.
const getUniqueChar = (str) => Array.from(str)
.filter((item, index, arr) => arr.slice(index + 1).indexOf(item) === -1);
console.log(getUniqueChar("aaabbbccc"));
Alternatively, you can use the Set constructor.
const getUniqueChar = (str) => new Set(str);
console.log(getUniqueChar("aaabbbccc"));
Here is the simplest function to do that pt. 2
const showUniqChars = (text) => {
let uniqChars = "";
for (const char of text) {
if (!uniqChars.includes(char))
uniqChars += char;
}
return uniqChars;
};
const countUnique = (s1, s2) => new Set(s1 + s2).size
a shorter way based on #le_m answer
let unique=myArray.filter((item,index,array)=>array.indexOf(item)===index)

Counting Words Between Two Variable Strings

Total newbie + first time poster here with very little experience though I feel this problem is one I could solve with the help of some generous strangers.
I am querying a GDoc and attempting to create a function to count words between two strings for two possible end strings, for example:
Example #1
Definitive Title
*Count these words*
===============
OR
Example #2
Definitive Title
*Count these words*
Other words that are in a table
Definitive Title
*Count these other different words*
===============
In both of the above examples I looking to count the words between a pre-defined string and an end string.
If I ran the function that I am trying to create on Example #1 I am hoping it'd return 3 words. For Example #2 I'd hope that my function returns 8 words.
So far my function looks like this:
function doPost(e) {
var docUrl = e.parameter.docUrl
var text = DocumentApp.openByUrl(docUrl).getBody().getText()
var wordCount = text.split(" ").length
return ContentService.createTextOutput(wordCount.toString()).setMimeType(ContentService.MimeType.TEXT)
}
This returns a word count for the entire document. Any advice to point me in the right direction?
For more dynamic, appropriate and accurate solution, execute the following snippets before the split () function. Regular Expressions often used to provide dynamic solutions. It is a must have skill.
text = text.replace(/(^\s*)|(\s*$)/gi,""); // remove the start and end spaces of the string (like trim ())
text = text.replace(/[ ]{2,}/gi," "); // filter out one or more spaces
text = text.replace(/\n /,"\n"); // filter out news lines with spacing at beginning
wordCount = text.split(" ").length;
Here is a solution to your problem you can log the difference of characters and words or you can log the total amount of words or characters in the two sentaces. You are also going to want to put the bigger sentence on top, otherwise it will give you a negative number.
var x = "count these words";
var y = "count words";
function findCharDif(word1, word2) {
var word1length = word1.length;
var word2length = word2.length;
var difference = word1length - word2length;
var total = word1length + word2length;
console.log(difference);
console.log(total);
}
function findWordDif(sentence1, sentence2) {
var words1 = 0;
var words2 = 0;
for (var i = 0; i < sentence1.length; i++) {
if (sentence1[i] == " ") {
words1++;
} else {
continue
}
}
for (var a = 0; a < sentence2.length; a++) {
if (sentence2[a] == " ") {
words2++;
} else {
continue
}
}
var difference = (words1 + 1) - (words2 + 1); // this logs out the difference of words between the sentences
var totalWords = (words1 + 1) + (words2 + 1); // this logs out the total amount of words
console.log(difference);
console.log(totalWords);
}
findCharDif(x, y);
findWordDif(x, y);
The below code seems to have worked! Was able to sit down with someone and solve it with them:
function doPost(e) {
var docUrl = e.parameter.docUrl
/*
var text = DocumentApp.openByUrl(docUrl).getBody().getText()
var wordCount = text.split(" ").length
*/
var wordCount = countScenario2(docUrl);
return ContentService.createTextOutput(wordCount.toString()).setMimeType(ContentService.MimeType.TEXT)
}
/**
* Count the words from Start Test to a table or ====
*/
function countScenario2(docUrl) {
//var docUrl = 'https://docs.google.com/document/d/';
var doc = DocumentApp.openByUrl(docUrl);
var body = doc.getBody();
var reference = body.findText('Start Text');
var start = getIndex('Start Text', body);
var tables = body.getTables();
var count = 0;
for(var j = 1; j < tables.length ; j ++) {
var end = body.getChildIndex(tables[j]);
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
var reference = body.findText('Start Text', reference);
var element = reference.getElement();
var start = body.getChildIndex(element.getParent());
}
var end = getIndex('=========================================================', body);
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
return count ;
}
/**
* This will return the index of the element
*
* #param {string} keyword The text to be found
* #param {Body} body This is the body of the document
*/
function getIndex(keyword, body, previous) {
var reference = body.findText(keyword, previous);
var element = reference.getElement();
return body.getChildIndex(element.getParent());
}
/************ */
function testPost(){
var e = {parameter:{docUrl:'https://docs.google.com/document/d/'}};
var result = doPost(e);
console.log(JSON.stringify(result.getContent()));}
/**
* Count the words from Start Text to ====
*/
function countScenario1(docUrl) {
//var docUrl = 'https://docs.google.com/document/d/';
var doc = DocumentApp.openByUrl(docUrl);
var body = doc.getBody();
var start = getIndex('Start Text', body);
var end = getIndex('=========================================================', body);
var count = 0;
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
return count;
}
function test(){
var docUrl = 'https://docs.google.com/document/d/';
var wordCount = countScenario2(docUrl);
console.log(wordCount);
}
As what #Rishabh K said in his answer, you should definitely want to replace trailing spaces and multiple spaces to avoid inaccurate results.
However on the other hand, I don't think it answers the OP's question. Correct me if I'm wrong but I think this is what you want:
var sample1 = `This is the start identifier
These words should be included
As well As these ones
Even this
Until it ends
now
Ending identifier
These words shouldn't be included
If any of these appears, the logic is wrong`;
var sample2 = sample1 + `
This is the start identifier
These some few words
should also be included in the result set
Ending identifier`;
var sample3 = sample2 + `
This is the start identifier
Although we have the start identifier above
These words shouldn't be included
because there is no corresponding end identifier`;
function getWordDiffBetween(source, str1, str2) {
// make sure newSource, str1 and str2 are all strings
var args = Array.prototype.slice.call(arguments);
args.forEach(function(str, idx) {
if (typeof str !== 'string') {
throw `Argument ${[idx + 1]} is not a string.`;
}
});
var startId = '<==start==>',
endId = '<==end==>';
var newSource = source.replace(new RegExp(str1, 'g'), startId) // replace the start identifier with our own
.replace(new RegExp(str2 + '|={2,}', 'g'), endId) // replace the end identifier with our own
.replace(/(^\s*)|(\s*$)/gi, "") // remove the start and end spaces of the string (like trim ())
.replace(/\s+/g, ' ') //replace all 1 or more spaces/newline/linefeed with a single space
//separate text into words which are separated by a space since we replaced all newlines with space
var words = newSource.split(' ');
// get the indexes where the start and end identifiers occured
var strOneIdx = getAllIndexes(words, startId, true);
var strTwoIdx = getAllIndexes(words, endId, true);
var results = [], // we will store our results here
i;
for (i = 0; i < strOneIdx.length; i++) {
var idxOne = strOneIdx[i]; // current index for str1
var idxTwo = strTwoIdx.find(x => x > idxOne);
//make sure that idxOne has a partner
if (idxTwo) {
var wordsInBetween = words.slice(idxOne + 1, idxTwo); //get range between idxOne and idxTwo
results = results.concat(wordsInBetween); // add the result
}
}
return results;
}
function getAllIndexes(arr, val) {
var indexes = [],
i;
for (i = 0; i < arr.length; i++) {
if (arr[i] === val) {
indexes.push(i);
}
}
return indexes;
}
var startIdentifier = 'This is the start identifier',
endIdentifier = 'Ending identifier',
wordResults = {
sample1: getWordDiffBetween(sample1, startIdentifier, endIdentifier),
sample2: getWordDiffBetween(sample2, startIdentifier, endIdentifier),
sample3: getWordDiffBetween(sample3, startIdentifier, endIdentifier) //should be equal to sample2
};
console.log(wordResults);
We have 2 functions - getWordDiffBetween and getAllIndexes. For explanation, check the comments I added in noteworthy lines.
Edit (updated snippet above):
It seems like you also want "====================" included as your end identifier. This can be done by changing the code:
.replace(new RegExp(str2, 'g'), endId) // replace the end identifier with our own
into
.replace(new RegExp(str2 + '|={2,}', 'g'), endId) // replace the end identifier with our own
which means match occurence of your <end string> or if there is 2 or more occurences of =. You can also change the number 2 in {2,} to your desired count.

Reducing duplicate characters in a string to a given minimum

I was messing around with the first question here: Reduce duplicate characters to a desired minimum and am looking for more elegant answers than what I came up with. It passes the test but curious to see other solutions. The sample tests are:
reduceString('aaaabbbb', 2) 'aabb'
reduceString('xaaabbbb', 2) 'xaabb'
reduceString('aaaabbbb', 1) 'ab'
reduceString('aaxxxaabbbb', 2) 'aaxxaabb'
and my solution (that passes these tests):
reduceString = function(str, amount) {
var count = 0;
var result = '';
for (var i = 0; i < str.length; i++) {
if (str[i] === str[i+1]) {
count++;
if (count < amount) {
result += str[i];
}
} else {
count = 0;
result += str[i];
}
};
return result;
}
Just use regular expressions.
var reduceString = function (str, amount) {
var re = new RegExp("(.)(?=\\1{" + amount + "})","g");
return str.replace(re, "");
}
I guess my best solution would be like
var str = "axxxaabbbbcaaxxxaab",
redStr = (s,n) => s.replace(/(\w)\1+/g,"$1".repeat(n));
console.log(redStr(str,2));
I tried to make it as short as possible:
reduceString = function(str, amount) {
var finalString = '', cL = '', counter;
str.split('').forEach(function(i){
if (i !== cL) counter = 0;
counter++;
cL = i;
if (counter <= amount ) finalString = finalString + i;
});
return finalString;
}
You can use reg expression instead. tested in javascript.
how it works:
(.) //match any character
\1 //if it follow by the same character
+{2 //more than 1 times
/g //global
$1 //is 1 time by $1$1 is 2 times
reduceString('aaaabbbb', 2)
reduceString('xaaabbbb', 2)
reduceString('aaaabbbb', 1)
reduceString('aaxxxaabbbb', 2)
function reduceString(txt,num)
{
var canRepeat=['$1'];
for (i=1;i<num;i++)
{
canRepeat.push('$1')
}
canRepeat = canRepeat.join('');
console.log(txt.replace(/(.)\1{2,}/g, canRepeat))
}
With regex:
var reduceString = function(str, amount) {
var x = [ ...new Set(str) ];
for (var c of x){
var rex = new RegExp(c + '{'+amount+',}','g');
str = str.replace(rex,string(c,amount));
}
return str;
};
var string = function(c,amount){
for(var i=0,s="";i<amount;i++)s+=c;
return s;
};
Up above regex solutions are much more better, but here is my accepted solution with reduce:
make an array from string via spread operator
Check the previous item
find how many times char is repeated in result string
otherwise concat result string with the current char
Don`t forget to use the second argument as the initial value, and return for each cases
reduceString = function(str, amount) {
return [...str].reduce(((res, cur)=>{
if(res.length && cur === res[res.length-1]){
dupsCount = [...res].filter(char => char === cur).length
if(dupsCount===amount){
return res;
}
else {
res+=cur;
return res;
}
}
res+=cur;
return res;
}),"")
}

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
String.prototype.getCodePointLength= function() {
return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1;
};
String.fromCodePoint= function() {
var chars= Array.prototype.slice.call(arguments);
for (var i= chars.length; i-->0;) {
var n = chars[i]-0x10000;
if (n>=0)
chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF));
}
return String.fromCharCode.apply(null, chars);
};
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
String.prototype.ucLength = function() {
// this solution was taken from
// http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
return this.length - this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length + 1;
};
}
if (!String.prototype.codePointAt) {
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
String.fromCodePoint = function () {
var strChars = [], codePoint, offset, codeValues, i;
for (i = 0; i < arguments.length; ++i) {
codePoint = arguments[i];
offset = codePoint - 0x10000;
if (codePoint > 0xFFFF){
codeValues = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
} else{
codeValues = [codePoint];
}
strChars.push(String.fromCharCode.apply(null, codeValues));
}
return strChars.join("");
};
}
if (!String.prototype.ucCharAt) {
String.prototype.ucCharAt = function (ucIndex) {
var str = String(this);
var codePoint = str.codePointAt(ucIndex);
var ucChar = String.fromCodePoint(codePoint);
return ucChar;
};
}
if (!String.prototype.ucIndexOf) {
String.prototype.ucIndexOf = function (searchStr, ucStart) {
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = 0;
}
var str = String(this);
var strUCLength = str.ucLength();
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i < strUCLength){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i++;
}
return -1;
};
}
if (!String.prototype.ucLastIndexOf) {
String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = strUCLength - 1;
}
if (ucStart >= strUCLength){
ucStart = strUCLength - 1;
}
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i >= 0){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i--;
}
return -1;
};
}
if (!String.prototype.ucSlice) {
String.prototype.ucSlice = function (ucStart, ucStop) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = strUCLength + ucStart;
if (ucStart < 0){ ucStart = 0;}
}
if (typeof(ucStop) == 'undefined'){
ucStop = strUCLength - 1;
}
if (ucStop < 0){
ucStop = strUCLength + ucStop;
if (ucStop < 0){ ucStop = 0;}
}
var ucChars = [];
var i = ucStart;
while (i < ucStop){
ucChars.push(str.ucCharAt(i));
i++;
}
return ucChars.join("");
};
}
if (!String.prototype.ucSplit) {
String.prototype.ucSplit = function (delimeter, limit) {
var str = String(this);
var strUCLength = str.ucLength();
var ucChars = [];
if (delimeter == ''){
for (var i = 0; i < strUCLength; i++){
ucChars.push(str.ucCharAt(i));
}
ucChars = ucChars.slice(0, 0 + limit);
} else{
ucChars = str.split(delimeter, limit);
}
return ucChars;
};
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
function countCodePoints( str )
{
const i = str[Symbol.iterator]();
let count = 0;
while( !i.next().done ) ++count;
return count;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
String.prototype.magicLength = function()
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
k++;
}
return k;
}
String.prototype.magicCharAt = function(n)
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
if (k == n) return c + "";
k++;
}
return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To overcome special emojis as the one in the comment, one can search for the connection charecter (char code 8205 in UTF-16) and make some modifications. Here is how:
let myStr = "👩‍👩‍👧‍👧😃𝌆"
let arr = [...myStr]
for (i = arr.length-1; i--; i>= 0) {
if (arr[i].charCodeAt(0) == 8205) { // special combination character
arr[i-1] += arr[i] + arr[i+1]; // combine them back to a single emoji
arr.splice(i, 2)
}
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
it seems that JS uses the 8205 char code to represent UCS-2 characters as a UTF-16 combinations.

What's the best way to count keywords in JavaScript?

What's the best and most efficient way to count keywords in JavaScript? Basically, I'd like to take a string and get the top N words or phrases that occur in the string, mainly for the use of suggesting tags. I'm looking more for conceptual hints or links to real-life examples than actual code, but I certainly wouldn't mind if you'd like to share code as well. If there are particular functions that would help, I'd also appreciate that.
Right now I think I'm at using the split() function to separate the string by spaces and then cleaning punctuation out with a regular expression. I'd also want it to be case-insensitive.
Cut, paste + execute demo:
var text = "Text to be examined to determine which n words are used the most";
// Find 'em!
var wordRegExp = /\w+(?:'\w{1,2})?/g;
var words = {};
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
var word = matches[0].toLowerCase();
if (typeof words[word] == "undefined")
{
words[word] = 1;
}
else
{
words[word]++;
}
}
// Sort 'em!
var wordList = [];
for (var word in words)
{
if (words.hasOwnProperty(word))
{
wordList.push([word, words[word]]);
}
}
wordList.sort(function(a, b) { return b[1] - a[1]; });
// Come back any time, straaanger!
var n = 10;
var message = ["The top " + n + " words are:"];
for (var i = 0; i < n; i++)
{
message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
(wordList[i][1] == 1 ? "" : "s"));
}
alert(message.join("\n"));
Reusable function:
function getTopNWords(text, n)
{
var wordRegExp = /\w+(?:'\w{1,2})?/g;
var words = {};
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
var word = matches[0].toLowerCase();
if (typeof words[word] == "undefined")
{
words[word] = 1;
}
else
{
words[word]++;
}
}
var wordList = [];
for (var word in words)
{
if (words.hasOwnProperty(word))
{
wordList.push([word, words[word]]);
}
}
wordList.sort(function(a, b) { return b[1] - a[1]; });
var topWords = [];
for (var i = 0; i < n; i++)
{
topWords.push(wordList[i][0]);
}
return topWords;
}
Once you have that array of words cleaned up, and let's say you call it wordArray:
var keywordRegistry = {};
for(var i = 0; i < wordArray.length; i++) {
if(keywordRegistry.hasOwnProperty(wordArray[i]) == false) {
keywordRegistry[wordArray[i]] = 0;
}
keywordRegistry[wordArray[i]] = keywordRegistry[wordArray[i]] + 1;
}
// now keywordRegistry will have, as properties, all of the
// words in your word array with their respective counts
// this will alert (choose something better than alert) all words and their counts
for(var keyword in keywordRegistry) {
alert("The keyword '" + keyword + "' occurred " + keywordRegistry[keyword] + " times");
}
That should give you the basics of doing this part of the work.
Try to split you string on words and count the resulting words, then sort on the counts.
This builds upon a previous answer by insin by only having one loop:
function top_words(text, n) {
// Split text on non word characters
var words = text.toLowerCase().split(/\W+/)
var positions = new Array()
var word_counts = new Array()
for (var i=0; i<words.length; i++) {
var word = words[i]
if (!word) {
continue
}
if (typeof positions[word] == 'undefined') {
positions[word] = word_counts.length
word_counts.push([word, 1])
} else {
word_counts[positions[word]][1]++
}
}
// Put most frequent words at the beginning.
word_counts.sort(function (a, b) {return b[1] - a[1]})
// Return the first n items
return word_counts.slice(0, n)
}
// Let's see if it works.
var text = "Words in here are repeated. Are repeated, repeated!"
alert(top_words(text, 3))
The result of the example is: [['repeated',3], ['are',2], ['words', 1]]
I would do exactly what you have mentioned above to isolate each word. I would then probably add each word as the index of an array with the number of occurrences as the value.
For example:
var a = new Array;
a[word] = a[word]?a[word]+1:1;
Now you know how many unique words there are (a.length) and how many occurrences of each word existed (a[word]).

Categories

Resources