How can I implement a JavaScript function to calculate the frequency of each word in a given sentence?
this is my code:
// Reads the text of the element with id 'txt', splits it on single spaces and
// logs a running "count of <word> - <n>" line once per input word.
function search () {
var data = document.getElementById('txt').value;
var temp = data;
var words = new Array();
words = temp.split(" ");
// uniqueWords[k] holds a distinct word and count[k] its tally; both are stored
// at the index k where that word first appeared in `words`.
var uniqueWords = new Array();
var count = new Array();
for (var i = 0; i < words.length; i++) {
//var count=0;
// f is a flag: set to 1 once words[i] matches an already-recorded word.
var f = 0;
// NOTE: j is assigned without a declaration, so it becomes an implicit global.
for (j = 0; j < uniqueWords.length; j++) {
if (words[i] == uniqueWords[j]) {
count[j] = count[j] + 1;
//uniqueWords[j]=words[i];
f = 1;
}
}
// First occurrence: record the word and its count at index i (not at the end
// of the arrays), which leaves unassigned slots for repeated words.
if (f == 0) {
count[i] = 1;
uniqueWords[i] = words[i];
}
// NOTE: this logs slot i during the scan; for a repeated word that slot was
// never assigned, and later occurrences have not been counted yet.
console.log("count of " + uniqueWords[i] + " - " + count[i]);
}
}
I am unable to trace the problem. Any help is greatly appreciated.
output in this format:
count of is - 1
count of the - 2..
input: this is anil is kum the anil
Here is a JavaScript function to get the frequency of each word in a sentence:
/**
 * Counts how often each whitespace-separated word occurs in a string.
 *
 * Full stops are stripped before splitting; no other punctuation is removed
 * and the comparison is case-sensitive.
 *
 * @param {string} string - The sentence to analyse.
 * @returns {Object<string, number>} Map of word to number of occurrences.
 */
function wordFreq(string) {
  // Split on runs of whitespace so repeated spaces do not produce empty words.
  var words = string.replace(/[.]/g, '').split(/\s+/);
  var freqMap = {};
  words.forEach(function(w) {
    // Leading/trailing whitespace (or an empty input) still yields '' tokens.
    if (w === '') {
      return;
    }
    // Test for an OWN property: a plain truthiness check would pick up
    // inherited members such as "constructor" or "toString".
    if (!Object.prototype.hasOwnProperty.call(freqMap, w)) {
      freqMap[w] = 0;
    }
    freqMap[w] += 1;
  });
  return freqMap;
}
It will return a hash of word to word count. So for example, if we run it like so:
console.log(wordFreq("I am the big the big bull."));
> Object {I: 1, am: 1, the: 2, big: 2, bull: 1}
You can iterate over the words with Object.keys(result).sort().forEach(function (word) {...}). So we could hook that up like so:
var freq = wordFreq("I am the big the big bull.");
Object.keys(freq).sort().forEach(function(word) {
console.log("count of " + word + " is " + freq[word]);
});
Which would output:
count of I is 1
count of am is 1
count of big is 2
count of bull is 1
count of the is 2
JSFiddle: http://jsfiddle.net/ah6wsbs6/
And here is wordFreq function in ES6:
/**
 * Counts how often each whitespace-separated word occurs in a string (ES6).
 *
 * Full stops are stripped before splitting; matching is case-sensitive.
 *
 * @param {string} string - The sentence to analyse.
 * @returns {Object<string, number>} Map of word to number of occurrences.
 */
function wordFreq(string) {
  return string
    .replace(/[.]/g, '')
    .split(/\s+/)
    // Drop the empty tokens produced by leading/trailing whitespace.
    .filter((word) => word !== '')
    .reduce((map, word) => {
      // Own-property test: `map[word]` alone would see inherited members
      // such as "constructor" and corrupt the count.
      const seen = Object.prototype.hasOwnProperty.call(map, word);
      map[word] = seen ? map[word] + 1 : 1;
      return map;
    }, {});
}
JSFiddle: http://jsfiddle.net/r1Lo79us/
I feel you have over-complicated things by having multiple arrays, strings, and engaging in frequent (and hard to follow) context-switching between loops, and nested loops.
Below is the approach I would encourage you to consider taking. I've inlined comments to explain each step along the way. If any of this is unclear, please let me know in the comments and I'll revisit to improve clarity.
(function () {

  /* The text to analyse. It could just as easily be read from a form control. */
  var text = "I I am am am yes yes.";

  /* Every run of word characters (letters, digits, underscore) in the text. */
  var tokens = text.match( /\w+/g );

  /* Array.prototype.reduce folds the token list into a single object whose
     keys are the words and whose values are their counts. */
  var tally = tokens.reduce(function ( seen, token ) {

    /* `seen` is the object built up so far; `token` is the current word.
       A word we already know gets its count bumped, a new word starts at 1. */
    seen[ token ] = seen.hasOwnProperty( token ) ? seen[ token ] + 1 : 1;

    /* Hand the accumulator back so the next pass keeps adding to it. */
    return seen;

  }, {} );

  /* Show the finished word -> count object. */
  console.log( tally );

}());
const sentence = 'Hi my friend how are you my friend';

/**
 * Counts the occurrences of each space-separated word in a sentence.
 *
 * The sentence is split once and tallied in a Map (a single pass) instead of
 * re-splitting and filtering the whole sentence for every word.
 *
 * @param {string} sentence - Text to analyse; words are separated by one space.
 * @returns {Array<{element: {word: string, nr: string}}>} One entry per
 *   distinct word, in order of first appearance, where `nr` is formatted as
 *   "<count> occurrence".
 */
const countWords = (sentence) => {
  const tally = new Map();
  for (const word of sentence.split(" ")) {
    tally.set(word, (tally.get(word) || 0) + 1);
  }
  // Map iteration follows insertion order, i.e. first appearance of each word.
  return Array.from(tally, ([word, total]) => ({
    element: {
      word,
      nr: total + ' occurrence',
    },
  }));
};

console.log(countWords(sentence));
Here is an updated version of your own code...
<!DOCTYPE html>
<html>
<head>
<title>string frequency</title>
<style type="text/css">
#text{
width:250px;
}
</style>
</head>
<body >
<textarea id="txt" cols="25" rows="3" placeholder="add your text here"> </textarea></br>
<button type="button" onclick="search()">search</button>
<script >
/**
 * Reads the text of the #txt element, counts each space-separated word and
 * logs the resulting word -> count object to the console.
 */
function search()
{
  var data = document.getElementById('txt').value;
  var words = data.split(" ");
  // Plain object used as an associative array: word -> number of occurrences.
  var unique = {};
  for (var i = 0; i < words.length; i++) {
    var word = words[i];
    console.log(word);
    // Own-property test: `word in unique` also matches inherited members such
    // as "constructor" or "toString", which would turn their count into NaN.
    if (Object.prototype.hasOwnProperty.call(unique, word))
    {
      console.log("word found");
      unique[word] += 1;
    }
    else
    {
      console.log("word NOT found");
      unique[word] = 1;
    }
  }
  console.log(unique);
}
</script>
</body>
I think your loop was overly complicated. Also, trying to produce the final count while still doing your first pass over the array of words is bound to fail because you can't test for uniqueness until you have checked each word in the array.
Instead of all your counters, I've used a Javascript object to work as an associative array, so we can store each unique word, and the count of how many times it occurs.
Then, once we exit the loop, we can see the final result.
Also, this solution uses no regex ;)
I'll also add that it's very hard to count words just based on spaces. In this code, "one, two, one" will result in "one," and "one" being treated as different, unique words.
While the answers here are correct (and arguably better approaches), none of them addresses the OP's actual question: what is wrong with his code?
The problem with OP's code is here:
if(f==0){
count[i]=1;
uniqueWords[i]=words[i];
}
On every new word (unique word) the code adds it to uniqueWords at index at which the word was in words. Hence there are gaps in uniqueWords array. This is the reason for some undefined values.
Try printing uniqueWords. It should give something like:
["this", "is", "anil", 4: "kum", 5: "the"]
Note that there is no element at index 3.
Also the printing of final count should be after processing all the words in the words array.
Here's corrected version:
/**
 * Reads the text of the #txt element, counts each space-separated word and,
 * once every word has been processed, logs one "count of <word> - <n>" line
 * per distinct word.
 */
function search()
{
  var data = document.getElementById('txt').value;
  var words = data.split(" ");
  // uniqueWords[k] / count[k] are filled at the index k where a word first
  // appears in `words`, so both arrays can contain unassigned slots.
  var uniqueWords = [];
  var count = [];
  var i;
  var j; // declared locally: an undeclared `j` leaks a global and throws in strict mode
  for (i = 0; i < words.length; i++) {
    var f = 0; // becomes 1 when words[i] was already recorded
    for (j = 0; j < uniqueWords.length; j++) {
      if (words[i] === uniqueWords[j]) {
        count[j] = count[j] + 1;
        f = 1;
      }
    }
    if (f === 0) {
      count[i] = 1;
      uniqueWords[i] = words[i];
    }
  }
  // Print only after the whole input has been counted, skipping empty slots.
  for (i = 0; i < uniqueWords.length; i++) {
    if (typeof uniqueWords[i] !== 'undefined')
      console.log("count of "+uniqueWords[i]+" - "+count[i]);
  }
}
I have just moved the printing of the count out of the processing loop into a new loop and added an "if not undefined" check.
Fiddle: https://jsfiddle.net/cdLgaq3a/
I had a similar assignment. This is what I did:
Assignment : Clean the following text and find the most frequent word (hint, use replace and regular expressions).
const sentence = '%I $am#% a %tea#cher%, &and& I lo%#ve %te#a#ching%;. The#re $is no#th#ing; &as& mo#re rewarding as educa#ting &and& #emp%o#weri#ng peo#ple. ;I found tea#ching m%o#re interesting tha#n any ot#her %jo#bs. %Do#es thi%s mo#tiv#ate yo#u to be a tea#cher!? %Th#is 30#Days&OfJavaScript &is al#so $the $resu#lt of &love& of tea&ching'
console.log(`\n\n 03.Clean the following text and find the most frequent word (hint, use replace and regular expressions) \n\n ${sentence} \n\n`)
console.log(`Cleared sentence : ${sentence.replace(/[.,\/#!$%\^&\*;:{}=\-_`~()#]/g, "")}`)
console.log(mostFrequentWord(sentence))
/**
 * Strips punctuation/symbol noise from a sentence and reports its most
 * frequent word.
 *
 * The text is lower-cased, so counting is case-insensitive. Whole words are
 * tallied (not substrings), and when several words share the highest count
 * the one that appears first wins.
 *
 * @param {string} sentence - Raw text, possibly littered with symbols.
 * @returns {string} A message of the form
 *   `\n Count of most frequent word "<word>" is <count>`; for input without
 *   any word the word is `null` and the count `0`.
 */
function mostFrequentWord(sentence) {
  const cleaned = sentence.replace(/[.,\/#!$%\^&\*;:{}=\-_`~()#]/g, "").trim().toLowerCase()

  // Tally whole words; a Map keeps them in order of first appearance.
  const tally = new Map()
  for (const token of cleaned.split(/\s+/)) {
    if (token === "") continue // removing symbols can leave empty tokens
    tally.set(token, (tally.get(token) || 0) + 1)
  }

  // Keep the running best; strict `>` means the earliest word wins a tie.
  let word = null
  let count = 0
  for (const [candidate, total] of tally) {
    if (total > count) {
      count = total
      word = candidate
    }
  }
  return `\n Count of most frequent word "${word}" is ${count}`
}
I'd go with Sampson's match-reduce method for slightly better efficiency. Here's a modified version of it that is more production-ready. It's not perfect, but it should cover the vast majority of scenarios (i.e., "good enough").
/**
 * Calculates case-insensitive word frequencies for a piece of prose.
 *
 * Quotes, brackets, dashes, ellipses and sentence punctuation are stripped
 * first; apostrophes and hyphens inside words are kept.
 *
 * @param {string} s - The text to analyse.
 * @returns {Object<string, number>} Map of lower-cased word to its count;
 *   an empty object when the text contains no words.
 */
function calcWordFreq(s) {
  // Normalize
  var text = s.toLowerCase();
  // Strip quotes and brackets (a quoted phrase is replaced by its content)
  text = text.replace(/["“”(\[{}\])]|\B['‘]([^'’]+)['’]/g, '$1');
  // Strip dashes and ellipses
  text = text.replace(/[‒–—―…]|--|\.\.\./g, ' ');
  // Strip punctuation marks
  text = text.replace(/[!?;:.,]\B/g, '');
  // match() yields null when nothing matches (empty or whitespace-only text).
  var aWords = text.match(/\S+/g) || [];
  return aWords.reduce(function(oFreq, sWord) {
    if (Object.prototype.hasOwnProperty.call(oFreq, sWord)) ++oFreq[sWord];
    else oFreq[sWord] = 1;
    return oFreq;
  }, {});
}
calcWordFreq('A ‘bad’, “BAD” wolf-man...a good ol\' spook -- I\'m frightened!') returns
{
"a": 2
"bad": 2
"frightened": 1
"good": 1
"i'm": 1
"ol'": 1
"spook": 1
"wolf-man": 1
}
Related
I am working on an anagram generator and am trying to break off each new item in the array onto a new line. The way this works is that it slices each array item and loops through each character.
The output needs to be:
cat, cta, act, atc, tca, tac,
bat, bta, abt, atb, tba, tab,
rat, rta, art, atr, tra, tar,
But it is:
cat, cta, act, atc, tca, tac, bat, bta, abt, atb, tba, tab, rat, rta, art, atr, tra, tar, splat, splta, spalt, spatl,...
So far the code I have is this:
HTML:
<div id="anagrams"></div>
JS:
var arr = ['cat', 'bat', 'rat', 'splat'];
// Builds every distinct rearrangement of the characters of each string in
// `arr` and returns them as an array of strings.
var allAnagrams = function(arr) {
// Used as a set: each finished rearrangement becomes a key, so duplicates collapse.
var anagrams = {};
arr.forEach(function(str) {
// `ana` is the prefix built so far, `str` the characters not yet used.
var recurse = function(ana, str) {
// No characters left: `ana` is a complete rearrangement, record it.
if (str === '')
anagrams[ana] = 1;
// Move each remaining character in turn to the end of the prefix and recurse on the rest.
for (var i = 0; i < str.length; i++)
recurse(ana + str[i], str.slice(0, i) + str.slice(i + 1));
};
// The prefix starts as a single space, so every key begins with ' '.
recurse(' ', str);
});
return Object.keys(anagrams);
}
document.getElementById("anagrams").innerHTML = (allAnagrams(arr));
To accomplish a new line per array item I basically want to check if the amount of the characters exceeds the amount of characters in the string/array item and if it does, insert a break into the HTML. I tried doing that by:
var arr = ['cat', 'bat', 'rat', 'splat'];
// Attempt to emit a line break after the rearrangements of each input string.
// Builds every distinct rearrangement of each string in `arr` and returns them
// as an array of strings.
var allAnagrams = function(arr) {
// Used as a set: each finished rearrangement becomes a key.
var anagrams = {};
arr.forEach(function(str) {
// `ana` is the prefix built so far, `str` the characters not yet used.
var recurse = function(ana, str) {
// No characters left: `ana` is a complete rearrangement, record it.
if (str === '')
anagrams[ana] = 1;
for (var i = 0; i < str.length; i++)
recurse(ana + str[i], str.slice(0, i) + str.slice(i + 1));
// check if string length is greater than the count and
// if it is, insert a break between the string
// NOTE: the loop above exits with i === str.length, so this condition is never true.
if (i > str.length) {
recurse(' <br>', str);
}
};
// The prefix starts as a single space, so every key begins with ' '.
recurse(' ', str);
});
return Object.keys(anagrams);
}
document.getElementById("anagrams").innerHTML = (allAnagrams(arr));
However it still prints across a single line. Am I approaching this the correct way? I also tried using ana in place of i but I think I need to use i since that's the actual count - is that correct?
A jsfiddle can be seen here: https://jsfiddle.net/4eqhd1m4/1/
I would slightly restructure the anagram creation.
Anagrams is now a string.
Recurse no longer takes care of adding break lines. Considering you want a break per element, it's cleaner to add it in the Array.forEach
jsfiddle
Edit
Adding a second jsfiddle to demonstrate the same behavior, except instead of using strings directly it returns an array (which gets split and rejoined using breaklines). It may be preferable to have the anagrams returned as an array.
jsfiddle
Is this what you need? All anagrams on different lines:
var arr = ['cat', 'bat', 'rat', 'splat'];
/**
 * Builds every distinct rearrangement of the characters of each string.
 *
 * Each result is prefixed with ' <br />' so that, when the array is written
 * into innerHTML, every anagram starts on its own line.
 *
 * @param {string[]} arr - The words to rearrange.
 * @returns {string[]} The prefixed anagrams, without duplicates, in the order
 *   they were generated.
 */
var allAnagrams = function(arr) {
  // Used as a set: each finished anagram becomes a key, so duplicates collapse.
  var anagrams = {};
  arr.forEach(function(str) {
    // `ana` is the prefix built so far, `rest` the characters not yet used.
    var recurse = function(ana, rest) {
      if (rest === '') {
        anagrams[ana] = 1;
        return;
      }
      // Move each remaining character in turn onto the prefix. (The former
      // `if (i > str.length)` check after this loop could never be true,
      // because the loop ends with i === length, so it has been removed.)
      for (var i = 0; i < rest.length; i++) {
        recurse(ana + rest[i], rest.slice(0, i) + rest.slice(i + 1));
      }
    };
    recurse(' <br />', str);
  });
  return Object.keys(anagrams);
}
document.getElementById("anagrams").innerHTML = (allAnagrams(arr));
<div id="anagrams"></div>
I wrote a simple program to analyze a string to find the word with the greatest amount of duplicate letters within it. It essentially takes a given string, breaks it up into an array of separated words, and then breaks up each separate word into alphabetically sorted groups of individual letters (which are then compared as prev and next, 2 at a time, as the containing array is iterated through). Any two adjacent and matching values found adds one tally to the hash-file next to the word in question, and the word with the most tallied pairs of duplicate letters is returned at the end as greatest. No matching pairs found in any word returns -1. This is what it's supposed to do.
Below, I've run into a problem: If I don't use a REGEXP to replace one of my matched characters, then my code gives false positives as it will count triplicates (eg, "EEE"), as two separate pairs, (eg, "EEE" = "EE & EE", instead of being viewed as "EE, E"). However, if I DO use the REGEXP below to prevent triplicate counts, then doing so breaks my loop mid-stride, and skips to the next word. Is there no way to make this way work? If not, would it be better to employ a REGEXP which deletes all chars EXCEPT the duplicate characters in question, and then perhaps I could divide the .length of each word by 2 to get the number of pairs remaining? Any ideas as to how to solve this would greatly help.
var str = "Helloo aplpplpp pie";
//var str = "no repting letrs";
//var str = "ceoderbyte";
// Returns the word of `str` with the most pairs of equal adjacent letters
// (after sorting each word's letters), or -1 when no word has any such pair.
// Logs its intermediate state along the way.
function LetterCountI(str) {
var input = str.split(" ");
console.log(input);
console.log("\n")
// word -> number of adjacent equal pairs found in its sorted letters
var hashObject = {};
// Best word so far and its pair count.
var word = "";
var count = 0;
for(var i = 0; i<input.length; i++) {
var currentItem = input[i];
// Sorting puts equal letters next to each other.
var currentWordIntoChars = currentItem.split("").sort();
console.log(currentWordIntoChars);
var counter = 0;
// Compare each letter with its predecessor.
for(var j=1; j<currentWordIntoChars.length; j++) {
console.log(currentWordIntoChars[j-1] + "=currentChar j-1");
console.log(currentWordIntoChars[j] + "=prev j");
console.log("-");
var final = currentItem;
// NOTE: every adjacent equal pair is counted, so three equal letters in a
// row ("EEE") contribute two pairs.
if(currentWordIntoChars[j-1] == currentWordIntoChars[j]) {
counter++;
hashObject[final] = counter;
//currentWordIntoChars = currentWordIntoChars[j-1].replace(/[a-z]/gi, String.fromCharCode(currentItem.charCodeAt(0)+1));
//HERE REPLACE j-1 with random# or something
//to avoid 3 in a row being counted as 2 pair
//OR use regexp to remove all but pairs, and
//then divide .length/2 to get pairs.
console.log(counter + " === # total char pairs");
}
// hashObject[currentItem] is undefined until a pair is found, which makes
// this comparison false.
if(count<hashObject[currentItem]) {
word = final;
count = hashObject[currentItem];
}
}
}
console.log(hashObject);
console.log("\n");
// Return the best word as soon as hashObject has any non-empty key.
for (var o in hashObject) if (o) return word;
return -1;
}
console.log(LetterCountI(str));
Another way to do it consists of replacing duplicate characters in a sorted word:
var str = "Helloo aplpplpp pie";
/**
 * Finds the word with the most pairs of repeated letters.
 *
 * Each word's letters are sorted so equal letters sit together, then every
 * non-overlapping pair of identical characters is counted.
 *
 * @param {string} str - Space-separated words.
 * @returns {string|number} The first word with the highest pair count, or -1
 *   when no word contains a repeated letter.
 */
function LetterCountI(str) {
  var winner = -1;
  var mostPairs = 0;
  str.split(" ").forEach(function (candidate) {
    var pairs = 0;
    var letters = candidate.split("");
    letters.sort();
    // replace() is used only for its callback, which fires once per pair;
    // the replaced string itself is thrown away.
    letters.join("").replace(/(.)\1/g, function () { pairs += 1; });
    if (pairs > mostPairs) {
      mostPairs = pairs;
      winner = candidate;
    }
  });
  return winner;
}
console.log(LetterCountI(str));
Notes: The replace method is only a way to increment nb using a callback function. You can do the same using the match method and counting results.
if two words have the same number of duplicates, the first word will be returned by default. You can easily change this behaviour with the condition of the if statement.
Whenever you find a match within a word, increment j by 1 to skip comparing the next letter.
var str = "Helloo aplpplpp pie";
//var str = "no repting letrs";
//var str = "ceoderbyte";
/**
 * Returns the word of `str` with the most pairs of repeated letters, or -1
 * when no word has any. Three equal letters count as one pair because a
 * matched letter is skipped. Logs its intermediate state along the way.
 */
function LetterCountI(str)
{
  var tokens = str.split(" ");
  console.log(tokens);
  console.log("\n")

  var pairsByWord = {};   // word -> pairs found in its sorted letters
  var bestWord = "";
  var bestPairs = 0;

  tokens.forEach(function (token)
  {
    // Sorting puts equal letters next to each other.
    var letters = token.split("").sort();
    console.log(letters);

    var pairs = 0;
    for (var j = 1; j < letters.length; j++)
    {
      var before = letters[j - 1];
      var here = letters[j];
      console.log(before + "=currentChar j-1");
      console.log(here + "=prev j");
      console.log("-");

      if (before === here)
      {
        pairs += 1;
        pairsByWord[token] = pairs;
        j++; // skip the matched letter so it cannot start another pair
        console.log(pairs + " === # total char pairs");
      }

      // Undefined until the word has a pair, which makes this test false.
      if (bestPairs < pairsByWord[token])
      {
        bestWord = token;
        bestPairs = pairsByWord[token];
      }
    }
  });

  console.log(pairsByWord);
  console.log("\n");

  // Any non-empty key means at least one word had a pair.
  for (var key in pairsByWord) {
    if (key) {
      return bestWord;
    }
  }
  return -1;
}
console.log(LetterCountI(str));
I have to match 2 strings where at least one word is same, I need to give a success msg.
var str1 = "Hello World";
var str2 = "world is beautiful";
I need to match/compare these 2 strings, in both strings world is matching, So i need to print a success message. How do I go about it.
The following code will output all the matching words in the both strings:
// Split both strings into words on runs of whitespace.
var words1 = str1.split(/\s+/g);
var words2 = str2.split(/\s+/g);
var i;
var j;

// Compare every word of the first string with every word of the second,
// ignoring case, and report each pair that matches.
for (i = 0; i < words1.length; i++) {
  for (j = 0; j < words2.length; j++) {
    if (words1[i].toLowerCase() != words2[j].toLowerCase()) {
      continue;
    }
    console.log('word '+words1[i]+' was found in both strings');
  }
}
You can avoid comparing all the words in one list with all the words in the other by sorting each and eliminating duplicates. Adapting bjornd's answer:
// Collects, in `both`, the lower-cased words that occur in str1 AND str2.
var words1 = str1.split(/\s+/g),
    words2 = str2.split(/\s+/g);

// word -> bitmask: bit 1 = seen in words1, bit 2 = seen in words2
var allwords = {};
var wordid, low, current;

// set 1 for all words in words1
for (wordid = 0; wordid < words1.length; ++wordid) {
  low = words1[wordid].toLowerCase();
  if (low === '') continue; // leading/trailing whitespace yields empty tokens
  allwords[low] = 1;
}

// add 2 for all words in words2
for (wordid = 0; wordid < words2.length; ++wordid) {
  low = words2[wordid].toLowerCase();
  if (low === '') continue;
  // Start from the value recorded for words1 (if any). Starting from 0 every
  // time would overwrite the 1 with 2, so no word could ever reach 3.
  current = Object.prototype.hasOwnProperty.call(allwords, low) ? allwords[low] : 0;
  allwords[low] = current | 2;
}

// now those seen in both lists have value 3, the rest either 1 or 2.
// this is effectively a bitmask where the unit bit indicates words1 membership
// and the 2 bit indicates words2 membership
var both = [];
for (var prop in allwords) {
  if (Object.prototype.hasOwnProperty.call(allwords, prop) && (allwords[prop] === 3)) {
    both.push(prop);
  }
}
This version should be reasonably efficient, because we are using a dictionary/hash structure to store information about each set of words. The whole thing is O(n) in javascript expressions, but inevitably dictionary insertion is not, so expect something like O(n log n) in practise. If you only care that a single word matches, you can quit early in the second for loop; the code as-is will find all matches.
This is broadly equivalent to sorting both lists, reducing each to unique words, and then looking for pairs in both lists. In C++ etc you would do it via two sets, as you could do it without using a dictionary and the comparison would be O(n) after the sorts. In Python because it's easy to read:
words1 = set(item.lower() for item in str1.split())
words2 = set(item.lower() for item in str2.split())
common = words1 & words2
The sort here (as with any set) happens on insertion into the set, O(n log n) on word count n, and the intersection (&) is then an efficient O(m) on the set length m.
I just tried this on WriteCodeOnline and it works there:
var s1 = "hello world, this is me";
var s2 = "I am tired of this world and I want to get off";
// Join the two strings with ';' so one regex can look at both: capture a word
// before the separator, then require the same word (back-reference \1,
// case-insensitive) somewhere after it.
var s1s2 = s1 + ";" + s2;
var captures = /\b(\w+)\b.*;.*\b\1\b/i.exec(s1s2);
// exec() returns null when nothing matches, so test the result itself before
// indexing into it.
if (captures !== null)
{
  document.write(captures[1] + " occurs in both strings");
}
else
{
  document.write("no match in both strings");
}
Just adapting @Phil H's code with a real bitmask:
var strings = ["Hello World", "world is beautiful"]; // up to 32 word lists
var occurrences = {};
var result = [];

// Pass 1: give every word a bitmask in which bit i is set when the word
// occurs in strings[i] (compared in lower case).
for (var i = 0; i < strings.length; i++) {
  var bit = 1 << i;
  var words = strings[i].toLowerCase().split(/\s+/);
  for (var j = 0, l = words.length; j < l; j++) {
    var word = words[j];
    occurrences[word] = (word in occurrences) ? (occurrences[word] | bit) : bit;
  }
}

// Pass 2: keep the words whose mask contains every bit of `filter`,
// i.e. the words present in both strings[0] and strings[1].
var filter = 3; // 1<<0 | 1<<1
for (var word in occurrences) {
  if ((occurrences[word] & filter) !== filter) {
    continue;
  }
  result.push(word);
}
OK, the simple way:
/**
 * Tells whether at least one word of `a` also occurs, as a whole word and
 * ignoring case, in `b`.
 *
 * @param {string} a - Text whose words are looked for.
 * @param {string} b - Text that is searched.
 * @returns {boolean} true when the two texts share a word.
 */
function isMatching(a, b)
{
  // match() returns null when `a` contains no word characters at all.
  var words = a.match(/\w+/g);
  if (words === null)
  {
    return false;
  }
  // \w+ tokens contain no regex metacharacters, so they can be joined
  // into an alternation without escaping.
  return new RegExp("\\b(" + words.join('|') + ")\\b", "gi").test(b);
}
isMatching("in", "pin"); // false
isMatching("Everything is beautiful, in its own way", "Every little thing she does is magic"); // true
isMatching("Hello World", "world is beautiful"); // true
...understand?
I basically converted "Hello, World!" to the regular expression /\b(Hello|World)\b/gi
Something like this would also do:
/**
 * Tells whether the two strings have at least one word in common,
 * ignoring case.
 *
 * Whole words are compared, so "in" does not match inside "pin".
 *
 * @param {string} str1 - First text.
 * @param {string} str2 - Second text.
 * @returns {boolean} true when a word of str1 also appears in str2.
 */
var isMatching = function(str1, str2) {
  // match() returns null when a string has no word characters.
  var words1 = str1.toLowerCase().match(/\w+/g) || [];
  var words2 = str2.toLowerCase().match(/\w+/g) || [];
  for (var i = 0; i < words1.length; i++) {
    if (words2.indexOf(words1[i]) > -1) return true;
  }
  return false;
};
var str1 = "Hello World";
var str2 = "world is beautiful";
isMatching(str1, str2); // returns true
isMatching(str1, 'lorem ipsum'); // returns false
I've got an array of words I need to sort by frequency. Before I do that, I need to strip out words like 'the,' 'it,' etc (anything less than three letters, really), as well as all numerals and any words beginning with # (the array of words is pulled from Twitter, although the example below is just a random paragraph from Wikipedia).
I can remove one word, but have been going crazy trying to remove more than one, or a range. Any suggestions? Thank you!
http://jsfiddle.net/9NzAC/6/
HTML:
<div id="text" style="background-color:Teal;position:absolute;left:100px;top:10px;height:500px;width:500px;">
Phrenology is a pseudoscience primarily focused on measurements of the human skull, based on the concept that the brain is the organ of the mind, and that certain brain areas have localized, specific functions or modules. The distinguishing feature of phrenology is the idea that the sizes of brain areas were meaningful and could be inferred by examining the skull of an individual.
</div>
JS:
//this is the function to remove words
<script type="text/javascript">
/**
 * Removes, in place, every occurrence of the given values from an array.
 *
 * Usage: removeA(arr, value1, value2, ...). Values are located with
 * Array.prototype.indexOf.
 *
 * @param {Array} arr - The array to modify.
 * @returns {Array} The same array instance, after removal.
 */
function removeA(arr){
  // Everything after the first argument is a value to remove.
  var targets = Array.prototype.slice.call(arguments, 1);
  // Work from the last value towards the first; stop early once the array is empty.
  while (targets.length > 0 && arr.length) {
    var target = targets.pop();
    var at = arr.indexOf(target);
    while (at != -1) {
      arr.splice(at, 1);
      at = arr.indexOf(target);
    }
  }
  return arr;
}
</script>
//and this does the sorting & counting
<script type="text/javascript">
/**
 * Counts the words of an array and sorts them by frequency.
 *
 * @param {string[]} words - The words to count.
 * @returns {Array<[string, number]>} [word, count] pairs, most frequent first.
 */
var getMostFrequentWords = function(words) {
  // A prototype-less object, so words such as "constructor" or "toString"
  // are not mistaken for existing entries inherited from Object.prototype.
  var freq = Object.create(null), freqArr = [], i;

  // Map each word to its frequency in "freq".
  for (i = 0; i < words.length; i++) {
    freq[words[i]] = (freq[words[i]] || 0) + 1;
  }

  // Sort from most to least frequent.
  for (i in freq) freqArr.push([i, freq[i]]);
  return freqArr.sort(function(a, b) { return b[1] - a[1]; });
};
var words = $('#text').get(0).innerText.split(/\s+/);
//Remove articles & words we don't care about.
var badWords = "the";
removeA(words,badWords);
var mostUsed = getMostFrequentWords(words);
alert(words);
</script>
Instead of removing from the original array, just push to a new one, it's simpler, and it'll make your code shorter and more readable.
var words = ['the', 'it', '12', '#twit', 'aloha', 'hello', 'bye']
var filteredWords = []

// Copy the wanted words into a fresh array: a word is kept when it does not
// start with '#' or digits and has more than three characters.
words.forEach(function (candidate) {
  var isTagOrNumber = /^(#|\d+)/.test(candidate)
  if (isTagOrNumber || candidate.length <= 3) {
    return
  }
  filteredWords.push(candidate)
})

console.log(filteredWords) // ['aloha', 'hello']
Demo: http://jsfiddle.net/VcfvU/
I recommend you to do array[i] = null (or "") and then just clean up your arrays empty nodes. You can easily achieve that using Array#filter
Test: http://jsfiddle.net/6LPep/
Code:
// Comma-delimited list; wrapping a word in commas lets indexOf match whole
// entries only.
var FORGETABLE_WORDS = ',the,of,an,and,that,which,is,was,';

var words = text.innerText.split(" ");

// Blank out every unwanted word. Loop by index: using the word itself as the
// loop condition would stop at the first empty token (e.g. a double space).
for (var i = 0; i < words.length; i++) {
  var word = words[i];
  if (FORGETABLE_WORDS.indexOf(',' + word + ',') > -1 || word.length < 3) {
    words[i] = "";
  }
}

// falsy will get deleted
// filter() returns a NEW array, so the result has to be assigned back.
words = words.filter(function(e){return e});

// as example
output.innerHTML = words.join(" ");

// just continue doing your stuff with "words" array.
// ...
I think it's cleaner than the way you're doing it currently. If you need anything else I will update this answer.
console.log(
['🍇','🍈','🍌','🍉','🍊','🍋'].filter(a => !['🍌','🍊'].includes(a))
)
What's the best and most efficient way to count keywords in JavaScript? Basically, I'd like to take a string and get the top N words or phrases that occur in the string, mainly for the use of suggesting tags. I'm looking more for conceptual hints or links to real-life examples than actual code, but I certainly wouldn't mind if you'd like to share code as well. If there are particular functions that would help, I'd also appreciate that.
Right now I think I'm at using the split() function to separate the string by spaces and then cleaning punctuation out with a regular expression. I'd also want it to be case-insensitive.
Cut, paste + execute demo:
var text = "Text to be examined to determine which n words are used the most";

// Find 'em!
// A word is a run of word characters, optionally followed by an apostrophe
// and one or two more word characters ("don't", "we've").
var wordRegExp = /\w+(?:'\w{1,2})?/g;
// Prototype-less dictionary: words like "constructor" cannot collide with
// members inherited from Object.prototype.
var words = Object.create(null);
var matches;
while ((matches = wordRegExp.exec(text)) !== null)
{
    var word = matches[0].toLowerCase();
    words[word] = (words[word] || 0) + 1;
}

// Sort 'em!
var wordList = [];
for (var word in words)
{
    wordList.push([word, words[word]]);
}
wordList.sort(function(a, b) { return b[1] - a[1]; });

// Come back any time, straaanger!
var n = 10;
// Never read past the end of the list when the text has fewer than n words.
var shown = Math.min(n, wordList.length);
var message = ["The top " + shown + " words are:"];
for (var i = 0; i < shown; i++)
{
    message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
                 (wordList[i][1] == 1 ? "" : "s"));
}
alert(message.join("\n"));
Reusable function:
/**
 * Returns the n most frequent words of a text, most frequent first.
 *
 * Words are matched case-insensitively (results are lower-cased). A word is
 * a run of word characters, optionally followed by an apostrophe and one or
 * two more word characters ("don't", "we've").
 *
 * @param {string} text - The text to analyse.
 * @param {number} n - Maximum number of words to return.
 * @returns {string[]} At most n words; fewer when the text has fewer
 *   distinct words.
 */
function getTopNWords(text, n)
{
    var wordRegExp = /\w+(?:'\w{1,2})?/g;
    // Prototype-less dictionary: words like "constructor" cannot collide with
    // members inherited from Object.prototype.
    var words = Object.create(null);
    var matches;
    while ((matches = wordRegExp.exec(text)) !== null)
    {
        var word = matches[0].toLowerCase();
        words[word] = (words[word] || 0) + 1;
    }

    var wordList = [];
    for (var key in words)
    {
        wordList.push([key, words[key]]);
    }
    wordList.sort(function(a, b) { return b[1] - a[1]; });

    // Clamp to the number of distinct words so we never index past the end.
    var limit = Math.min(n, wordList.length);
    var topWords = [];
    for (var i = 0; i < limit; i++)
    {
        topWords.push(wordList[i][0]);
    }
    return topWords;
}
Once you have that array of words cleaned up, and let's say you call it wordArray:
// word -> number of occurrences in wordArray
var keywordRegistry = {};

for(var i = 0; i < wordArray.length; i++) {
  // Borrow hasOwnProperty from Object.prototype: calling it on the registry
  // itself breaks as soon as the word "hasOwnProperty" has been stored.
  if(!Object.prototype.hasOwnProperty.call(keywordRegistry, wordArray[i])) {
    keywordRegistry[wordArray[i]] = 0;
  }
  keywordRegistry[wordArray[i]] = keywordRegistry[wordArray[i]] + 1;
}

// now keywordRegistry will have, as properties, all of the
// words in your word array with their respective counts

// this will alert (choose something better than alert) all words and their counts
for(var keyword in keywordRegistry) {
  alert("The keyword '" + keyword + "' occurred " + keywordRegistry[keyword] + " times");
}
That should give you the basics of doing this part of the work.
Try to split your string into words and count the resulting words, then sort on the counts.
This builds upon a previous answer by insin by only having one loop:
/**
 * Returns the n most frequent words of a text together with their counts.
 *
 * The text is lower-cased and split on runs of non-word characters. Words
 * with equal counts keep the order in which they first appeared.
 *
 * @param {string} text - The text to analyse.
 * @param {number} n - Maximum number of entries to return.
 * @returns {Array<[string, number]>} [word, count] pairs, most frequent first.
 */
function top_words(text, n) {
  // Split text on non word characters
  var words = text.toLowerCase().split(/\W+/)
  // word -> index of its entry in word_counts. A prototype-less object is
  // used instead of an Array, so words such as "length" or "push" are not
  // mistaken for existing entries.
  var positions = Object.create(null)
  var word_counts = []
  for (var i=0; i<words.length; i++) {
    var word = words[i]
    // Splitting leaves empty strings at the edges of the text.
    if (!word) {
      continue
    }

    if (typeof positions[word] == 'undefined') {
      positions[word] = word_counts.length
      word_counts.push([word, 1])
    } else {
      word_counts[positions[word]][1]++
    }
  }
  // Put most frequent words at the beginning.
  word_counts.sort(function (a, b) {return b[1] - a[1]})
  // Return the first n items
  return word_counts.slice(0, n)
}
// Let's see if it works.
var text = "Words in here are repeated. Are repeated, repeated!"
alert(top_words(text, 3))
The result of the example is: [['repeated',3], ['are',2], ['words', 1]]
I would do exactly what you have mentioned above to isolate each word. I would then probably add each word as the index of an array with the number of occurrences as the value.
For example:
var a = new Array;
a[word] = a[word]?a[word]+1:1;
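Now you know how many occurrences of each word existed (a[word]). Note that a.length stays 0 here, because the words are stored as string keys rather than array indices; use Object.keys(a).length to get the number of unique words (a plain object {} is the more natural container for this).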