I am doing a modified version of collecting word co-occurrences, so I wrote my own javascript, and I am tracking the occurrences in three objects. However, once the objects get large (~8 million, 3 million, and 172000) a function that took 5 seconds per 100000 sentences now takes minutes to do one sentence with 30 words (30 tokens). I am nowhere near my RAM cap (I have 12 more GBs of RAM it could be using, and the program is only using 2.2GB). Using Node.js v17.3.1.
Why does my function take so long when the objects get bigger (even though the sentences remain the same length)? Should I be using a different object besides Javascript's default object, or is there a way improve the speed of access and setting these objects when they are so big?
Code:
let posCounts = {};
let negCounts = {};
// the number of times each word occurs
let wordCounts = {};
let tokens = // some function that gets tokens;
for (let k = 0; k < tokens.length; k++) {
// count word occurences
if (tokens[k] in wordCounts) {
wordCounts[tokens[k]] += 1;
} else {
wordCounts[tokens[k]] = 1;
}
for(let tok = k + 1; tok < tokens.length; tok++) {
if (tok == k) {
// avoid word to self cooccurrence
// should no longer be possible
continue;
} else {
// check which form of the cooccurence exists already in either count
actual_tok = (tokens[k] + "-" + tokens[tok]);
if(actual_tok in posCounts || actual_tok in negCounts) {
// no-op
} else {
actual_tok = (tokens[tok] + "-" + tokens[k]);
}
// condition set before this block of code
if(condition) {
if (actual_tok in posCounts) {
posCounts[actual_tok] += 1;
} else {
posCounts[actual_tok] = 1;
}
} else {
if (actual_tok in negCounts) {
negCounts[actual_tok] += 1;
} else {
negCounts[actual_tok] = 1;
}
}
}
}
}
Update: I've tried increasing the heap size via node train_matrices.js --max-old-space-size=12288 and node train_matrices.js --max_old_space_size=12288 (underline instead of dash), and that didn't work either.
Probably not the main issue in your code, but you can reduce the number of lookups by changing this structure from this:
if (tokens[k] in wordCounts) {
wordCounts[tokens[k]] += 1;
} else {
wordCounts[tokens[k]] = 1;
}
to this:
let token = tokens[k];
let cnt = wordCounts[token] || 0;
wordCounts[token] = cnt + 1;
And, as I said in a comment, I've read that a Map object with .get() and .set() is better suited when there are lots of dynamically created keys whereas plain objects are better suited when you have lots of objects with all the same keys (as the JS compiler can sometimes make a C-like struct for it), but this can't be done when you're regularly adding new keys.
The answer was to both use the increase memory flag node <YOUR_FILE_NAME>.js --max-old-space-size=12288 and change to using a Map instead of an object - thanks to #jfriend00 and #Norman Breau for the suggestions. That said, maps have a max capacity of 2^24 items or 1 GB, so I ended up using a modified version of the BigMap from this stackoverflow (modified to limit the total number of items still - ended up running completely out of RAM).
Modified code (you can replace BigMap with Map if you want):
let posCounts = new BigMap();
let negCounts = new BigMap();
let wordCounts = new BigMap();
let actual_tok;
tokens = // some code
// mark every cooccurrence
for (let k = 0; k < tokens.length; k++) {
// count word occurences
if (wordCounts.has(tokens[k])) {
wordCounts.set(tokens[k], wordCounts.get(tokens[k]) + 1);
} else {
wordCounts.set(tokens[k], 1);
}
for(let tok = k + 1; tok < tokens.length; tok++) {
if (tok == k) {
// avoid word to self cooccurrence
// should no longer be possible
continue;
} else {
// check which form of the cooccurence exists already in either count
actual_tok = (tokens[k] + "-" + tokens[tok]);
if(posCounts.has(actual_tok) || negCounts.has(actual_tok)) {
// no-op
} else {
actual_tok = (tokens[tok] + "-" + tokens[k]);
}
if(condition) {
if (posCounts.has(actual_tok)) {
posCounts.set(actual_tok, posCounts.get(actual_tok) + 1);
} else {
posCounts.set(actual_tok, 1);
}
} else {
if (negCounts.has(actual_tok)) {
negCounts.set(actual_tok, negCounts.get(actual_tok) + 1);
} else {
negCounts.set(actual_tok, 1);
}
}
}
}
}
}
Related
Background
I'm new to JavaScript and am solving various formulations of the Josephus Problem to better understand the syntax. Using a circularLinkedList implementation*, I've solved the classic formulation: Wikipedia||Numberphile. I've also solved the problem for any fixed number of fighters and a fixed number of skips between eliminations (e.g., if skipping two fighters between eliminations, 1 eliminates 4, 5 eliminates 8, etc). I am now trying to solve the problem given any function that indicates the number of skips at a given moment.
Problem
I can't access the return value of my skip function. I understand from 1, 2, 3 that my issue involves asynchronicity, but am having trouble isolating takeaways from the long responses involving AJAX and jQuery, which I'm unfamiliar with. Could I get an ELI5 solution? I apologize for my lack of understanding.
Code
function winnerStepFunc(num, func) {
let cll = new circularLinkedList(); //Initializing list with participants
for (let a = 0; a < num; a++) {
cll.append(a);
}
function next(funcSteps) { //Generating string indicating #steps from function's output
let toEvaluate = "start";
for (let i = 0; i < funcSteps; i++) {
toEvaluate += ".next"
}
return toEvaluate;
}
let start = cll.getHead(); //Selecting first eliminator
while (cll.size() > 1) {
let toCheck = func(); // PROBLEM LINE
console.log("toCheck = " + toCheck); //toCheck = undefined
let str = next(toCheck);
while (eval(str) !== start && cll.size() > 1) { //
let locCurrent = cll.indexOf(start.element);
start = eval(str).next;
cll.removeAt(((locCurrent + toCheck)% cll.size()));
}
cll.delete(eval(str).next.element);
start = start.next;
}
console.log(start.element + 1);
}
function callFunction(name, args) { // builds function string to be evaluated
let result = name + "(";
for (let i = 0; i < args.length -1; i++) {
result += args[i] + ", ";
}
result += args[args.length-1] + ")";
return result;
}
function callFunction(name) {
let result = `${name}()`;
return result;
}
function addOne() { //<--The first basic example I'm trying:
return ++globalTimes; //Make the step increase by one for each elimination
}
var globalTimes = 0;
winnerStepFunc(12, function(){eval(callFunction("addOne"))});
*CLL Implementation
You don't return in your function. I would remove all the eval stuff and just call the function directly.
winnerStepFunc(12, addOne);
So I'm working on a simple JavaScript web-based game. The goal is to guess a X digit random number. This means the random number can be 4, 5 digits up to whatever you want. You can actually play the game at www.juegodescifralo.com , (it's in Spanish, sorry about that).
The user inputs a number that is stored as an array. The random number is also generated as an array. Individual numbers in both arrays can be repeated.
There are three types of possible "values/numbers": the "good" ones are the numbers you chose that are in the same position as the ones in the random array. So for example:
Random array is: 1457
User input is: 6851
Number 5 is a "good" number, since it's in the same position. Then there are the second type of "values", which are "regular". This means they are inside the random number but not in the same position. In this example, number 1 would be a "regular" value. And the third type is the "bad" ones, which are not even inside the random array.
The function I've developed is as follows:
function checkNumbers(randomArray, myArray, good, regular, bad) {
for (var x = 0; x < randomArray.length; x++) {
var posRepetido = randomArray.indexOf(myArray[x]); //Is current number inside random array?
if (posRepetido == -1) { //It's not inside
console.log("number " + myArray[x] + "is not inside");
bad++;
} else { //It's inside
regular++;
if (myArray[x] == randomArray[x]) { //If it's the same number...
console.log("number " + myArray[x] + "is in the correct position");
good++;
regular--;
} else { //If it's not the same number
if (randomArray[posRepetido] != myArray[posRepetido]) {
console.log("number " + myArray[x] + "is inside but not in the same position");
} else {
console.log("number " + myArray[x] + "is not inside");
}
}
}
}
var obj = { //Return object for accessing later, to show feedback to the user.
good: good,
regular: regular,
bad: bad
};
return obj;
}
The code is a bit buggy. When there are duplicates in the random array, and one of them is marked as good, then the other one (even if it exists in user input) will be set as bad, not as regular as it should.
The thing becomes even more complicated since you should be able to play against any amount of digits. So I should be able to guess a 20 digit number without "problems".
You can play by yourself at www.juegodescifralo.com
How can I go about this? Any ideas how can I access array data more easily? Thank you very much!
Rather than indexOf comparisons against -1 and checks of myArray[x], it would probably be a lot easier to use includes and array methods such as forEach for better abstraction. All you really need is an if, an else if, and an else. For example:
function checkNumbers(randomArray, userArray, good=0, regular=0, bad=0) {
userArray.forEach((guess, i) => {
if (guess === randomArray[i]) good++;
else if (randomArray.includes(guess)) regular++;
else bad++;
});
return { good, regular, bad };
}
// 4 good
console.log(checkNumbers(
'1234'.split(''),
'1234'.split(''),
));
// 4 good, 2 bad
console.log(checkNumbers(
'1234'.split(''),
'123456'.split(''),
));
// 4 good, 2 regular
console.log(checkNumbers(
'1234'.split(''),
'123412'.split(''),
));
// all regular:
console.log(checkNumbers(
'123456789123456789'.split(''),
'912345678912345678'.split(''),
));
i think it is easier to loop thru it twice like this
function checkNumbers(randomArray, guessArray) {
var clone = randomArray.slice(0);
var good = 0;
var regular = 0;
var bad = 0;
var visited = [];
guessArray.forEach(function(guess, index) {
if (guess === clone[index]) { // the guess is in right position
good++;
clone[index] = "x"; // strike it out so it cannot be used later
visited.push(index);
} else if (clone.indexOf(guess) === -1) {
bad++;
visited.push(index);
}
});
guessArray.forEach(function(guess, index) {
if (!visited.includes(index)) {
var match = clone.indexOf(guess);
if (match !== -1) {
regular++;
clone[match] = "x"; // strike it out so it cannot be used later
}
}
});
return {
good: good,
bad: bad,
regular: regular
}
}
first loop is to check the good and bad. and strikeout the value it is good so it cannot be used again.
second loop to check for the regular and strikeout the value it is used so it cannot be used again.
This should work.
function guessNumber (numUser, numRandom) {
if (typeof numUser == 'number') {
numUser = numUser.toString().split('');
}
if (typeof numRandom == 'number') {
numRandom = numRandom.toString().split('');
}
if (typeof numRandom != 'object' || typeof numUser != 'object') {
return false;
}
if (numRandom == numUser) {
return true;
}
var numRegular = {},
numBuenos = {},
numMalos = {},
numRepeat = {};
for(var i = 0; i < numRandom.length; i++) {
if (!numRepeat[numRandom[i]]) {
numRepeat[numRandom[i]] = 0;
}
numRegular[numRandom[i]] = 0;
numRepeat[numRandom[i]]++;
}
for (var i = 0; i < numUser.length; i++) {
if (numUser[i] == numRandom[i]) {
numBuenos[numUser[i]] = numUser[i];
}else if ($.inArray(numUser[i], numRandom)) {
if (!numRegular[numUser[i]]) {
numRegular[numUser[i]] = 0;
}
if (numRegular[numUser[i]] < numRepeat[numUser[i]]) {
numRegular[numUser[i]]++;
} else {
numMalos[numUser[i]] = numUser[i];
}
} else {
numMalos[numUser[i]] = numUser[i];
}
}
return {
regular: Object.values(numRegular).reduce((a, b) => a + b),
buenos: Object.keys(numBuenos).length,
malos: Object.keys(numMalos).length
};
}
console.log(guessNumber(8365, 8512));
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
I have found a couple other similar threads on stackoverflow (Pass Array Thread 2) and (Pass Array Thread 1) as well as from a few other sites but I either did not understand them, they did not fully answer my question, or I did not know how to implement it into my code.
I have the following code which should create a map for a game based on some arrays:
function createMap(level) {
var map = document.getElementById('map');
mapWidth = parseInt(level[0]);
mapHeight = parseInt(level[1]);
map.innerHTML = '';
rowNumber = 1;
tileID = 1;
var consoleHelp = level[7];
console.log(k+' and value is '+consoleHelp);
k = 1;
for (k = 1; k <= mapHeight; k++) { // repeat below loop until specified height is reached
for (k = 1; k <= mapWidth; k++) { // create a row of tiles based on the specified width of the array
console.log('Row '+k+' created')
if (rowNumber == 1) {
k++;
}
else {
k--;
}
if (level[k] == 'w') {
map.innerHTML += '<span id="'+rowNumber+'-'+tileID+'">desert<image class="tiles" src="desert.png"></span>';
}
else if (level[k] == 'g') {
map.innerHTML += '<span id="'+rowNumber+'-'+tileID+'"><image class="tiles" src="grass.png"></span>';
}
else {
console.log('crap dis did not work');
var consoleHelp = level[k];
console.log(k+' and value is '+consoleHelp);
}
if (rowNumber == 1) {
k--;
}
else {
k++;
}
tileID++
}
rowNumber++
level = level + '_1';
map.innerHTML = "<br>";
}
spawnTile();
}
and the variable arrays (incomplete but you get the idea):
var map_beginning_1 = ['20','10','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w','w'];
var map_beginning_1_1 = ['w','g','g','g','g','g','g','g','g','g','g','g','g','g','g','g','g','g','g','w'];
My problem is that I call in
createMap('map_beginning_1')
and nothing happens. There's no errors but nothing else happens. I did a test to see what it was getting and the value of "level[7]" is "i" and "level1" is "a", which is the location of the characters in "map_beginning_1". Can someone please explain to me how to or if it's even possible to variably pass an array through a Javascript function argument and how to do it? And if it isn't possible, can you suggest any alternatives for what I'm trying to do? Javascript is preferable but I can use Jquery if I must.
You have passed a string into the function and not a variable please try the following, removing the single quotes.
createMap(map_beginning_1);
Try createMap(map_beginning_1). Lose the ' (quotes), as you are trying to pass an array but are actually passing a string.
I am caching longitude and latitude (plus a bit more info) for possibly 1000s of locations, currently using a JavaScript hash, a {}. e.g.
var cache = {};
cache['Boston, MA'] = { id: someid, latlon: [lat, long] };
cache['Someotherplace, TX'] = { id: someotherid, latlon: [itslat, itslong]};
Everytime a new location comes up I do a geocode and add the results to the cache. I don't think Boston's latitude will change anytime soon...
Will lookups be reasonably fast? I don't need blazing fast, I'm not running Amazon, but as this data grows to, say 2000 locations, will it bog down? If so, what might be a good alternative?
Much of the performance of the entire javascript engine is based on property lookups on objects so I'm quite sure that significant effort has been paid to the performance of that in the basic JS engine.
But, as with all things related to performance you should measure yourself. It would only take a few minutes to build a test harness in jsperf and either compare it to an alternative or just see if regular JS lookup appears fast enough for you.
Here's a [little test harness][1] that shows more than 20,000 key lookups per ms on my computer. I would think that's fast enough for you.
function log(args) {
var str = "";
for (var i = 0; i < arguments.length; i++) {
if (typeof arguments[i] === "object") {
str += JSON.stringify(arguments[i]);
} else {
str += arguments[i];
}
}
var div = document.createElement("div");
div.innerHTML = str;
document.body.appendChild(div);
}
function addCommas(str) {
var amount = str + "";
var parts = amount.split(".");
amount = parts[0].split("").reverse();
var output = "";
for (var i = 0; i < amount.length; i++) {
output = amount[i] + output;
if ((i+1) % 3 == 0 && (amount.length-1) !== i) {
output = ',' + output;
}
}
if (parts.length > 1) {
output += "." + parts[1];
}
return output;
}
function now() {
return new Date().getTime();
}
// now fill the cache with a random set of keys
// the keys will var in length between minKeyLen and maxKeyLen
function createRandomKeys(num, minKeyLen, maxKeyLen, obj) {
function rand(min, max) {
return Math.floor(Math.random() * (max - min)) + min;
}
var chars = "abcdefghijlkmnopqrstuvwzyz";
var len, key, numKeys = 0;
while (numKeys < num) {
// generate random key length
len = rand(minKeyLen, maxKeyLen);
key = "";
// now select len random chars and combine into a string
for (var j = 0; j < len; j++) {
key += chars.charAt(rand(0, chars.length))
}
// put this key into our object, only count it if it's not already there
if (!Object.prototype.hasOwnProperty.call(obj, key)) {
++numKeys;
obj[key] = true;
}
}
}
var cache = {};
// put all the keys into our object
createRandomKeys(200000, 3, 15, cache);
// now get the list of keys, just so we know what to fetch in our test
var keys = Object.keys(cache);
// now time getting every key
var total = 0;
var start = now();
for (var i = 0; i < keys.length; i++) {
if (cache[keys[i]]) {
++total;
}
}
var end = now();
var elapsed = end - start;
log("Elapsed time = " + elapsed + "ms for " + addCommas(keys.length) + " key lookups - found " + addCommas(total));
log(elapsed/keys.length + "ms per lookup");
log(addCommas((keys.length / elapsed).toFixed(2)) + " key lookups per ms");
// show some sample keys
log("<hr>Sample keys (first 100 keys):<br>");
log(keys.slice(0, 100).join(", "));
I'm testing it with 1,000,000 numbers, and it's just kind of hanging. I thought it would breeze through 1,000,000 easily. Is it my implementation? I have a feeling it's because of the slice(), anyone have an idea?
Edit:
Just got this message:
FATAL ERROR: CALL_AND_RETRY_2 Allocation failed - process out of memory
TopDownSplitMerge(numbersArray);
function TopDownSplitMerge(arrayOfNumbers) {
var length = arrayOfNumbers.length
var middleIndex = parseInt(length/2);
if(length <= 1) {
return arrayOfNumbers;
}
// Split left side
var left = TopDownSplitMerge(arrayOfNumbers.slice(0, middleIndex));
// Split right side
var right = TopDownSplitMerge(arrayOfNumbers.slice(middleIndex, length));
// Merge every back together
return TopDownMerge(left, right);
}
function TopDownMerge(left, right) {
var results = []
while(left.length || right.length) {
console.log("looping...");
// Check if both sides are NOT empty, if so, then just finish shifting the non-empty side
if(left.length && right.length) {
if(left[0] <= right[0]) {
results.push(left.shift())
} else {
results.push(right.shift())
}
} else if(left.length) {
results.push(left.shift())
} else {
results.push(right.shift())
}
}
console.log("Merging....", results.length);
return results;
}
There are two things I had to change
var right = TopDownSplitMerge(arrayOfNumbers.slice(middleIndex, length));
....
....
....
function TopDownMerge(left, right) {
var results = [], leftLen = left.length, rightLen = right.length;
for (var i = 0, j = 0; i < leftLen || j < rightLen;) {
if (i < leftLen && j < rightLen) {
if(left[i] <= right[j]) {
results.push(left[i]);
i += 1;
} else {
results.push(right[j]);
j += 1;
}
} else if (i < leftLen) {
results.push(left[i]);
i += 1;
} else {
results.push(right[j]);
j += 1;
}
}
return results;
}
Edit: Now I changed it to accept indices instead of sliced arrays and it boosts the performance more.
function TopDownSplitMerge(arrayOfNumbers, start, end) {
var length = end - start;
var middleIndex = start + parseInt(length / 2);
if (length <= 1) {
return [arrayOfNumbers[start]];
}
// Split left side
var left = TopDownSplitMerge(arrayOfNumbers, start, middleIndex);
// Split right side
var right = TopDownSplitMerge(arrayOfNumbers, middleIndex, length);
// Merge every back together
return TopDownMerge(left, right);
}
TopDownSplitMerge(numbersArray, 0, numbersArray.length);
Jsperf: http://jsperf.com/so-q-19341534
jsperf for my solution with 10,000,000 numbers: http://jsperf.com/solution-to-so-q-19341534
I think you're right. slice() copies the array, so you're effectively copying the array bajillions of times. And then your shifting the front off the array, which requires copying the array each time--a bajillion more times. A better approach could be to pass in the index range for the 'splitting'.