Reading very large data files using nodejs? Interview question

Reading very large data files using nodejs? Interview question - javascript

I recently wrote a solution for an interview question which was rejected. Please analyze my solution and advice? is it not efficient to handle large files?
Problem
Given a data file containing scored records, in your favorite programming language, write a program to output the N highest record IDs ordered by descending
score. The output should be well-formed JSON. Consider giving thought to the
resource efficiency of your solution.
DATA File
The input data file is a series of key-value pairs (one per line) with the format
:
3830591998918656: {"id":"2208ef95-355c-53a6-96bc-206a4831f2fe","data":"Tu"}
548113328635904: {"id":"d5887987-bf5d-5813-b524-722ffff11882","data":"Vubiteh hone na dabupkof tetnut."}
4322085113430016: {"id":"8f5330b9-0279-5a67-aee3-d6acf221a53a","data":"Losu nihpe upaveitpse teblujnis."}
6348702421614592: {"id":"1fef0dbc-c75b-5959-835f-80e5f15b6da1","data":"Riliv kaliku laiza zen daze ."}
can be upto 100k lines or more
Success Conditions
Upon successful running, your solution should exit with exit code 0. If the input
data file is not valid, your solution should exit with code 2, while if the input
file is not found, your solution should exit with code 1. Empty lines in the input
file should be ignored rather than treated as invalid input.
My solution
highest.js the intry point
{
let argv = require("yargs").argv;
const yargs = require("yargs");
const handler = require("./handler");
const fileName = argv._[0];
const numberOfEntries = argv._[1];
if (!fileName || !numberOfEntries) {
console.log("Command Incorrect : --use node highest <filename> <count>");
} else {
handler.getHighestScore(fileName, numberOfEntries);
}
}
Handler.js
{
const path = require("path");
const fs = require("fs");
const es = require("event-stream");
let ln = 0;
const scoreArray = [];
exports.getHighestScore = (fileName, numberOfEntries) => {
const filePath = path.join(path.dirname(require.main.filename), fileName);
fs.stat(filePath, function (err, stat) {
if (stat && stat.isFile()) {
let s = fs
.createReadStream(filePath)
.pipe(es.split())
.pipe(
es
.mapSync(function (line) {
s.pause();
if (ln < parseInt(numberOfEntries)) {
if (line.trim().length > 0) {
let score = line.slice(0, line.indexOf(":"));
let secondPart = line.slice(line.indexOf(":") + 1);
let jsonObject = JSON.parse(secondPart);
if (
jsonObject.hasOwnProperty("id") &&
jsonObject.id.trim().length > 0
) {
let outputObject = { score: score, id: jsonObject.id };
scoreArray.push(outputObject);
} else {
process.stdout.write(
"***********File Invalid - ID Mssing************* code(2)"
);
process.exit(2);
}
}
}
if (ln == parseInt(numberOfEntries)) {
s.end();
} else {
ln += 1;
}
s.resume();
})
.on("error", function (err) {
process.stdout.write(
"***********Error while reading file************* code(2)"
);
process.exit(2);
})
.on("end", function () {
let arr = scoreArray.sort((a, b) => (b.score > a.score ? 1 : -1));
process.stdout.write(
"TOTAL LINES READ = " +
ln +
" TOTAL OUTPUT = " +
arr.length +
"\n"
);
process.stdout.write(JSON.stringify(arr));
process.stdout.write("\n");
process.exit(0);
})
);
} else {
process.stdout.write("***********FILE NOT FOUND************* code(1)");
process.exit(1);
}
});
};
}
Any advice and criticism about solution is appreciated

You are reading the entire input file into memory and storing everything. A space efficient way to do things is to stream the input one line at a time, parse it, check to see if it contains an N-highest score. If so, add it to the N-highest data structure. If not, skip it. Only retain in memory the N-highest data as you go through the whole file. This seems to be the main point that your code misses.
For efficiency reasons, this does an insertion sort into the highest array so it isn't constantly resorting the entire array every time you add a value.
Here's an implementation of that in nodejs:
const readline = require('node:readline');
const fs = require('node:fs');
// sample input data per line
// 3830591998918656: { "id": "2208ef95-355c-53a6-96bc-206a4831f2fe", "data": "Tu" }
// 548113328635904: { "id": "d5887987-bf5d-5813-b524-722ffff11882", "data": "Vubiteh hone na dabupkof tetnut." }
// 4322085113430016: { "id": "8f5330b9-0279-5a67-aee3-d6acf221a53a", "data": "Losu nihpe upaveitpse teblujnis." }
// 6348702421614592: { "id": "1fef0dbc-c75b-5959-835f-80e5f15b6da1", "data": "Riliv kaliku laiza zen daze ." }
const scoreRegex = /^\s*(\d+):\s*/;
function parseLine(line) {
// ok to skip empty lines
if (line.trim() === "") return null;
const result = {};
// parse leading digits
const scoreMatch = line.match(scoreRegex);
if (!scoreMatch) throw new Error("Missing score at beginning of line");
result.score = BigInt(scoreMatch[1], 10);
const remainingLine = line.slice(scoreMatch[0].length);
result.info = JSON.parse(remainingLine);
if (typeof result.info.id !== "string" || result.info.id === "") {
throw new Error("Missing valid id value");
}
return result;
}
// input and output files
const fileInput = "input.txt";
const fileOutput = "output.txt";
const howManyHighest = 2;
const highestScores = [];
function getLowest() {
return highestScores[highestScores.length - 1].score;
}
// do an insertion sort into the highestScores array
// highest score record first
// maintain length at no more than howManyHighest
function insertHighest(val) {
let inserted = false;
// for performance reasons, only search through the highestScores
// list if this new score is higher than the lowest score we have in the list so far
if (highestScores.length && val.score > getLowest()) {
for (let [index, item] of highestScores.entries()) {
if (val.score > item.score) {
// insert it into this position in the array, pushing the others up
highestScores.splice(index, 0, val);
inserted = true;
break;
}
}
}
if (inserted) {
// trim overflow, if any
if (highestScores.length > howManyHighest) {
highestScores.pop();
}
} else {
// didn't insert it, see if we aren't full yet
if (highestScores.length < howManyHighest) {
highestScores.push(val);
}
}
}
const rl = readline.createInterface({
input: fs.createReadStream(fileInput),
crlfDelay: Infinity
});
rl.on('error', err => {
if (err.code === 'ENOENT') {
process.exit(1);
} else {
console.log(err);
// some sort of read error
process.exit(2);
}
}).on('line', line => {
try {
const data = parseLine(line);
if (data) {
insertHighest(data);
}
} catch (e) {
console.log(e);
console.log(`Invalid line: ${line}`);
process.exit(2);
}
}).on('close', () => {
// generate array of highest scores
const output = highestScores.map(item => item.info.id)
console.log(output);
fs.writeFile(fileOutput, JSON.stringify(output), err => {
if (err) {
console.log(err);
// output write error
process.exit(3);
} else {
// all done, successfully
process.exit(0);
}
});
});
I ran into a few unspecified implementation questions, which if you bring up during your answer will show you've fully understood the problem.
What is the max score value that can exist in this data? This determines if we can use a Javascript number type or whether BigInt needs to be used to handle arbitrarily large numbers (at a cost of some run-time performance). Since this was not specified, I've used BigInt parsing and comparisons here to make the score values limitless integers.
Is the output data supposed to be only an array of ids (sorted by highest score) that belong to the highest scores. Or is it supposed to be some sorted data structure that includes the score and the id and the other data? Your question does not make that entirely clear. Showing an example of the output data for N = 3 would make this clear. This code produces an output array of id values, sorted by id with highest score first.
It is not specified what the valid structure of an id property is. This code just tests for a non-empty string value.
It is not specified what should be output if there are high tie scores or if the highest N+1 scores are all the same (e.g. there's no unique N highest scores). In case of ties at the end of the high score list, this retains only the first ones encountered in the input file (up to the N we're outputing).

Related

Map numbers to other numbers with interpolation in Javascript

I have a dataset with a volume for a given surface elevation of an irregular basin. For example:
cm kL
11870 : 6043453
11871 : 6053522
11872 : 6063591
11873 : 6073674
11874 : 6083767
(...1550 rows)
cm is a series that always increments by one; The associated kL values are irregular but always increase and are never duplicated. The mapping never changes and it can be loaded/stored in any convenient format.
Does Javascript have a simple way to convert between cm and kL? Ideally with linear interpolation in both directions. Ultimately I am looking for this functionality:
cm_to_kL(11872.2); //Expect 6065607.6
kL_to_cm(6065600); //Expect 11872.199

I wrote an example of how to start solving this problem. Like already mentioned, there are no internal functionality for interpolating or handling such structures, but you need to write your own logic.
I have to admit I'm not an expert what comes to math (+ it's 2am here, but this question got me interested in :D).
I hope this helps you at least to get started:
const data = {
11870 : 6043453,
11871 : 6053522,
11872 : 6063591,
11873 : 6073674,
11874 : 6083767,
};
const cm_to_kL = (val) => {
const cm_ref = Math.floor(val);
const factor = parseFloat((val % cm_ref).toFixed(5));
const lower = data[cm_ref];
const higher = data[cm_ref + 1];
if (isNaN(lower) || isNaN(higher)) {
throw Error('Data entry point not found');
}
const result = lower + ((higher - lower) * factor);
if (isNaN(result)) {
throw Error('Incorrect data provided');
}
return result;
};
const kL_to_cm = (val) => {
const [cm, kL] = Object.entries(data).find(([k, v]) => val < v);
if (isNaN(cm) || isNaN(kL)) {
throw Error('Data entry point not found');
}
const lower_cm = cm - 1;
const lower_kL = data[lower_cm];
const diff = (val - lower_kL) / (kL - lower_kL);
const result = parseFloat((lower_cm + diff).toFixed(5))
if (isNaN(result)) {
throw Error('Incorrect data provided');
}
return result;
};
console.log('11872.2',
cm_to_kL(11872.2),
);
console.log('6065600',
kL_to_cm(6065600),
);

Yes of course JS have to do what you need! You can create 2 Maps from your given array, one for cm to kL and another for kL to cm. Create two functions for them cm_to_kL and kL_to_cm to gat value from Maps after this you can easily get elements with O(1) complexity

Make sure no duplicate directory paths exist

I'm writing a script wherein the user selects directories, which are then stored in an array property, so that they can be recursively crawled.
{
"archives": [
"C:\\AMD\\Packages",
"C:\\Users",
"C:\\Windows",
"D:\\",
"E:\\Pictures\\Birthday"
]
}
I obviously don't want to be storing duplicate paths or paths that are contained by other paths. For example, if the user were to select a new folder to add to the array, E:\\Pictures, then E:\\Pictures\\Birthday would be discarded and replaced by it since E:\\Pictures contains E:\\Pictures\\Birthday.
{
"archives": [
"C:\\AMD\\Packages",
"C:\\Users",
"C:\\Windows",
"D:\\",
"E:\\Pictures"
]
}
I know this can be done by parsing all of the values being considered (i.e. ['C:', 'AMD', 'Packages'], [...], ... etc) and then comparing them all to one another. However, this seems extremely intensive, especially if the array of paths grows bigger and the directory paths are longer.
You could also do it by comparing the strings with includes. For example, if A includes B or B includes A, split them, and discard the one with a longer length.
for (const dir of dirs){
if (newPath.includes(dir) || dir.includes(newPath)){
if (newPath.split('\\') < dir.split('\\')){
// remove dir from json object and replace it with newPath
}
} else {
pathArray.push(dir)
}
}
After reading one of the answers below, I just realized that the includes method runs into the issue of comparing similar, yet unique paths i.e. C:\Users and C:\User.
Although there's gotta be a better way to do this??

This function will give you your desired results. It first looks to see if the parent of the path exists in the archives, and if so, does nothing. If it doesn't, it then removes any children of the path and then inserts the new path.
Update
I've added a delim input to the function to make it usable for unix/MacOS style filenames as well.
let data = {
"archives": [
"C:\\AMD\\Packages",
"C:\\Users",
"C:\\Windows",
"D:\\",
"E:\\Pictures"
]
};
const add_to_archives = (path, data, delim) => {
// does the parent of this path already exist? if so, nothing to do
if (data.archives.reduce((c, v) =>
c || path.indexOf(v.slice(-1) == delim ? v : (v + delim)) === 0, false)) return data;
// not found. remove any children of this path
data.archives = data.archives.filter(v => v.indexOf(path.slice(-1) == delim ? path : (path + delim)) !== 0);
// and add the new path
data.archives.push(path);
return data;
}
add_to_archives("E:\\Pictures\\Too", data, "\\");
console.log(data);
add_to_archives("E:\\PicturesToo", data, "\\");
console.log(data);
add_to_archives("D:\\Documents", data, "\\");
console.log(data);
add_to_archives("C:\\AMD", data, "\\");
console.log(data);
data = {
"archives": [
"/var/www/html/site",
"/etc",
"/usr/tim",
"/bin"
]
};
add_to_archives("/var/www/html/site2", data, "/");
console.log(data);
add_to_archives("/etc/conf.d", data, "/");
console.log(data);
add_to_archives("/usr", data, "/");
console.log(data);
add_to_archives("/var/www/html", data, "/");
console.log(data);
.as-console-wrapper {
max-height: 100% !important;
}

We can approach the problem by using a prefix tree
The purpose is to limit the number of paths we check for inclusion or "containment".
That approach may be useful if you have a lot of siblings (tree traversal + lookup as key for each folder).
It is overkill if you often have a root folder specified in archives
algorithm
tree = {}
foreach path
split the path in folders (one may iterate with substring but it is worth it?)
try to match folders of that path while traversing the tree
if you encounter a stop node, skip to next path
if not,
if your path end on an existing node
mark that node as a stop node
drop the children of that node (you can let them be, though)
else
include the remaining folders of the path as node in tree
mark the last node as a stop node
Implem
Note that implem below will fail if path includes a folder named "stop". By subjective order of preference
Use Map and Symbol('stop')
or a real tree (at least do not store folders alongside the boolean stop)
do not suppose any stop node and always drop children if you manage to reach the end of your path
Hope no one tries to outsmart you and rename stop as some obscure -folder will not exist- lolol_xxzz9_stop
function nodupes(archives){
tree = {};
archives.forEach(arch=>{
const folders = arch.split('\\');
folders.splice(1,1);
//case of empty string such as D:\\\
if(folders[folders.length-1].length==0){folders.pop();}
let cur = tree;
let dropped = false;
let lastFolderIndex = 0;
let ok = folders.every((folder,i)=>{
if(cur[folder]){
if(cur[folder].stop){
dropped = true;
return false;
}
cur = cur[folder];
return true;
}
cur[folder] = {}
cur = cur[folder];
lastFolderIndex = i;
return true;
});
if(ok){
cur.stop = true;
//delete (facultatively) the subfolders
if(lastFolderIndex < folders.length-1){
console.log('cleanup', folders, 'on node', cur)
Object.keys(cur).forEach(k=>{
if(k != 'stop'){
delete cur[k];
}
})
}
}
});
//console.log('tree', JSON.stringify(tree,null,1));
//dfs on the tree to get all the paths to... the leaves
let dfs = function(tree,paths,p){
if(tree.stop){
return paths.push(p.join('\\\\'));
}
Object.keys(tree).forEach(k=>{
dfs(tree[k], paths, p.concat(k));
});
}
let paths = [];
dfs(tree, paths,[]);
return paths;
}
let archives = [
'C:\\\\ab',
'D:\\\\', //just some root
'D:\\\\ab',//be dropped
'D:\\\\abc',//dropped as well
'F:\\\\abc\\\\e',//two folder creation
'C:\\\\ab\\c',
'B:\\\\ab\\c',
'B:\\\\ab',//expect B\\\\ab\\c to be dropped
]
console.log(nodupes(archives))

Try this
console.log([
"C:\\AMD\\Packages",
"C:\\Users",
"C:\\User",
"E:\\Pictures",
"E:\\Pictures\\Birthday",
"C:\\Windows",
"D:\\",
"D:\\aabbcc",
"E:\\Pictures\\Birthday"
].sort().reduce(
(acc, cur) =>
acc.length > 0
&& cur.startsWith(acc[acc.length - 1])
&& ( cur.indexOf("\\", acc[acc.length - 1].replace(/\\$/,"").length) !== -1 )
&& acc || acc.concat(cur)
, []
))

How can we take like an input the keyboard in JS?

In my project, I need now to take an input from the keyboard.
The user must be able to enter several words and when he presses CTRL+D you exit the program and the result is displayed.
For example we can enter on the terminal :
bob
alicia
cookie
shirley
david
We have the following code :
#!/usr/bin/env node
let chunk = "";
process.stdin.on("data", data => {
chunk += data.toString();
});
process.stdin.on("end", () => {
chunk.replace(/^\s*[\r\n]/gm,"").split(/\s+/).forEach(function (s) {
process.stdout.write(
s === 'bob'
? 'boy \n'
: s === 'alicia'
? 'girl\n'
: s === 'cookie'
? 'dog \n'
: 'unknown \n');
});
});
And when we press CTRL+D we need to obtain this result :
boy
girl
dog
unknown
unknown
Can you help me please to know, how can I code in order to take the keyboard like an input?

Here is an article that explains the basics. I made an example for you down below, you can probably figure out the rest by yourself.
const readline = require('readline');
readline.emitKeypressEvents(process.stdin);
process.stdin.setRawMode(true);
let input = [];
let chunk = '';
process.stdin.on('keypress', (str, key) => {
if (key.ctrl && key.name === 'd') {
//Handle exit code here
process.exit();
}
if (key.name === 'return') {
input.push(chunk.replace('\r', ''));
chunk = '';
process.stdout.write('\n');
}
chunk+=str;
process.stdout.write(str);
});

One way of doing it would to loop input until a certain input is taken. Pseudo code example:
While (x≠q){
Take input
}
EDIT: Another way, is to not use the return key to use for spacing, instead take the items in all in one input line with a separating comma or space.
var str = "123, 124, 234,252";
var arr = str.split(",").map(val => Number(val) + 1);
console.log(arr);
I found the above from this question: How to split and modify a string in NodeJS?.
Then you can iterate over the array to find out if it's a dog or a girl!

Loop through subfiles and get result of of the sum of contents in Javascript

I'm struggling to get the sum of the subfiles. The code below currently returns the sum of a.txt and all its subfiles, supposing that the contents of a.txt is
1
b.txt
the contents of b.txt is
2
c.txt
and the contents of c.txt is
3
I'd like to also get the sum of b.txt and all of its subfiles, the sum of c.txt and all of its subfiles, and so on and so forth for all the files that exist. So the output would be: the sum of a.txt and its subfiles is sum, the sum of b.txt and its subfiles is sum, the sum of c.txt and its subfiles is sum, and so on...
My code below:
const fs = require('fs')
const file = 'a.txt'
let output = (file) => {
let data = fs.readFileSync(file, 'utf8')
.split('\n')
.reduce((array, i) => {
if (i.match(/.txt$/)) {
let intArr = array.concat(output(i))
return intArr
} else if (i.match(/^\d+$/)) {
array.push(parseInt(i, 10));
}
return array;
}, [])
return data
}
console.log(output(file))
const sum = output(file)
console.log(sum.reduce((a, b) => a + b, 0))
Also, any suggestions for improving this code are welcome.

This can be viewed as a pretty standard graph search. Your code starts to do that but there's a few places it can be changed to make it a little easier.
Below is a depth first search starting with a particular file and keeping track of a counts object. The function parses the file just like yours, adds the numbers the counts object. Then it recurses. When the recursion unwinds it add the resulting child's counts to the parents. In the end it returns the counts object which should have the total + subpages for all pages. It doesn't do any error checking for simplicity and it's not clear what should happen if two children both reference the same grandchild - should it be counted twice? Either was it should be easy to adjust.
I made mocked version of fs.readFileSync to the code would run in the snippet and be easier to see:
// fake fs for readFileSync
let fs = {
files: {
"a.txt": "1\nb.txt",
"b.txt": "2\nc.txt",
"c.txt": "3",
"d.txt": "2\n10\ne.txt\nf.txt",
"e.txt": "1",
"f.txt": "5\n7\ng.txt",
"g.txt": "1\na.txt"
},
readFileSync(file) { return this.files[file]}
}
function dfs(file, counts = {}) {
// parse a sinlge file into object
// {totals: sum_allthenumbers, files:array_of_files}
let data = fs.readFileSync(file, 'utf8').split('\n')
let {total, files} = data.reduce((a, c) => {
if(c.match(/^\d+$/)) a.total += parseInt(c)
else if(c.match(/.txt$/)) a.files.push(c)
return a
},{total: 0, files:[]})
// add the total counts for this file
counts[file] = total
// recurse on children files
for (let f of files) {
if (counts.hasOwnProperty(f)) continue // don't look at files twice if there are cycles
let c = dfs(f, counts)
counts[file] += c[f] // children return the counts object, add childs count to parent
}
// return count object
return counts
}
console.log("original files starting with a.txt")
console.log(dfs('a.txt'))
console.log("more involved graph starts with d.txt")
console.log(dfs('d.txt'))

Optimise JavaScript

Can someone please help me to find best way to optimise the below code as it is taking a long time when I have thousand of records searching
var arr =[
{
children:[
{
children:[
{
children:[
{
name:'XYZZZZZ'
}
]
}
]
}
]
}
];
let list = [];
//Calculate column list
arr.forEach((obj0) => {
if (obj0.hasOwnProperty('children')) {
if (obj0.children.length > 0) {
let objchid1 = obj0.children;
objchid1.forEach((obj1) => {
if (obj1.hasOwnProperty('children')) {
if (obj1.children.length > 0) {
let objchid2 = obj1.children;
objchid2.forEach((obj2) => {
if (obj2.hasOwnProperty('children')) {
if (obj2.children.length > 0) {
let objchid3 = obj2.children;
objchid3.forEach((obj3) => {
if (obj3.name !== 'james') {
console.log('IN THREEE', obj3.name);
list.push(obj3.name);
}
});
}
}
});
}
}
});
}
}
});
I have tried searching a lot but no luck Thanks in advance.!!!

Optimize your data structure. Do not use nested arrays unless you really need to. NoSQL is so popular when it comes to WebDev because reads happen 100.000 times more than writes and saving on bandwidth (for you and the user) is worth more than saving on duplicate data in a database considering how cheap hardware is
You can save the elements of the deepest array as object keys (with the nested .name attribute in your case) and the index of the respective position in the array as the object value. This way you can do myArray[myElementsToIndexObject['elementIamLookingFor']] iterating only one single time over the nested array (for building myElementsToIndexObject)

If the data is from a JSON string, the search can be done during the parsing :
var list = [], json = '[{"child":[{"child":[{"child":[{"name":"XYZZZZZ"}]}]}]}]'
var arr = JSON.parse(json, (key, val) => (key === 'name' && list.push(val), val))
console.log(list)
console.log(arr)

Develop Reference

JavaScript is the programming language of the Web.

Reading very large data files using nodejs? Interview question - javascript

Related

Map numbers to other numbers with interpolation in Javascript

Make sure no duplicate directory paths exist

How can we take like an input the keyboard in JS?

Loop through subfiles and get result of of the sum of contents in Javascript

Optimise JavaScript

Categories

Resources