I am trying to write a script that enters a website, takes the first 10 links from it, then visits each of those 10 links and takes the first 10 links found on each of those pages, and so on, until the number of visited pages reaches 1000.
I was trying to achieve this by using a for loop inside a promise together with recursion; this is my code:
const rp = require('request-promise');
const url = 'http://somewebsite.com/';

const websites = []
const promises = []

const getOnSite = (url, count = 0) => {
  console.log(count, websites.length)
  promises.push(new Promise((resolve, reject) => {
    rp(url)
      .then(async function (html) {
        let links = html.match(/https?:\/\/(www\.)?[-a-zA-Z0-9#:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()#:%_\+.~#?&//=]*)/g)
        if (links !== null) {
          links = links.splice(0, 10)
        }
        websites.push({
          url,
          links,
          emails: emails === null ? [] : emails
        })
        if (links !== null) {
          for (let i = 0; i < links.length; i++) {
            if (count < 3) {
              resolve(getOnSite(links[i], count + 1))
            } else {
              resolve()
            }
          }
        } else {
          resolve()
        }
      }).catch(err => {
        resolve()
      })
  }))
}

getOnSite(url)
I think you might want a recursive function that takes three arguments:
an array of urls to extract links from
an array of the accumulated links
a limit for when to stop crawling
You'd kick it off by calling it with just the root url, and await the promise it returns:
const allLinks = await crawl([rootUrl]);
On the initial call the second and third arguments could assume default values:
async function crawl (urls, accumulated = [], limit = 1000) {
  ...
}
The function would fetch each url, extract its links, and recurse until it hit the limit. I haven't tested any of this, but I'm thinking something along these lines:
// limit the number of links per page to 10
const perPageLimit = 10;

async function crawl (urls, accumulated = [], limit = 1000) {
  // if the limit has been depleted or we don't have any urls,
  // return the accumulated result
  if (limit <= 0 || urls.length === 0) {
    return accumulated;
  }
  // process this set of links
  const links = (await Promise.all(
    urls
      .splice(0, perPageLimit)     // limit to 10
      .map(url => fetchHtml(url)   // fetch the url
        .then(extractUrls))        // and extract its links
  )).flat();                       // flatten into one array of links
  // then recurse
  return crawl(
    links,                      // newly extracted array of links from this call
    [...accumulated, ...links], // pushed onto the accumulated list
    limit - links.length        // reduce the limit and recurse
  );
}

async function fetchHtml (url) {
  //
}

const extractUrls = (html) => html.match( ... )
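For reference, here is one hedged way the two stubs above could be filled in, untested and simply reusing request-promise and the link regex from the question:

const rp = require('request-promise');

// fetch the raw HTML of a page; return an empty string on failure so a
// single bad url doesn't reject the whole Promise.all
async function fetchHtml (url) {
  try {
    return await rp(url);
  } catch (err) {
    return '';
  }
}

// extract absolute links with the regex from the question; fall back to an
// empty array so .flat() and .length keep working when nothing matches
const extractUrls = (html) =>
  html.match(/https?:\/\/(www\.)?[-a-zA-Z0-9#:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()#:%_\+.~#?&//=]*)/g) || [];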
Related
I am trying to read a big file in chunks in Node.js instead of loading it all into memory, because the file is too big to fit; I then want to group the anagrams and output them.
I started following the article described here.
It basically involves creating a shared buffer at the beginning of the program and passing it down.
Essentially it involves the following functions:
function readBytes(fd, sharedBuffer) {
  return new Promise((resolve, reject) => {
    fs.read(fd, sharedBuffer, 0, sharedBuffer.length, null, (err) => {
      if (err) {
        return reject(err);
      }
      resolve();
    });
  });
}

async function* generateChunks(filePath, size) {
  const sharedBuffer = Buffer.alloc(size);
  const stats = fs.statSync(filePath); // file details
  const fd = fs.openSync(filePath); // file descriptor
  let bytesRead = 0; // how many bytes were read
  let end = size;
  for (let i = 0; i < Math.ceil(stats.size / size); i++) {
    await readBytes(fd, sharedBuffer);
    bytesRead = (i + 1) * size;
    if (bytesRead > stats.size) {
      // When we reach the end of file,
      // we have to calculate how many bytes were actually read
      end = size - (bytesRead - stats.size);
    }
    yield sharedBuffer.slice(0, end);
  }
}
I then call it in main as shown below. My goal is to group all the anagrams and then output them. However, the issue I am having is that I can access items up to index 99,000 via console.log(Object.values(result)[99000]); beyond that I get undefined. Any ideas what I am doing wrong?
const CHUNK_SIZE = 10000000; // 10MB

async function main() {
  let result = {};
  for await (const chunk of generateChunks("Data/example2.txt", CHUNK_SIZE)) {
    let words = chunk.toString("utf8").split("\n");
    for (let word of words) {
      let cleansed = word.split("").sort().join("");
      if (result[cleansed]) {
        result[cleansed].push(word);
      } else {
        result[cleansed] = [word];
      }
    }
  }
  console.log(Object.values(result)[99000]);
  return Object.values(result);
}
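One detail worth illustrating (a hedged sketch, not part of the original post): each chunk is split on "\n" on its own, so a line that straddles a chunk boundary ends up as two fragments, each sorted into its own (wrong) anagram key. Carrying the trailing partial line over to the next chunk would look roughly like this, inside main():

let result = {};
let carry = ""; // partial line left over from the previous chunk
for await (const chunk of generateChunks("Data/example2.txt", CHUNK_SIZE)) {
  const words = (carry + chunk.toString("utf8")).split("\n");
  // the last element may be an incomplete line; keep it for the next chunk
  carry = words.pop();
  for (const word of words) {
    const cleansed = word.split("").sort().join("");
    (result[cleansed] = result[cleansed] || []).push(word);
  }
}
// don't forget whatever is left over after the final chunk
if (carry.length) {
  const cleansed = carry.split("").sort().join("");
  (result[cleansed] = result[cleansed] || []).push(carry);
}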
I have a scenario in k6 where I get 2 values at the same timestamp from Datadog. The first value is an incorrect spike; the second one is the correct value, which I need.
I use a k6 Trend to collect the values from Datadog. It stores them as 2 different samples, so my max value incorrectly comes out as one of those spikes. How can I collect the data so that, when the timestamp is the same, only the latest value is kept?
Current implementation:
index.js:
//thresholds and scenarios get loaded in K6
export const getPaymentIdRPS = new Trend("get_payment_id_rps");

export function setup() {
  //setup data
}

//this is the scenario with the issue
export function collectGetPaymentIdMetrics(data) {
  metrics.getPaymentIdRPS = payments.queryGetPaymentIdRPS();
  metrics.getPaymentIdRPS.points.forEach(val => {
    getPaymentIdRPS.add(val[1], tags);
  });
}
In a different library, payments.js:
function queryGetPaymentIdRPS() {
  const queries = setQueryEnv();
  const response = datadog.metricsQuery(queries.getPaymentIdRPS, 10); //(query, duration of extraction)
  return parseMetricQuery(response); //these functions defined towards end of this Q.
}
The above returns:
INFO[0032] rps val= [1666798390000,54.5] source=console
INFO[0037] rps val= [1666798390000,15.571428571428573] source=console
INFO[0042] rps val= [1666798400000,19] source=console
INFO[0047] rps val= [1666798400000,5.428571428571429]
Hence max = 54.5, but I want max = 15.57, i.e. I want to store only [1666798390000,15.571428571428573] and [1666798400000,5.428571428571429].
metricsQuery:
metricsQuery(query, seconds) {
  //Delay the query start/end timestamps to allow datadog to process metrics
  const DATADOG_INGESTION_DELAY = 30;
  let start = Math.ceil(Date.now() / 1000) - seconds;
  let end = Math.ceil(Date.now() / 1000);
  start = start - DATADOG_INGESTION_DELAY;
  end = end - DATADOG_INGESTION_DELAY;
  //null body in 2nd param
  let res = this.queryClient.get(
    `/query?from=${start}&to=${end}&query=${query}`,
    null,
    { tags: { name: `Datadog /metric` } }
  );
  check(res, {
    "DD metric query OK": (r) => r.status === 200,
  }) || fail(JSON.stringify(res));
  check(res, {
    "DD metric query valid": (r) => r.json().status != "error",
  }) || fail(JSON.stringify(res.json().error));
  return res;
}
};
parseMetricQuery:
export function parseMetricQuery(response) {
  const responseBody = response.json();
  //Check invalid response
  if (responseBody.series === undefined || response.status != 200) {
    fail("Failed to get Datadog metric");
  }
  //Check 0 response, map will fail if array is empty
  if (responseBody.series.length === 0) {
    return {
      sum: 0,
      length: 0,
      average: 0,
      points: [],
      url: response.request.url,
    };
  } else {
    const length = responseBody.series[0].length;
    var series = responseBody.series[0].pointlist;
    var sum = 0;
    sum = series
      .map((val) => {
        return val[1];
      })
      .reduce((previousValue, currentValue) => previousValue + currentValue, 0);
    return {
      sum: sum,
      length: length,
      average: sum / length,
      points: series,
      url: response.request.url,
    };
  }
}
Since each iteration of k6 is like a fresh start, we can't keep a global map whose values stay intact across iterations to compare against timestamps already recorded. Hence I can't check whether a timestamp has already been recorded and replace its value, rather than append another sample to the Trend.
How can I get data from an object store from index number n to k?
For example, if we have 100 records in a store (ProductStore), I need to get records from index 11 to 20, or 50 to 60. I need this for pagination. In MySQL we can use the LIMIT and OFFSET clauses; is there any equivalent to OFFSET in IndexedDB?
You can use cursor.advance to skip past some records. There is no LIMIT with cursors; you simply stop iterating once you have counted off as many records as you need.
Something like this:
function query(db, criteria, offset, limit) {
  return new Promise((resolve, reject) => {
    const results = [];
    const transaction = db.transaction('store');
    transaction.oncomplete = event => resolve(results);
    transaction.onerror = event => reject(event.target);
    const store = transaction.objectStore('store');
    const index = store.index('index');
    const request = index.openCursor(criteria);
    let advanced = offset === 0;
    let counter = 0;
    request.onsuccess = event => {
      const cursor = event.target.result;
      if (!cursor) {
        return;
      }
      if (!advanced) {
        advanced = true;
        // skip the first `offset` records; onsuccess fires again once the
        // cursor has moved, so don't read cursor.value on this turn
        cursor.advance(offset);
        return;
      }
      counter++;
      results.push(cursor.value);
      if (counter >= limit) {
        return;
      }
      cursor.continue();
    };
  });
}
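A hedged usage sketch (the store name 'store' and index name 'index' come from the snippet above; db is assumed to be an already-open IDBDatabase): fetching records 11 to 20 means an offset of 10 and a limit of 10.

// page 2 with a page size of 10: skip 10 records, then read 10;
// null criteria means no key range, so the whole index is iterated
query(db, null, 10, 10)
  .then(page => console.log('records 11-20', page))
  .catch(err => console.error(err));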
I have an observable that gets data from a stream, 512 bytes on each next. I have to break it up into 200-char blocks in another observable and keep the leftover chars in a separate buffer to concatenate with the next block. I solved it by using a new Subject and a for loop, but I believe there may be a better, prettier solution.
received Observable ----------------------------------------
1st next [512] -------> [112] [200] [200] -------> [200] [200]
2nd next [512][112] --> [24][200][200] [88+112] --> [200] [200]
3rd next [512][24] --> [136] [200] [76+124] .....
nth iteration [512][194] --> [106][200][200][106+94] --> [200][200][200]
n+1th [512][6].......
maxValueSize = 200

this._sreamRecord$.subscribe({
  next: (val) => {
    const bufferToSend: Buffer = Buffer.concat([completationBuffer, val])
    for (let i = 0; i < bufferToSend.length; i += maxValueSize) {
      if (bufferToSend.length - i > maxValueSize) {
        bufferStreamer.next(bufferToSend.slice(i, i + maxValueSize))
      } else {
        completationBuffer = bufferToSend.slice(i, i + maxValueSize)
      }
    }
  },
  complete() {
    if (completationBuffer.length) {
      bufferStreamer.next(completationBuffer)
    }
    bufferStreamer.complete()
  }
})
You may want to consider a solution along these lines
const splitInChunksWithRemainder = (remainder: Array<any>) => {
  return (streamRecord: Array<any>) => {
    const streamRecordWithRemainder = remainder.concat(streamRecord);
    let chunks = _.chunk(streamRecordWithRemainder, maxValueSize);
    const last = chunks[chunks.length - 1];
    let newRemainder = [];
    if (last.length != maxValueSize) {
      newRemainder = chunks[chunks.length - 1];
      chunks.length = chunks.length - 1;
    }
    return { chunks, newRemainder };
  };
};

let f = splitInChunksWithRemainder([]);

this._sreamRecord$.pipe(
  switchMap(s => {
    const res = f(s);
    f = splitInChunksWithRemainder(res.newRemainder);
    return from(res.chunks);
  })
)
.subscribe(console.log);
The idea is to split each streamRecord with lodash's chunk function after concatenating the previous remainder, i.e. the array left over as the tail from the split of the previous streamRecord.
This is done using the function splitInChunksWithRemainder, which is a higher-order function, i.e. a function that returns a function, in this case after having set the remainder coming from the previous split.
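For illustration, a hedged toy run with maxValueSize assumed to be 3 (not the 200 used above) behaves like this:

// maxValueSize = 3 for this toy example
const split = splitInChunksWithRemainder([7, 8]);  // remainder from a previous record
const res = split([1, 2, 3, 4]);                   // next streamRecord
// res.chunks       -> [[7, 8, 1], [2, 3, 4]]
// res.newRemainder -> []

const split2 = splitInChunksWithRemainder(res.newRemainder);
const res2 = split2([5, 6, 7, 8]);
// res2.chunks       -> [[5, 6, 7]]
// res2.newRemainder -> [8]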
UPDATE after comment
If you also need to emit the last newRemainder, then you can consider a slightly more complex solution such as the following:
const splitInChunksWithRemainder = (remainder: Array<any>) => {
  return (streamRecord: Array<any>) => {
    const streamRecordWithRemainder = remainder.concat(streamRecord);
    let chunks = _.chunk(streamRecordWithRemainder, maxValueSize);
    const last = chunks[chunks.length - 1];
    let newRemainder = [];
    if (last.length != maxValueSize) {
      newRemainder = chunks[chunks.length - 1];
      chunks.length = chunks.length - 1;
    }
    return { chunks, newRemainder };
  };
};

const pipeableChain = () => (source: Observable<any>) => {
  let f = splitInChunksWithRemainder([]);
  let lastRemainder: any[];
  return source.pipe(
    switchMap(s => {
      const res = f(s);
      lastRemainder = res.newRemainder;
      f = splitInChunksWithRemainder(lastRemainder);
      return from(res.chunks);
    }),
    concat(defer(() => of(lastRemainder)))
  );
};

_streamRecord$.pipe(
  pipeableChain()
)
.subscribe(console.log);
We have introduced the pipeableChain function. In this function we save the remainder returned by each execution of splitInChunksWithRemainder. Once the source Observable completes, we add a last notification via the concat operator.
As you see, we also have to use the defer operator to make sure the Observable is created only when the Observer subscribes, i.e. after the source Observable completes. Without defer, the Observable passed to concat as a parameter would be created when the source Observable is initially subscribed, i.e. when lastRemainder is still undefined.
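To illustrate the defer point with a hedged toy example (not from the original answer): of captures its argument when the Observable is created, while defer reads it at subscribe time.

import { of, defer } from 'rxjs';

let lastRemainder;                            // not set yet
const eager = of(lastRemainder);              // captures undefined right now
const lazy = defer(() => of(lastRemainder));  // evaluated on subscribe

lastRemainder = [8];
eager.subscribe(v => console.log(v));         // -> undefined
lazy.subscribe(v => console.log(v));          // -> [8]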
I have an API endpoint that receives a large volume of requests from various sources.
For every request received, I create a promise that invokes an internal API.
I want to batch these promises by source, where each batch contains at most 10 seconds of requests.
How can this be done?
If you have multiple requests from multiple sources, you may just keep placing them into a Map object where the keys are the sources and the values are arrays of the received requests. So let myMap be something like:
{source1: [req1, req2, req3],
 source2: [req1, req2],
 .
 .
 sourceN: [req1, req2, ..., reqm]}
You may set up a pseudo recursive setTimeout loop to invoke your internal API.
var apiInterval = 10000;

function runner() {
  setTimeout(mv => {
    Promise.all(mv.map(reqs => Promise.all(reqs.map(req => apiCall(req)))))
      .then(pss => pss.map(ps => ps.map(p => p.then(r => doSomethingWithEachApiCallResult(r)))));
    clearMapValues(); // to be filled in the next 10 seconds
    runner();
  }, apiInterval, myMap.values.slice());
}
Please take the above as pseudo code, just to give you an idea. For instance, Map.prototype.values returns an iterator object, so you may need to turn it into an array like [...myMap.values()] before using .map() or .slice() on it.
This is a little better than looping with setInterval, as you may change the interval value dynamically depending on the workload or whatnot.
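A hedged, runnable version of that pseudo code might look like this; enqueue is a hypothetical helper for collecting incoming requests, while apiCall and doSomethingWithEachApiCallResult stand in for the internal API call and result handling, as above:

const apiInterval = 10000; // 10-second batch window
const myMap = new Map();   // source -> array of requests received in the current window

// hypothetical helper: call this for every incoming request
function enqueue(source, req) {
  if (!myMap.has(source)) myMap.set(source, []);
  myMap.get(source).push(req);
}

function runner() {
  setTimeout(async () => {
    // snapshot and clear the map so the next window accumulates fresh batches
    const batches = [...myMap.values()];
    myMap.clear();
    try {
      // one Promise.all per source batch, all awaited together
      const resultsPerSource = await Promise.all(
        batches.map(reqs => Promise.all(reqs.map(req => apiCall(req))))
      );
      resultsPerSource.forEach(results =>
        results.forEach(r => doSomethingWithEachApiCallResult(r))
      );
    } catch (err) {
      console.error('batch failed', err);
    }
    runner(); // schedule the next window
  }, apiInterval);
}

runner();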
I propose the following solution.
It uses a Map to store a key together with an array of values.
It uses a setTimeout per map key to flush that key's values to a callback.
Code
/**
 * A stream of requests coming from various sources is transposed into batches indexed
 * by the source of the request.
 *
 * The size of each batch is defined by a time interval, i.e. any request received within the
 * time interval is stored in the same batch.
 */
export class BatchStream<K, V> {
  cache: Map<K, V[]>
  flushRate: number
  onBatch: (k: K, v: V[]) => Promise<void>
  debug: boolean

  constructor(onBatch: (k: K, v: V[]) => Promise<void>, flushRate = 5000, debug = false) {
    this.cache = new Map<K, V[]>()
    this.onBatch = onBatch
    this.debug = debug
    this.flushRate = flushRate
    this.flush = this.flush.bind(this)
  }

  push(k: K, v: V) {
    if (this.cache.has(k)) {
      let batch = this.cache.get(k)
      batch.push(v)
      this.cache.set(k, batch)
    } else {
      this.cache.set(k, [v])
      setTimeout(this.flush, this.flushRate, k)
    }
  }

  flush(k: K) {
    this.debug && console.log("Flush", k)
    let batch = this.cache.get(k)
    this.cache.delete(k)
    this.onBatch(k, batch)
    this.debug && console.log("Size", this.cache.size)
  }
}
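Applied to the question, a hedged usage sketch could look like this (callInternalApi is a hypothetical placeholder for the internal API invocation, and the 10-second window comes from the question):

// flush each source's batch 10 seconds after the first request from that source arrives
const batcher = new BatchStream(async (source, requests) => {
  await Promise.all(requests.map(req => callInternalApi(source, req)));
}, 10000);

// inside the endpoint handler, index every incoming request by its source
batcher.push(request.source, request);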
Test
it("BatchStream", (done) => {
let sources = []
let iterations = 10
let jobs = []
let jobsDone = 0
let debug = true
// Prepare sources
for (let i = 97; i < 123; i++) {
sources.push(String.fromCharCode(i))
}
// Prepare a stream of test data
for (let k of sources) {
for (let i = 0; i < iterations; i++) {
jobs.push({ k, v: k + i.toString() })
}
}
shuffle(jobs)
// Batch handler
let onBatch = (k: string, v: string[]) => {
return new Promise<void>((resolve, reject) => {
jobsDone += v.length
debug && console.log(" --> " + k, v.length, v.join(","), jobsDone, sources.length * iterations)
if (jobsDone == sources.length * iterations) {
done()
}
resolve()
})
}
let batchStream = new BatchStream<string, string>(onBatch, 5000, debug)
// Stream test data into batcher
let delay = 0
for (let j of jobs) {
delay += 100
setTimeout(() => {
batchStream.push(j.k, j.v)
}, delay)
}
})