I have a multithreaded web crawler that downloads a website and stores it in a database (which takes around 4 minutes). To make the crawling faster, I used the Node.js cluster module, but I have a problem: I want to move on to the next iteration of the while loop only after all the worker threads have finished their work, not as soon as they start. How do I make sure all my threads have concluded before moving on?
Here is the relevant code in the main while loop:
// assumes: const cluster = require("cluster"); const numberOfCPUs = require("os").cpus().length;
while (indexSize !== indexSizeLimit) {
    const queueLength = queue.length;
    const numberOfThreads = Math.min(numberOfCPUs, queueLength);
    const threadAllocations = Array(numberOfThreads).fill(0);
    let queuesAllocated = 0;
    const queueChunks = [];

    function fillQueueChunks() {
        // Distribute the queue items round-robin across the threads.
        loop: while (true) {
            for (let i = 0; i < numberOfThreads; i++) {
                threadAllocations[i] += 1;
                queuesAllocated += 1;
                if (queuesAllocated === queueLength) {
                    break loop;
                }
            }
        }
        let start = 0;
        for (let threadAllocation of threadAllocations) {
            const end = start + threadAllocation;
            queueChunks.push(queue.slice(start, end));
            start = end;
        }
    }

    fillQueueChunks();

    // Find out how to make multithreading finish, and then move on with the loop.
    if (cluster.isMaster) {
        for (let i = 0; i < numberOfThreads; i++) {
            cluster.fork();
        }
    } else {
        const chunk = queueChunks[cluster.worker.id - 1];
        await Promise.all(chunk.map(function (url) {
            return new Promise(async function (resolve) {
                const webcode = await request(url);
                if (webcode !== "Failure") {
                    indexSize += 1;
                    const document = new Document(url, webcode);
                    const hrefs = document.hrefs();
                    const hrefsQuery = Query(hrefs);
                    // Also make sure it is not included in indexed webpages.
                    const hrefIndividualized = hrefsQuery.individualize();
                    hrefIndividualized;
                    // Do something with hrefIndividualized in regards to maintaining a queue in the database.
                    // And in adding a nextQueue which to replace the queue in code with.
                    await document.save();
                }
                resolve("Written");
            });
        }));
        process.exit(0);
    }
}
Wrap the forking in a promise. In the parent process you can listen for the disconnect event, and once the number of disconnect events equals the number of workers, resolve the promise.
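For example, a minimal sketch of that pattern (assuming numberOfThreads is computed as in your code; the listener removes itself so repeated iterations of the while loop do not stack handlers):
await new Promise(function (resolve) {
    let disconnected = 0;
    for (let i = 0; i < numberOfThreads; i++) {
        cluster.fork();
    }
    cluster.on("disconnect", function onDisconnect() {
        disconnected += 1;
        if (disconnected === numberOfThreads) {
            // Clean up so the next batch starts counting from zero.
            cluster.removeListener("disconnect", onDisconnect);
            resolve("All workers finished");
        }
    });
});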
Here is what I have:
while (indexSize !== indexSizeLimit) {
    let nextQueue = [];
    const queueLength = queue.length;
    const numberOfThreads = Math.min(numberOfCPUs, queueLength);
    const threadAllocations = Array(numberOfThreads).fill(0);
    let queuesAllocated = 0;
    // queueChunks: [[{_id: ..., ...}], [...], ...]
    const queueChunks = [];

    function fillQueueChunks() {
        loop: while (true) {
            for (let i = 0; i < numberOfThreads; i++) {
                threadAllocations[i] += 1;
                queuesAllocated += 1;
                if (queuesAllocated === queueLength) {
                    break loop;
                }
            }
        }
        let start = 0;
        for (let threadAllocation of threadAllocations) {
            const end = start + threadAllocation;
            queueChunks.push(queue.slice(start, end));
            start = end;
        }
    }

    fillQueueChunks();

    await new Promise(async function (resolve) {
        if (cluster.isMaster) {
            let threadsDone = 0;
            for (let i = 0; i < numberOfThreads; i++) {
                cluster.fork();
            }
            cluster.on("disconnect", function (_) {
                threadsDone += 1;
                if (threadsDone === numberOfThreads) {
                    resolve("Queue Processed");
                }
            });
        } else {
            // cluster.worker.id, not cluster.id; note that worker ids keep
            // incrementing across fork() calls, so this indexing only lines
            // up on the first pass of the while loop.
            const queueJob = queueChunks[cluster.worker.id - 1];
            await Promise.all(queueJob.map(function (queueItem) {
                return new Promise(async function (resolve) {
                    const url = queueItem._id;
                    const webcode = await request(url);
                    if (webcode !== "Failure") {
                        const document = Document(url, webcode);
                        let hrefs = document.hrefs();
                        const hrefsQuery = Query(hrefs);
                        await document.save();
                        indexSize += 1;
                        hrefs = hrefsQuery.individualize();
                        // await is needed here, otherwise hrefIncidences is a
                        // pending Promise and hrefIncidences[i] below is
                        // always undefined.
                        const hrefIncidences = await Promise.all(hrefs.map(function (href) {
                            return new Promise(async function (resolve) {
                                const incidences = await Site.countDocuments({
                                    url: href
                                });
                                resolve(incidences);
                            });
                        }));
                        hrefs = hrefs.filter(function (_, i) {
                            return hrefIncidences[i] === 0;
                        }).map(function (href) {
                            return {
                                _id: href
                            };
                        });
                        await Queue.insertMany(hrefs);
                        nextQueue = nextQueue.concat(hrefs);
                    }
                    await Queue.deleteOne({
                        _id: url
                    });
                    resolve("Success");
                });
            }));
            process.exit(0);
        }
    });
    queue = nextQueue;
}
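A note on the above (my addition): worker processes do not share memory with the master, so the nextQueue, indexSize and queue updates made inside the else branch never reach the master's copies. A sketch of reporting results back over the cluster IPC channel instead (the message shape and processedCount are hypothetical):
// In the worker, after processing its chunk:
process.send({ type: "chunkDone", hrefs: nextQueue, indexed: processedCount });
process.exit(0);

// In the master, alongside the "disconnect" handler:
cluster.on("message", function (worker, msg) {
    if (msg.type === "chunkDone") {
        nextQueue = nextQueue.concat(msg.hrefs);
        indexSize += msg.indexed;
    }
});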
I have a piece of code like this:
// fork comes from require("child_process")
for (let j = 0; j < someNumber; j++) {
    const forkedChildOne = fork("./child.js");
    const childOne = await new Promise((resolve, reject) => {
        forkedChildOne.on('message', (msg) => {
            // someCode
            resolve(msg);
        });
    });
}
for (let i = 0; i < someAnotherNumber; i++) {
    const forkedChildTwo = fork("./child_numberTwo.js");
    const childTwo = await new Promise((resolve, reject) => {
        forkedChildTwo.on('message', (msg) => {
            // someCode
            resolve(msg);
        });
    });
}
But here it first waits for the first loop to complete, and only then moves on to the second loop. I need to run them in parallel. How can I do that? Thanks.
Put them in separate functions defined with the async keyword. Then call both functions, one after the other, without awaiting them, so that both loops run concurrently.
const forkloop = async (number, js) => {
    for (let j = 0; j < number; j++) {
        const forked = fork(js);
        const child = await new Promise((resolve, reject) => {
            forked.on('message', (msg) => {
                // someCode
                resolve(msg);
            });
        });
    }
}

const init = async () => {
    /* Do not await here */ forkloop(someNumber, './child.js');
    /* Do not await here */ forkloop(someAnotherNumber, './child_numberTwo.js');
}

init();
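If you later need to know when both loops have finished, one option (my addition, building on the sketch above) is to collect the two promises and await them together:
const init = async () => {
    // Start both loops without awaiting either one, then wait for both to settle.
    await Promise.all([
        forkloop(someNumber, './child.js'),
        forkloop(someAnotherNumber, './child_numberTwo.js'),
    ]);
};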
Can you please try using Promise.all? Please try the code below. I have not tested it, but I believe it should work.
// .on() does not return a promise, so each 'message' event
// has to be wrapped in one before handing it to Promise.all.
let arrayRequests = [
    new Promise((resolve) => forkedChildOne.on('message', resolve)),
    new Promise((resolve) => forkedChildTwo.on('message', resolve))
];
return Promise.all(arrayRequests).then(results => {
    let resultForkedChildOne = results[0];
    let resultForkedChildTwo = results[1];
});
I have a counter which is local to the scope of a parent function, is passed across multiple child functions, and is incremented in several places. I am having trouble maintaining the count.
I have tried the following:
var maxLimit = 150;

async function incrementCounter(counter) {
    counter++;
    console.log(counter);
    return counter;
}

async function processRights() {
    var counter = 0,
        end = false;
    var queryInput = [0, 1, 1, 0, 1];
    for (var i = 0; i < queryInput.length; i++) {
        var element = queryInput[i];
        var thOutput = await processTitle(counter, element, 'th');
        if (!thOutput) {
            end = true;
            return;
        }
        var nthOutput = await processTitle(counter, element, 'nth');
        if (!nthOutput) {
            end = true;
            return;
        }
    }
    if (!queryInput || !queryInput.length) {
        end = true;
    }
    return;
}

async function processTitle(counter, element, type) {
    var output = await callFunc(counter, element, type);
    if (!output) {
        return false;
    }
    return output;
}

// Note: the type argument passed by processTitle is unused here.
async function callFunc(counter, element) {
    var responses = [];
    var counterValue1 = await incrementCounter(counter);
    if (counterValue1 >= maxLimit) {
        return false;
    }
    await callAnotherFunc();
    if (1) {
        var qryArr = [0, 1, 1, 0, 1];
        for (let i = 0; i < qryArr.length; i++) {
            var counterValue2 = await incrementCounter(counterValue1);
            console.log("counterValue2 -- " + counterValue2);
            if (counterValue2 >= maxLimit) {
                return false;
            }
            await callAnotherFunc();
        }
        return responses;
    }
}

async function callAnotherFunc() {
    return true;
}

processRights();
I would like to increment the counter and check it against the maximum limit each time the callAnotherFunc function is called. I am a novice at JS. Please help!
If the counter can really be a global variable, you can declare it side by side with the maxLimit variable, something like this:
var overallCounter = 0;
var maxLimit = 150;
...
async function callAnotherFunc() {
    return overallCounter++ <= maxLimit;
}
From the question and the code I think this is what you want to achieve, right?
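If a global is not desirable, a sketch of an alternative (my suggestion; the names are hypothetical): JavaScript passes numbers by value, which is why the counter in the question loses its count, but object properties travel with the reference:
const state = { counter: 0 };
const maxLimit = 150;

async function callAnotherFunc(state) {
    // The object is shared by reference, so this increment is
    // visible to every function holding the same object.
    state.counter++;
    return state.counter <= maxLimit;
}
Each caller then passes the same state object down instead of a bare number.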
I use my code in modes 'dev', 'gpu' and 'cpu'. When I run my code in 'dev' mode I get the right count of the word 'more' in an almost 100 MB file: there are 259200 occurrences of 'more'. But when I run it in 'gpu' or 'cpu' mode, I get only 3 results from the entire file. There is also a timing difference between 'gpu' and 'cpu': with 'gpu' it finishes in 4000-4100 ms and with 'cpu' in 2600-2700 ms, whereas I would expect the GPU to be faster than the CPU for this search. Hope for help!
// (user-facing strings translated from Polish)
const { GPU } = require('gpu.js');
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
});
const gpu = new GPU({ mode: 'cpu' });

rl.question('Enter the search phrase: ', (value) => {
    let phrase = value;
    let start = new Date();
    fs.readFile('plik2.txt', 'utf8', function (err, data) {
        if (err) throw err;
        let tempPhraseLen = phrase.length;
        let tempDataLen = data.length;
        // Convert the phrase and the file contents to arrays of
        // character codes, since the kernel can only work on numbers.
        let tempPhrase = [];
        for (let i = 0; i < tempPhraseLen; i++) {
            tempPhrase.push(phrase[i].charCodeAt(0));
        }
        let tempData = [];
        for (let i = 0; i < tempDataLen; i++) {
            tempData.push(data[i].charCodeAt(0));
        }
        function kernelFunction(pFunc, arrFunc, pLen, arrLen) {
            let done = false;
            let counter = 0;
            for (let i = 0; i < arrLen; i++) {
                if (arrFunc[i] === pFunc[0]) {
                    let isChosen = true;
                    let k = 0;
                    for (let j = i; j < i + pLen; j++) {
                        isChosen = true;
                        if (arrFunc[j] !== pFunc[k]) {
                            isChosen = false;
                            break;
                        }
                        k++;
                    }
                    if (isChosen) {
                        done = true;
                        counter++;
                    }
                }
            }
            if (!done) {
                return 0;
            } else {
                return counter;
            }
        }
        const kernel = gpu.createKernel(kernelFunction, {
            output: [1],
            tactic: 'precision'
        });
        const result = kernel(tempPhrase, tempData, tempPhraseLen, tempDataLen);
        if (result[0] === 0) {
            console.log('Could not find the given pattern');
        } else {
            console.log('Found ' + result[0] + ' occurrence(s) of the phrase "' + phrase + '"');
        }
        let end = new Date() - start;
        console.info('Time: %dms', end);
    });
    rl.close();
});
I have a loop (with index j) nested inside another loop (with index i), with an await inside. After debugging, I found that some (i, j) pairs sometimes execute more than once. I am totally confused -_-
Could someone explain it? Thanks so much!
Here is the code:
I add searchFunc to an input element.
async function searchFunc() {
    let results = [];
    let notebooksP = await queryData(url1);
    notebooks = notebooksP.value;
    // debugger;
    for (let i = 0; i < notebooks.length; i++) {
        let noteIds;
        let noteIdsP = await queryData(urlBase + notebooks[i].id);
        noteIds = noteIdsP.value;
        debugger;
        for (let j = 0; j < noteIds.length; j++) {
            console.log("running at i=", i, ", j=", j);
            let noteContentsP = await queryData(urlBase + noteIds[j].id);
            let data = noteContentsP.value;
            // debugger;
            let content = data.content;
            let idx = content.search(key); // key: the search term, defined elsewhere
            if (idx != -1) {
                let res = {};
                res.notebookId = notebooks[i].id;
                res.noteId = noteIds[j].id;
                results.push(res);
                console.log("found at i=", i, " j=", j);
            }
        }
    }
}

function queryData(path) {
    return new Promise(function (resolve, reject) {
        var xhr = new XMLHttpRequest();
        xhr.open('get', path);
        xhr.send(null);
        xhr.onreadystatechange = function () {
            if (xhr.readyState != 4) return;
            if (xhr.readyState == 4 && xhr.status == 200) {
                var ret = xhr.responseText;
                resolve({ value: JSON.parse(ret) });
            } else {
                reject('error');
            }
        }
    })
}

const searchContent = debounce(searchFunc, 500);
searchBox.addEventListener('input', searchContent);

function debounce(fn, wait) {
    let timeout = null;
    return function () {
        if (timeout !== null) clearTimeout(timeout);
        timeout = setTimeout(fn, wait);
    }
}
There is a design flaw here if you are triggering a search on every keypress. This can lead to problems with ordering of responses, among other things. However, ignoring that issue, one simple way around the problem you are experiencing is to block the function from being called twice at the same time, using a variable to track when it is running:
let searching = false;

async function searchFunc() {
    if (searching) {
        return;
    }
    searching = true;
    let results = [];
    let notebooksP = await queryData(url1);
    notebooks = notebooksP.value;
    // debugger;
    for (let i = 0; i < notebooks.length; i++) {
        let noteIds;
        let noteIdsP = await queryData(urlBase + notebooks[i].id);
        noteIds = noteIdsP.value;
        for (let j = 0; j < noteIds.length; j++) {
            console.log("running at i=", i, ", j=", j);
            let noteContentsP = await queryData(urlBase + noteIds[j].id);
            let data = noteContentsP.value;
            // debugger;
            let content = data.content;
            let idx = content.search(key);
            if (idx != -1) {
                let res = {};
                res.notebookId = notebooks[i].id;
                res.noteId = noteIds[j].id;
                results.push(res);
                console.log("found at i=", i, " j=", j);
            }
        }
    }
    searching = false;
}
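One caveat with the flag approach (my note, not part of the original answer): if queryData rejects, searching is never reset and all later searches are blocked. A try/finally guard avoids that:
async function searchFunc() {
    if (searching) {
        return;
    }
    searching = true;
    try {
        // ... the search loops from above ...
    } finally {
        // Runs even if a request throws, so the flag cannot get stuck.
        searching = false;
    }
}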
I use the variable previous_epoch to fetch reddit posts older than today on the first iteration. On the next iteration, previous_epoch must contain the date taken from the response data. Oddly enough, the date stops changing after the first two iterations.
const getposts = async (user) => {
    var dataArray = [];
    var utc = new Date()
        .toJSON()
        .slice(0, 10)
        .replace(/-/g, "/");
    var previous_epoch;
    for (let i = 0; i <= 5; i++) {
        if (i == 0) {
            console.log('i is 0');
            previous_epoch = new Date(utc).getTime() / 1000;
            console.log('p', previous_epoch); // <-- stops changing after the second iteration
        } else {
            // console.log('else', dataArray);
            previous_epoch = dataArray[dataArray.length - 1].created_utc
            console.log('else', previous_epoch);
        }
        await getdat(user, previous_epoch).then((resp) => {
            let jsondat = resp.data.data;
            dataArray = jsondat.concat(dataArray);
        })
    }
    // console.log(dataArray);
    var result = _.map(dataArray, function (o) { // _ is lodash
        return o.id;
    });
    /* console.log("%j",result); */
}
Function getdat():
const getdat = (user, epoch) => {
    const sub_url = `https://api.pushshift.io/reddit/search/submission/?subreddit=${user}&limit=300&sort=desc&before=${epoch}`;
    console.log('elZK', sub_url);
    return new Promise((resolve, reject) => {
        axios.get(sub_url)
            .then(response => {
                return resolve(response)
            })
            .catch(error => {
                return reject(error.message)
            })
    })
}
Here is the fiddle.
EDIT 2:
I've changed the logic so that I get the next date from the getdat function itself:
const getdat = (user, epoch) => {
    let res = {};
    const sub_url = `https://api.pushshift.io/reddit/search/submission/?subreddit=${user}&limit=300&sort=desc&before=${epoch}`;
    console.log('elZK', sub_url);
    return new Promise((resolve, reject) => {
        axios.get(sub_url)
            .then(response => {
                let d = response.data.data;
                let e = d[d.length - 1].created_utc;
                console.log('e', e);
                res.data = d;
                res.epoch = e;
                return resolve(res)
            })
            .catch(error => {
                return reject(error.message)
            })
    })
}
export const getUserPosts = async (user) => {
    var dataArray = [];
    var utc = new Date()
        .toJSON()
        .slice(0, 10)
        .replace(/-/g, "/");
    var previous_epoch;
    var epoch;
    for (let i = 0; i <= 5; i++) {
        if (dataArray.length === 0) {
            console.log('i is 0');
            previous_epoch = new Date(utc).getTime() / 1000;
            console.log('p', previous_epoch);
        } else {
            // console.log('else', dataArray);
            previous_epoch = epoch;
            console.log('else', previous_epoch);
        }
        const resp = await getdat(user, previous_epoch);
        const jsondat = resp.data;
        dataArray = jsondat.concat(dataArray);
        epoch = resp.epoch;
    }
    // console.log(dataArray);
    var result = _.map(dataArray, function (o) {
        return o.id;
    });
    console.log("%j", result);
}
On the first iteration you fetch posts before today. Because each new batch is prepended with jsondat.concat(dataArray), the last element of dataArray always remains the oldest post of the first batch, so previous_epoch never advances and the following iterations just repeat the same request.
Try this code
for (let i = 0; i <= 5; i++) {
    if (dataArray.length === 0) {
        console.log('i is 0');
        previous_epoch = new Date(utc).getTime() / 1000;
        console.log('p', previous_epoch);
    } else {
        // console.log('else', dataArray);
        previous_epoch = dataArray[0].created_utc
        console.log('else', previous_epoch);
    }
    const resp = await getdat(user, previous_epoch);
    const jsondat = resp.data.data;
    dataArray = jsondat.concat(dataArray);
}
Try this code
const getdat = (user, epoch) => {
    const sub_url = `https://api.pushshift.io/reddit/search/submission/?subreddit=${user}&limit=10&sort=desc&before=${epoch}`;
    console.log('elZK', sub_url);
    return axios.get(sub_url);
};

const getposts = async (user) => {
    let dataArray = [];
    let previous_epoch;
    for (let i = 0; i <= 5; i++) {
        // Batches are appended and sorted desc, so the last element is the
        // oldest post seen so far; the -1 keeps it from being fetched again.
        previous_epoch = dataArray.length > 0 ? dataArray[dataArray.length - 1].created_utc - 1 : Math.floor(Date.now() / 1000);
        const result = await getdat(user, previous_epoch);
        dataArray = dataArray.concat(result.data.data);
    }
    console.log(dataArray.map(e => e.created_utc));
};

getposts('data');