How do I run multiple asynchronous functions in this Node code? - javascript

I'm pulling text from N URLs. First I get the N URLs in linksOnPage, and then I run a doOnPage function to get the text from each URL. When I run the code, only 1 of the N URLs gets processed through the function. I assume it's because the processing function runs asynchronously. How do I stack these up in a queue and run them all? Or what's a better way to do this?
Here's the main JS code:
var nodeio, linksOnPage, lyricsFromLink, db;
nodeio = require('node.io');
db = require('./db');
db.loadDB();

var loadSong = function(artist, title, lyrics) {
    console.log("loadSong being called");
    var newSongObj = {};
    newSongObj['artist'] = artist;
    newSongObj['title'] = title;
    newSongObj['lyrics'] = lyrics;
    //store the lyrics in a mongo table
    var newSong = new db.Song(newSongObj);
    newSong.save(function(err) {
        if (err) {
            throw err;
        } else {
            console.log("saved with no errors!");
        }
    });
};

// generic utility for getting links on a page and running a function on each one
exports.linksOnPage = function(pageObj, linkSelector, doOnPage, contentSelector) {
    nodeio.scrape(function() {
        this.getHtml(pageObj.pageUrl, function(err, $) {
            var links = [];
            var i = 0;
            $(linkSelector).each(function(link) {
                var fullLink = pageObj.rootUrl + link.attribs.href;
                links.push(fullLink);
                //run a function on each link
                console.log('getting lyrics for song: ', i);
                doOnPage(pageObj.artist, fullLink, contentSelector);
                i = i + 1;
            });
            //this.emit(links);
        });
    });
};

// get the lyrics for a specific song
exports.lyricsFromLink = function(artist, pageUrl, lyricsSelector) {
    nodeio.scrape(function() {
        this.getHtml(pageUrl, function(err, $) {
            var lyrics = "";
            console.log('before each statement');
            $(lyricsSelector).each(function(lyricParagraph) {
                lyrics = lyrics + " " + lyricParagraph.text;
            });
            console.log('after each statement');
            loadSong(artist, pageUrl, lyrics);
            this.emit(lyrics);
        });
    });
};
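One way to get the queued behaviour the question asks about is to collect the links first and then feed them through a small callback queue, so each scrape finishes before the next one starts. A sketch (independent of node.io's API; it assumes lyricsFromLink is changed to accept a completion callback):

// run worker(item, done) for each item, strictly one at a time
function runQueue(items, worker, onComplete) {
    var index = 0;
    function next() {
        if (index >= items.length) {
            return onComplete();
        }
        worker(items[index++], next);
    }
    next();
}

// usage sketch, inside linksOnPage once `links` is populated:
// runQueue(links, function (link, done) {
//     lyricsFromLink(artist, link, contentSelector, done); // hypothetical `done` argument
// }, function () {
//     console.log('all songs processed');
// });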

Related

How to read and Write multiple files using node js?

In an array I have filenames; I want to read the first file, perform some operation, and store the result in a separate file. Then read the 2nd file, perform the operation again, and save the result in a new 2nd file. Follow the same procedure for all files. Below I have written the code to read and write files.
TextReader.js
var fs = require('fs');
const readline = require('readline');
var headerIndex = [];
var isFirstLine = true;
var finalList = [];

module.exports = {
    readTextFile: (filename) => {
        console.log('inside textreader');
        readline.createInterface({
            input: fs.createReadStream(`./s3/${filename}`)
        }).on('line', function(line) {
            console.log(line);
            console.log("-----------------------------");
            if (isFirstLine) {
                headerIndex = line.split('|');
            } else if (!isFirstLine) {
                let rowValues = line.split('|');
                let valueIndex = 0;
                var singlePerson = {};
                headerIndex.forEach(currentval => {
                    singlePerson[currentval] = rowValues[valueIndex];
                    valueIndex++;
                });
                finalList.push(singlePerson);
            }
            isFirstLine = false;
        }).on('close', function() {
            //console.log(finalList);
            var data = '';
            var header = "Employee ID" + '\t' + headerIndex[0] + '\t' + headerIndex[2] + '\t' + headerIndex[1] + '\t' + headerIndex[4]
                + '\t' + headerIndex[3] + '\t' + headerIndex[5] + '\n';
            for (var i = 0; i < finalList.length; i++) {
                function split(name) {
                    var conv = name.split(' ');
                    var result = [conv.slice(0, -1).join(' '), conv.slice(-1)[0]].join(conv.length < 2 ? '' : ',');
                    return result;
                }
                split(finalList[i].UserName);
                data = data + finalList[i].LoginID + '\t' + split(finalList[i].UserName) + '\t' + finalList[i].Email + '\t'
                    + finalList[i].LoginID + '\t' + 'A&G Professional' + '\t' + finalList[i].Title + '\t' + finalList[i].State + '\n';
            }
            var newFilename = filename.substr(0, filename.lastIndexOf("."));
            var alldata = header + data;
            //console.log(alldata)
            fs.appendFile(`./s3/${filename}.xlsx`, alldata, (err) => {
                if (err) throw err;
                console.log('File created');
            });
        });
    }
}
I am calling readTextFile(); from another file.
demo.js
const { readTextFile } = require("./textReader");
var array=['UserRoleDetails_12102021063206.txt',
'UserRoleDetails_12102021064706 (1).txt',
'UserRoleDetails_12102021064706.txt',
'UserRoleDetails_12102021070206.txt']
array.forEach(function(currentItem){
readTextFile(currentItem);
})
The problem I am facing is that all the files are processed at the same time and the data from all of them gets stored together.
First, Node.js does not run this work sequentially the way you expect, and second, array.forEach is no use here for doing the operation sequentially.
You need to use:
const { readTextFile } = require("./textReader");
var array=['UserRoleDetails_12102021063206.txt',
'UserRoleDetails_12102021064706 (1).txt',
'UserRoleDetails_12102021064706.txt',
'UserRoleDetails_12102021070206.txt']
for (const element of array) {
    readTextFile(element);
}
NOTE: your readTextFile(element) function is not async, so even a for...of loop will not actually wait for each file on its own; you may need to make it async (i.e. return a Promise), as sketched below.
If anything is not clear, just ask.
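Here is a minimal sketch of that promisified version (the parsing and output logic from TextReader.js above is elided; note the module-level headerIndex, isFirstLine, and finalList also need to move inside the function so state does not leak between files):

const fs = require('fs');
const readline = require('readline');

module.exports = {
    readTextFile: (filename) =>
        new Promise((resolve, reject) => {
            const stream = fs.createReadStream(`./s3/${filename}`);
            stream.on('error', reject);
            // keep per-file state local instead of module-level
            let headerIndex = [];
            let isFirstLine = true;
            let finalList = [];
            readline.createInterface({ input: stream })
                .on('line', (line) => {
                    // ...same line parsing as above...
                })
                .on('close', () => {
                    // ...same output writing as above, then:
                    resolve(finalList);
                });
        })
};

Then demo.js really does process one file at a time:

(async () => {
    for (const element of array) {
        await readTextFile(element);
    }
})();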

NodeJS Loop issue due to async/synchronicity issues

I am porting an old Ruby script over to JavaScript, setting the function up as a cron instance so it will run on schedule. The function queries our MySQL database, retrieves inventory information for our products, and then sends requests to a trading partner's API to update our inventory on their site.
Due to Node's asynchronicity I am running into issues. We need to chunk requests into 1000 items per request, and we are sending 10k products. The issue is that each request just sends the last 1000 items: the for loop inside the while loop moves on before the JSON request body is finished being built. I tried creating anonymous setTimeout functions in the while loop to handle it, and also tried creating an object holding the request function and the variables to pass, stuffing it into an array, and iterating over that once the while loop completes, but I get the same result either way. I'm not sure of the best way to handle it so that each request gets the correct batch of items. I also need to wait 3 minutes between each request of 1000 items to avoid hitting the request cap.
query.on('end', () => {
    connection.release();
    writeArray = itemArray.slice(0),
    alteredArray = [];
    var csv = json2csv({data: writeArray, fields: fields}),
        timestamp = new Date(Date.now());
    timestamp = timestamp.getFullYear() + '-' + (timestamp.getMonth() + 1) + '-' + timestamp.getDate() + ' ' + timestamp.getHours() + ':' + timestamp.getMinutes() + ':' + timestamp.getSeconds();
    let fpath = './public/assets/archives/opalEdiInventory-' + timestamp + '.csv';
    while (itemArray.length > 0) {
        alteredArray = itemArray.splice(0, 999);
        for (let i = 0; i < alteredArray.length; i++) {
            jsonObjectArray.push({
                sku: alteredArray[i]['sku'],
                quantity: alteredArray[i]["quantity"],
                overstockquantity: alteredArray[i]["osInv"],
                warehouse: warehouse,
                isdiscontinued: alteredArray[i]["disc"],
                backorderdate: alteredArray[i]["etd"],
                backorderavailability: alteredArray[i]["boq"]
            });
        }
        var jsonObject = {
            login: user,
            password: password,
            items: jsonObjectArray
        };
        postOptions.url = endpoint;
        postOptions.body = JSON.stringify(jsonObject);
        funcArray.push({
            func: function(postOptions) {
                request(postOptions, (err, res, body) => {
                    if (err) { console.error(err); throw err; }
                    console.log(body);
                });
            },
            vars: postOptions
        });
        jsonObjectArray.length = 0;
    }
    var mili = 180000;
    for (let i = 0; i < funcArray.length; i++) {
        setTimeout(() => {
            var d = JSON.parse(funcArray[i]['vars'].body);
            console.log(d);
            console.log('request ' + i);
            //funcArray[i]['func'](funcArray[i]['vars']);
        }, mili * i);
    }
});
});
You need async/await or Promises to handle async actions in Node.js.
I am not sure whether your Node version supports async/await, so I have tried a Promise-based solution.
query.on('end', () => {
    connection.release();
    writeArray = itemArray.slice(0),
    alteredArray = [];
    var csv = json2csv({ data: writeArray, fields: fields }),
        timestamp = new Date(Date.now());
    timestamp = timestamp.getFullYear() + '-' + (timestamp.getMonth() + 1) + '-' + timestamp.getDate() + ' ' + timestamp.getHours() + ':' + timestamp.getMinutes() + ':' + timestamp.getSeconds();
    let fpath = './public/assets/archives/opalEdiInventory-' + timestamp + '.csv';

    var calls = chunk(itemArray, 1000)
        .map(function(chunk) {
            var renameditemsArray = chunk.map((item) => new renamedItem(item, warehouse));
            var postOptions = {};
            postOptions.url = endpoint;
            postOptions.body = JSON.stringify({
                login: user,
                password: password,
                items: renameditemsArray
            });
            return postOptions;
        });

    sequenceBatch(calls, makeRequest)
        .then(function() {
            console.log('done');
        })
        .catch(function(err) {
            console.log('failed', err);
        });

    function sequenceBatch(calls, cb) {
        var sequence = Promise.resolve();
        var count = 1;
        calls.forEach(function(callOptions) {
            count++;
            sequence = sequence.then(() => {
                return new Promise(function(resolve, reject) {
                    setTimeout(function() {
                        try {
                            cb(callOptions);
                            resolve(`callsequence${count} done`);
                        } catch (err) {
                            reject(`callsequence ${count} failed`);
                        }
                    }, 180000);
                });
            });
        });
        return sequence;
    }

    function makeRequest(postOptions) {
        request(postOptions, (err, res, body) => {
            if (err) {
                console.error(err);
                throw err;
            }
            console.log(body);
        });
    }

    function chunk(arr, len) {
        var chunks = [],
            i = 0,
            n = arr.length;
        while (i < n) {
            chunks.push(arr.slice(i, i += len));
        }
        return chunks;
    }

    function renamedItem(item, warehouse) {
        this.sku = item['sku'];
        this.quantity = item["quantity"];
        this.overstockquantity = item["osInv"];
        this.warehouse = warehouse;
        this.isdiscontinued = item["disc"];
        this.backorderdate = item["etd"];
        this.backorderavailability = item["boq"];
    }
});
Could you please try this snippet and let me know if it works? I couldn't test it since I made it up on the fly. The core logic is in the sequenceBatch function. The answer is based on another question which explains how timeouts and Promises work together.
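For reference, on Node 7.6+ the same sequencing can be written directly with async/await (a sketch reusing the calls array and makeRequest from above):

function delay(ms) {
    return new Promise(function (resolve) {
        setTimeout(resolve, ms);
    });
}

async function sequenceBatch(calls, cb) {
    for (const callOptions of calls) {
        cb(callOptions);     // fire this batch's request
        await delay(180000); // wait 3 minutes before the next batch
    }
}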
Turns out this wasn't a closure or async issue at all: the request object I was building was using references to objects instead of shallow copies, so all the data in the final array ended up linked to the same object reference.
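For anyone hitting the same thing, the pitfall looks like this in miniature (names echo the question's code):

// every push stores a reference to the same postOptions object...
var postOptions = {};
var funcArray = [];
postOptions.body = 'batch 1';
funcArray.push({ vars: postOptions });
postOptions.body = 'batch 2';
funcArray.push({ vars: postOptions });
console.log(funcArray[0].vars.body); // 'batch 2': both entries see the last write

// ...so push a fresh shallow copy per batch instead:
funcArray.push({ vars: Object.assign({}, postOptions) });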

posting to node from angular controller

OK, I do not understand what is going on here; it works locally but not on my server.
I have an Angular controller that posts to my Node server.
Each time I try to run the function that triggers the post, I get:
POST http://www.mysite.co.uk/mm3/back-end/savephotos 404 (Not Found)
I'm honestly lost; I've rewritten the post 5 times and I can't find the problem.
If anyone can see where I've gone wrong, please help.
angular controller
mm3\js\controller.js
//all photos've been pushed now sending it to back end
$timeout(function () {
    $http.post('back-end/savephoto', $scope.photosToPhp).then(function (success) {
        $scope.generating = false;
        $scope.generateBtn = 'Generate';
        //creating mock up gallery
        for (var x = 0; x < success.data.photos; x++) {
            var file = '/mm3/tmp/' + success.data.folder + "/out" + x + ".png";
            $scope.gallery.push(file);
        }
        $scope.photosToPhp = [];
    }, function (error) {
    });
}, 800);
then my node back-end
UPDATED:
So I have added a few console logs in my function to see where it's going wrong and how far it gets.
I keep getting:
test 1 function started error saving photo
mm3\back-end\controller.js
app.post('/mm3/back-end/savePhoto', function (req, res) {
    console.log('test 1 function started');
    var folder = Math.random().toString(36).substr(2, 20);
    var photos = req.body;
    var counts = 0;
    var callback = function(counts) {
        if (counts < photos.length) {
            saveBase64(photos[counts], folder, counts, callback);
            console.log('test 2 save photo');
        } else {
            var counts = 0;
            var response = {"folder": folder, "photos": photos.length};
            console.log('test 3 save photo else');
            res.send(response);
        }
    };
    saveBase64(photos[counts], folder, counts, callback);
});

app.post('/mm3/downloadZip', function(req, res) {
    var photos = req.body;
    var out = photos[0];
    var test = out.split('/');
    var loc = test.pop();
    var end = test.join('/');
    console.log('test 3 function Generate zip file');
    console.log(end);
    var outName = '/' + end + '/mm3/MockUp.zip';
    var output = fs.createWriteStream(outName);
    var archive = archiver('zip', {store: true});
    var zip = function(photos, f) {
        for (var t = 0; t < photos.length; t++) {
            var file = 'mockUp' + t + '.jpg';
            var from = '/var/www/html' + photos[t];
            archive.file(from, {name: file});
        }
        f();
    };
    output.on('close', function() {
        var photos = req.body;
        var out = photos[0];
        var test = out.split('/');
        var loc = test.pop();
        var end = test.join('/');
        res.send(end + '/MockUp.zip');
        console.log('archiver has been finalized and the output file descriptor has closed.');
    });
    archive.on('error', function(err) {
        throw err;
    });
    archive.pipe(output);
    zip(photos, f);
    function f() {
        archive.finalize();
    }
});

function saveBase64(photo, folder, counts, callback) {
    var result = photo.split(',')[1];
    var path = '/mm3/tmp/' + folder;
    var filename = path + "/out" + counts + ".png";
    mkdirp(path, function() {
        fs.writeFile(filename, result, 'base64', function(error) {
            if (error) {
                console.log('error saving photo');
            } else {
                console.log('photo saved');
                counts++;
                callback(counts);
            }
        });
    });
}
I think this is the problem:
app.post('back-end/savephoto', function (req, res) {
    // skipped some lines
});
change it to
app.post('/back-end/savephoto', function (req, res) {
    // skipped some lines
});
In Angular, the below:
$http.post('back-end/savephoto......
Becomes:
$http.post('/back-end/savephoto.....
In Node, the below:
app.post('back-end/savephoto.....
Becomes:
app.post('/back-end/savephoto....
Then, you need to add a console.log under the Node route to see if it is even executed. This will narrow it down. Also, you can pull the $http.post call out of the timeout to eliminate the obvious.
Let me know how you get on.
Shayan
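One more thing worth checking against the 404 above: the failing URL ends in savephotos while the route in the question is /mm3/back-end/savePhoto. Express matches routes case-insensitively by default, but the extra 's' alone is enough to 404. A minimal matching pair might look like this (paths hypothetical; it assumes the page is served from under /mm3/):

// server side (Express): the route must be absolute and match the request path
app.post('/mm3/back-end/savephoto', function (req, res) {
    console.log('savephoto route hit'); // proves the route executes
    res.json({ ok: true });
});

// client side (Angular): relative 'back-end/savephoto' resolves against the
// current page's path, i.e. to '/mm3/back-end/savephoto'
$http.post('back-end/savephoto', $scope.photosToPhp);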

Reading a file line by line, parse them and insert them in mongo in node js

I have a file which is tab separated. It has thousands of rows of data. How can I use Node.js to read the file line by line, parse each line, create an object, and insert it into a MongoDB database?
I am just learning Node and Mongo. I come from a different background, so how can this be done?
Finally, the MongoDB database has to be populated with proper data.
I searched the net but I could not find a complete solution.
Thanks.
I had an issue with the answer by Juvenik. My problem was that the database would not be populated by the time readline had completed. The lines were being read synchronously, but the DB insertion was asynchronous.
Instead, I found a simpler solution with the line-reader package. It reads the lines and waits for a callback before continuing.
var MongoClient = require('mongodb').MongoClient
var dbName = 'yourDbName'
var url = 'mongodb://localhost:27017/' + dbName
var collectionName = 'yourCollectionName'
var filename = 'yourFileName.txt'
var printLine = 1000

MongoClient.connect(url, function(err, db) {
    if (err) {
        console.error('Problem connecting to database')
    } else {
        console.log('Connected correctly to server.')
        var lineReader = require('line-reader')
        var collection = db.collection(collectionName)
        var lineNum = -1
        var headers = []
        lineReader.eachLine(filename, function(line, last, cb) {
            lineNum++
            try {
                var split = line.split('\t')
                var object = {}
                if (lineNum > 0) {
                    for (var i = 0; i < split.length; i += 1) {
                        object[headers[i]] = split[i]
                    }
                    collection.insert(object, function (insertErr, insertObj) {
                        if (insertErr) console.error(insertErr)
                        if (lineNum % printLine === 0) console.log('Line ' + lineNum)
                        if (last) {
                            console.log('Done with ' + filename + ' (' + lineNum + ' records)')
                            process.exit(0)
                        } else {
                            cb()
                        }
                    })
                } else {
                    headers = line.split('\t')
                    cb()
                }
            } catch (lineError) {
                console.error(lineError)
            }
        })
    }
})
I came across a similar problem, and this approach worked for me.
Have a look; it might be helpful.
var mongoDb = require('mongodb');
var mongoClient = mongoDb.MongoClient;
var dbname = 'YOUR_DB_NAME';
var collectionName = 'YOUR_COLLECTION_NAME';
var url = 'mongodb://localhost:27017/' + dbname;
var filename = 'FIle_Name.txt';
console.log('***************Process started');

mongoClient.connect(url, function(err, db) {
    if (err) {
        console.log('error on connection ' + err);
    } else {
        console.log('***************Successfully connected to mongodb');
        var collection = db.collection(collectionName);
        var fs = require('fs');
        var readline = require('readline');
        var stream = require('stream');
        var instream = fs.createReadStream(filename);
        var outstream = new stream;
        var rl = readline.createInterface(instream, outstream);
        console.log('***************Parsing, please wait ...');
        rl.on('line', function(line) {
            try {
                var arr = line.split('\t');
                var object = {};
                //Parse them here
                //Example
                object['name'] = arr[0]; //Just an example
                var res = collection.insert(object);
            } catch (err) {
                console.log(err);
            }
        });
        rl.on('close', function() {
            db.close();
            console.log('***************completed');
        });
    }
});
I am a learner too. If someone can improve this, that would be good.
Here is a more performant (it inserts batches of objects) and updated version (using async and the latest Mongo driver) of frank-0's answer:
const lineReader = require('line-reader');

// `collection` is assumed to be an already-connected MongoDB collection handle
async function readFileAndInsertInMongo(file, collection) {
    let total = 0; // total lines read so far
    return new Promise((resolve, reject) => {
        let buffer = [];
        lineReader.eachLine(file, (line, last, cb) => {
            // prepare your object based on the line content
            let insertObject = {'some_content': 'some_value'};
            buffer.push(insertObject);
            total++;
            if (buffer.length >= 10000 || last) {
                // flush the buffered batch before reading more lines
                collection.insertMany(buffer, function(err, res) {
                    if (err) {
                        return reject(err);
                    }
                    if (last) {
                        resolve(res);
                    } else {
                        buffer = [];
                        cb();
                    }
                });
            } else {
                return cb();
            }
        });
    });
}
This really is the best solution I have found to parse huge files and insert them in the database without exploding Node's memory. Hope this can help ;)
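One thing the snippet leaves implicit is where collection comes from. A usage sketch with the current driver (connection details and names assumed):

const { MongoClient } = require('mongodb');

async function main() {
    const client = new MongoClient('mongodb://localhost:27017');
    await client.connect();
    const collection = client.db('yourDbName').collection('yourCollectionName');
    try {
        await readFileAndInsertInMongo('yourFileName.txt', collection);
        console.log('import finished');
    } finally {
        await client.close();
    }
}

main().catch(console.error);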

Wait for Javascript Web Scraping Function to finish before running for next page?

I am attempting to create a web scraper (in Node.js) that will pull down information from a site and write it to a file. I have it built to work correctly for one page, but when I try to use the function in a for loop to iterate through multiple games, I get bad data in all of the games.
I understand that this is related to Javascript's asynchronous nature, and I have read about callback functions, but I'm not sure I understand how to apply it to my code. Any help would be GREATLY appreciated:
for (x = 4648; x < 4650; x++) { //iterate over a few gameIDs, used in URL for request
    scrapeGame(x);
}

function scrapeGame(gameId) {
    //request from URL, scrape HTML to arrays as necessary
    //write final array to file
}
Essentially, what I am looking to do is, within the for loop, tell it to WAIT until the scrapeGame(x) function finishes before incrementing x and running it for the next game; otherwise, the arrays start to overwrite each other and the data becomes a huge mess.
EDIT: I've now included the full code which I am attempting to run! I'm seeing errors when looking at the files after they are written. For example, the first file is 8 KB, the second ~16, the 3rd ~32, etc. It seems things aren't getting cleared before the next game runs.
The idea of the program is to pull Jeopardy questions/answers from the archive site in order to eventually build a quiz app for myself.
//Iterate over arbitrary number of games, scrape each
for (x = 4648; x < 4650; x++) {
    scrapeGame(x, function(scrapeResult) {
        if (scrapeResult) {
            console.log('Scrape Successful');
        } else {
            console.log('Scrape ERROR');
        }
    });
}

function scrapeGame(gameId, callback) {
    var request = require('request');
    cheerio = require('cheerio');
    fs = require('fs');
    categories = [];
    categorylist = [];
    ids = [];
    clues = [];
    values = ['0', '$200', '$400', '$600', '$800', '$1000', '$400', '$800', '$1200', '$1600', '$2000'];
    valuelist = [];
    answers = [];
    array = [];
    file = [];
    status = false;
    var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
    var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

    request(showGameURL, function(err, resp, body) {
        if (!err && resp.statusCode === 200) {
            var $ = cheerio.load(body);
            //add a row to categories to avoid starting at 0
            categories.push('Category List');
            //pull all categories to use for later
            $('td.category_name').each(function() {
                var category = $(this).text();
                categories.push(category);
            });
            //pull all clue IDs (coordinates), store to 1d array
            //pull any id that has "stuck" in the string, to prevent duplicates
            $("[id*='stuck']").each(function() {
                var id = $(this).attr('id');
                id = id.toString();
                id = id.substring(0, id.length - 6);
                ids.push(id);
                //if single J, pick category 1-6
                if (id.indexOf("_J_") !== -1) {
                    var catid = id.charAt(7);
                    categorylist.push(categories[catid]);
                    var valId = id.charAt(9);
                    valuelist.push(values[valId]);
                }
                //if double J, pick category 7-12
                else if (id.indexOf("_DJ_") !== -1) {
                    var catid = parseInt(id.charAt(8)) + 6;
                    categorylist.push(categories[catid]);
                    var valId = parseInt(id.charAt(10)) + 5;
                    valuelist.push(values[valId]);
                }
                //if final J, pick category 13
                else {
                    categorylist.push(categories[13]);
                }
            });
            //pull all clue texts, store to 1d array
            $('td.clue_text').each(function() {
                var clue = $(this).text();
                clues.push(clue);
            });
            //push pulled values to big array
            array.push(ids);
            array.push(categorylist);
            array.push(valuelist);
            array.push(clues);
            //new request to different URL to pull responses
            request(showAnswerURL, function(err, resp, body) {
                if (!err && resp.statusCode === 200) {
                    var $ = cheerio.load(body);
                    $('.correct_response').each(function() {
                        var answer = $(this).text();
                        answers.push(answer);
                    });
                    //push answers to big array
                    array.push(answers);
                    //combine arrays into 1-d array to prep for writing to file
                    for (var i = 0; i < array[0].length; i++) {
                        var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
                        var stringPrint = print.toString();
                        file.push(stringPrint);
                    }
                    //update string, add newlines, etc.
                    var stringFile = JSON.stringify(file);
                    stringFile = stringFile.split('\\').join('');
                    stringFile = stringFile.split('","').join('\n');
                    //write to file, eventually will append to end of one big file
                    fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
                        if (err) {
                            console.log(err);
                        } else {
                            console.log("Game #" + gameId + " has been scraped.");
                            status = true;
                        }
                    });
                }
            });
        }
    });
    //clear arrays used
    valuelist = [];
    answers = [];
    categories = [];
    categorylist = [];
    ids = [];
    clues = [];
    array = [];
    file = [];
    //feed callback status
    callback(status);
}
// Iterate over a few gameIDs, used in URL for request.
for (x = 4648; x < 4650; x++) {
    // Pass in the callback as an anonymous function.
    // So below I am passing in the id and the function I want to execute,
    // AND defining the results I am expecting as passed in arguments.
    scrapeGame(x, function(scrapeResult, err) {
        // This will *NOT* execute *UNTIL* it is called inside the
        // function below. (Note the for loop itself still starts every
        // iteration immediately; the callback only reports when each
        // scrape finishes.)
        // This function receives the status that is passed in,
        // in this case, a boolean true/false and an error if any.
        if (scrapeResult) {
            // Scrape was true, nothing to do.
            console.log('Scrape Successful');
        } else {
            // Scrape was false, output the error to console.log.
            console.log('Scrape ERROR :: ' + err);
            // Note: `break` is not valid inside a callback function;
            // returning here just stops handling this result.
            return;
        }
    });
}
// This function now accepts two arguments.
function scrapeGame(gameId, callback) {
    // ************************************************
    // ** Do Your Work Here **
    // Request from URL, scrape HTML to arrays as necessary.
    // Write final array to file.
    // After file creation, execute the callback and pass bool
    // status (true/false).
    // ************************************************
    var request = require('request'),
        cheerio = require('cheerio'),
        fs = require('fs'),
        categories = [],
        categorylist = [],
        ids = [],
        clues = [],
        values = [
            '0',
            '$200',
            '$400',
            '$600',
            '$800',
            '$1000',
            '$400',
            '$800',
            '$1200',
            '$1600',
            '$2000'
        ],
        valuelist = [],
        answers = [],
        array = [],
        file = [],
        showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
        showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

    request(showGameURL, function(err, resp, body) {
        if (!err && resp.statusCode === 200) {
            var $ = cheerio.load(body);
            //add a row to categories to avoid starting at 0
            categories.push('Category List');
            //pull all categories to use for later
            $('td.category_name').each(function() {
                var category = $(this).text();
                categories.push(category);
            });
            //pull all clue IDs (coordinates), store to 1d array
            //pull any id that has "stuck" in the string, to prevent duplicates
            $("[id*='stuck']").each(function() {
                var id = $(this).attr('id');
                id = id.toString();
                id = id.substring(0, id.length - 6);
                ids.push(id);
                //if single J, pick category 1-6
                if (id.indexOf("_J_") !== -1) {
                    var catid = id.charAt(7);
                    categorylist.push(categories[catid]);
                    var valId = id.charAt(9);
                    valuelist.push(values[valId]);
                }
                //if double J, pick category 7-12
                else if (id.indexOf("_DJ_") !== -1) {
                    var catid = parseInt(id.charAt(8)) + 6;
                    categorylist.push(categories[catid]);
                    var valId = parseInt(id.charAt(10)) + 5;
                    valuelist.push(values[valId]);
                }
                //if final J, pick category 13
                else {
                    categorylist.push(categories[13]);
                }
            });
            //pull all clue texts, store to 1d array
            $('td.clue_text').each(function() {
                var clue = $(this).text();
                clues.push(clue);
            });
            //push pulled values to big array
            array.push(ids);
            array.push(categorylist);
            array.push(valuelist);
            array.push(clues);
            //new request to different URL to pull responses
            request(showAnswerURL, function(err, resp, body) {
                if (!err && resp.statusCode === 200) {
                    var $ = cheerio.load(body);
                    $('.correct_response').each(function() {
                        var answer = $(this).text();
                        answers.push(answer);
                    });
                    //push answers to big array
                    array.push(answers);
                    //combine arrays into 1-d array to prep for writing to file
                    for (var i = 0; i < array[0].length; i++) {
                        var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
                        var stringPrint = print.toString();
                        file.push(stringPrint);
                    }
                    //update string, add newlines, etc.
                    var stringFile = JSON.stringify(file);
                    stringFile = stringFile.split('\\').join('');
                    stringFile = stringFile.split('","').join('\n');
                    //write to file, eventually will append to end of one big file
                    fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
                        //clear arrays used
                        valuelist = [];
                        answers = [];
                        categories = [];
                        categorylist = [];
                        ids = [];
                        clues = [];
                        array = [];
                        file = [];
                        if (err) {
                            // ******************************************
                            // Callback false with error.
                            callback(false, err);
                            // ******************************************
                        } else {
                            console.log("Game #" + gameId + " has been scraped.");
                            // ******************************************
                            // Callback true with no error.
                            callback(true);
                            // ******************************************
                        }
                    });
                }
            });
        }
    });
}
My assumption is that you want them scraped one after another, not in parallel. So a for loop won't help. The following approach should do the trick:
var x = 4648;

function next() {
    if (x >= 4650) {
        return;
    }
    scrapeGame(x, function() {
        x++;
        next(); // move to the next game only after this one's callback fires
    });
}
next();

function scrapeGame(gameId, cb) {
    //request from URL, scrape HTML to arrays as necessary
    //write final array to file, then call cb()
}
For nested async functions that you want executed serially, you should just forget about the for loop.
An example of correct request handling with the http client:
var http = require('http');

function scrapeGame(gameId, cb) {
    //your code and set options
    http.request(options, function(response) {
        var result = "";
        response.on('data', function(chunk) {
            result += chunk;
        });
        response.on('end', function() {
            //write data here;
            //do the callback
            cb();
        });
    }).end(); // the request is not sent until end() is called
}
I solved the ROOT cause of the issue that I was seeing, though I do believe without the callback assistance from red above, I would have been just as lost.
Turns out the data was being processed correctly, but the file writes were getting scrambled. There is a different method to call instead of writeFile or appendFile:
fs.appendFileSync();
Calling the synchronous version processes the writes IN THE ORDER they are appended to the file, instead of just firing them all off. This, in addition to the callback help above, solved the issue.
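In miniature, the difference (a sketch):

var fs = require('fs');
var lines = ['game 1 data', 'game 2 data', 'game 3 data'];
for (var i = 0; i < lines.length; i++) {
    // blocks until this line has hit the disk, so order is preserved
    fs.appendFileSync('J_GAMES.txt', lines[i] + '\n');
}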
Thanks to everyone for the assistance!
