Asynchronous request for web crawler - javascript

I have an array of URLs from each of which I want to crawl an html table and save it in another array in the same order as the original array.
Due to the asynchronous nature of node I assume it's not working as I expect, the results are in a different order every time.
I googled a lot and tried different things like using a custom async-forEach-function or request-promise instead of request, but nothing worked.
const request = require('request');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
let verbs = [];
let conjugations = [];
fs.readFileSync('verbs.txt', 'utf-8').split(/\r?\n/).forEach
(function(line){
verbs.push(line);
});
verbs.forEach((verb) => {
const URI = encodeURI("https://ru.wiktionary.org/wiki/" + verb);
var options = {
uri: URI,
transform: function (body) {
return cheerio.load(body);
}
};
rp(options).then(function ($) {
let table = $('span#Русский.mw-headline').parent().nextAll('table').first();
conjugations.push(table.text());
console.log(conjugations[0]);
})
.catch(function (err) {
});
})

Use Promise.all if the order is important.
The Promise.all() method returns a single Promise that resolves when all of the promises passed as an iterable have resolved or when the iterable contains no promises. It rejects with the reason of the first promise that rejects.
Example of keeping things in order:
const verbs = ["hello", "world", "example"];
let timeout = 2000;
const promises = verbs.map(verb=>{
timeout -= 500;
return new Promise((resolve,reject)=>{
setTimeout(function(){
resolve(verb);
}, timeout);
});
});
Promise.all(promises).then(dataArray=>console.log(dataArray));
Solution with your code.
const promises = verbs.map((verb) => {
const URI = encodeURI("https://ru.wiktionary.org/wiki/" + verb);
var options = {
uri: URI,
transform: function(body) {
return cheerio.load(body);
}
};
return rp(options);
})
Promise.all(promises).then(dataArray=>{
dataArray.forEach(function($) {
let table = $('span#Русский.mw-headline').parent().nextAll('table').first();
conjugations.push(table.text());
console.log(conjugations[0]);
})
}).catch(function(err) {});
Downside, if one request fails they all fail.
Alternatively, you could do something like this by using the index of each verb (Using Promise.all to determine when everything is done but that step can be ignored...)
const verbs = ["hello", "world", "example"];
const conjugations = [];
let timeout = 2000;
const promises = verbs.map((verb, index)=>{
return new Promise((resolve, reject)=>{
setTimeout(function(){
conjugations[index] = verb;
resolve();
}, timeout);
timeout -= 500;
});
});
Promise.all(promises).then(()=>console.log(conjugations));
Example with your code.
const request = require('request');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
let verbs = [];
let conjugations = [];
fs.readFileSync('verbs.txt', 'utf-8').split(/\r?\n/).forEach(function(line) {
verbs.push(line);
});
verbs.forEach((verb, index) => {
const URI = encodeURI("https://ru.wiktionary.org/wiki/" + verb);
var options = {
uri: URI,
transform: function(body) {
return cheerio.load(body);
}
};
rp(options).then(function($) {
let table = $('span#Русский.mw-headline').parent().nextAll('table').first();
conjugations[index] = table.text();
console.log(conjugations[index]);
})
.catch(function(err) {});

Related

Can't download multiple images from array using axios

const Fs = require('fs')
const Path = require('path')
const Axios = require('axios')
var dir = './tmp';
async function downloadImage () {
if (!Fs.existsSync(dir)){
Fs.mkdirSync(dir);
}
var arr = ['https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/01-copy.jpg','https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/02-copy.jpg','https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/03-copy.jpg']
for(var i=0;i<arr.length;i++){
var url = arr[i]
var name = i + '.jpg'
var path = Path.resolve(__dirname,dir, name)
var writer = Fs.createWriteStream(path)
var response = await Axios({
url,
method: 'GET',
responseType: 'stream'
})
response.data.pipe(writer)
return new Promise((resolve, reject) => {
writer.on('finish', resolve)
writer.on('error', reject)
})
}
}
downloadImage()
This is the above code I am using to download images when I tried downloading multiple images whose links are in array it only downloads the image of first array i.e. arr[0] and can't figure out what's the problem it doesn't give any error to and i can individually download single images but not bulky.random manga generator
So, I played around with this code snippet and I was able to fix by removing the promise callback at the bottom of your for loop. You were right, it would only complete one GET request, and then terminate. It now runs through all three, and saves it in your ./tmp directory.const
Fs = require("fs");
const Path = require("path");
const Axios = require("axios");
var dir = "./tmp";
async function downloadImage() {
if (!Fs.existsSync(dir)) {
Fs.mkdirSync(dir);
}
var arr = [
"https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/01-copy.jpg",
"https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/02-copy.jpg",
"https://reaperscans.com/wp-content/uploads/WP-manga/data/manga_6295b8da2aa90/5461fc34b58cd174c806625056c6e0dc/03-copy.jpg"
];
for (var i = 0; i < arr.length; i++) {
var url = arr[i];
// for debugging
console.log(arr[i])
var name = i + ".jpg";
var path = Path.resolve(__dirname, dir, name);
var writer = Fs.createWriteStream(path);
var response = await Axios({
url,
method: "GET",
responseType: "stream"
});
// for debugging
console.log(response)
}
}
downloadImage();

Node JS function that return https get request final edited data

Hello everybody I have a problem with the Node JS function that I want it to return https get request final edited data, I know there are a lot of solutions for this async problem but I tried them all and still can't figure out what is wrong with my code?
here is my function without any other solutions editing:
function getMovie(apiKey, gen) {
const baseUrl = "https://api.themoviedb.org/3/discover/movie?api_key=" + apiKey + "&language=en-US&include_adult=false&include_video=false&page=1&with_genres=" + gen;
https.get(baseUrl, function (responce) {
console.log(responce.statusCode);
var d = "";
responce.on("data", function (data) {
d += data;
});
responce.on("end", () => {
const finalData = [];
const moviesData = JSON.parse(d);
const result = moviesData.results;
const maxx = result.length;
const rand = Math.floor(Math.random() * maxx);
const title = result[rand].title;
const rDate = result[rand].release_date;
const overview = result[rand].overview;
const imageRoot = result[rand].poster_path;
const movieId = result[rand].id;
const movieRating = result[rand].vote_average;
// here will push those variables to finalData array
// then return it
return finalData;
});
}).on('error', (e) => {
console.error(e);
});
}
and want after this finalData returns:
const finalResult = getMovie(apiKey, genre);
it always returns undefined, How can I fix this? please anyone ca help me with this problem
thanks in advance.
I solved this problem using promises using this code:
const rp = require('request-promise');
function getMovie(url) {
// returns a promise
return rp(url).then(body => {
// make the count be the resolved value of the promise
let responseJSON = JSON.parse(body);
return responseJSON.results.count;
});
}
getMovie(someURL).then(result => {
// use the result in here
console.log(`Got result = ${result}`);
}).catch(err => {
console.log('Got error from getMovie ', err);
});

Pagination in Zapier

I am trying following code to get all records from a paginated API in Zapier.
const limitPerPage = 20;
const apiUrl = "https://myurl.com/data";
var lastCursor = null;
var output = null;
const getContent = async function (cursor) {
let actualUrl = apiUrl + `?cursor=${cursor}&limit=${limitPerPage}`;
var apiResults = await fetch(actualUrl)
.then(resp => {
return resp.json;
});
}
const getEntireContentList = async function (cursor) {
const results = await getContent(cursor);
console.log("Retreiving data from API for cursor : " + cursor);
if (results.metadata.cursor !== "") {
return results.concat(await getEntireContentList(results.metadata.cursor));
} else {
return results;
}
};
(async() => {
const entireList = await getEntireContentList();
console.log(entireList);
output = entireList;
callback(null, entireList);
})();
I get error as
You did not define output! Try output = {id: 1, hello: await Promise.resolve("world")};
How can I fix this?
Your problem is that though you're awaiting in that function, the top-level carries on and execution ends before your code has had a chance to run.
The good news is, Zapier wraps your code in an async function already, so you can use await at the top level (per these docs).
Try this instead:
const limitPerPage = 20;
const apiUrl = "https://myurl.com/data";
let lastCursor = null;
// var output = null; // zapier does this for you already
const getContent = async function (cursor) {
const actualUrl = apiUrl + `?cursor=${cursor}&limit=${limitPerPage}`;
const rawResponse = await fetch(actualUrl)
return resp.json() // async function, you had it as a property
}
const getEntireContentList = async function (cursor) {
const results = await getContent(cursor);
console.log("Retreiving data from API for cursor : " + cursor);
if (results.metadata.cursor !== "") {
return results.concat(await getEntireUserList(results.metadata.cursor)); // should this be named getEntireContentList?
} else {
return results;
}
};
return {
results: await getEntireContentList()
}
I noticed this is a recursive approach. That's fine, but remember that you've got limited execution time. You also might hit memory limits (depending on how many objects you're returning), so keep an eye on that.

module.exports is returning undefined

I am currently new to Node JS, and today I was trying to read data from a file data.json.
Here is the JSON file:
{"username":"rahul_v7","password":"9673"} {"username":"7vik","password":"3248"} {"username":"pradypot_2","password":"6824"} {"username":"ad_1","password":"9284"} {"username":"premchand_4","password":"4346"}
And, I was using the below code present in a file GetData.js, to read the data present in the data.json:
'use strict';
const fs = require('fs');
let res = '', resObjs = [];
let fin = fs.createReadStream('F:/RahulVerma/NodeJS/data.json', 'utf-8');
fin.on('data', data => {
if(data.length > 0) res += data;
}).on('end', () => {
if(res.length > 0) {
let resArr = res.trim().split(' ');
for(let i = 0; i < resArr.length; i++) {
resObjs.push(JSON.parse(resArr[i]));
}
module.exports.objects = resObjs;
}
});
As you can see, I am exporting the resObjs array, which is actually an array of objects, to an another file named AppendData.js, which is given below:
'use strict';
const fs = require('fs');
const getObjs = require('./GetData');
console.log(getObjs.objects);
But, when I run AppendData.js in Node.js 9.3.0 (ia32), it gives the following output:
You're trying to use the objects before they've been read. Remember that your code reading the stream runs asynchronously, and nothing in your code attempts to coordinate it with module loading. So AppendData.js isn't seeing the objects export because it doesn't exist yet as of when that code runs.
Instead, return a promise of the objects that AppendData.js can consume; see *** comments:
'use strict';
const fs = require('fs');
// *** Export the promise
module.exports.objectsPromise = new Promise((resolve, reject) => {
let res = '', resObjs = [];
let fin = fs.createReadStream('F:/RahulVerma/NodeJS/data.json', 'utf-8');
fin.on('data', data => {
if(data.length > 0) res += data;
}).on('end', () => {
if(res.length > 0) {
let resArr = res.trim().split(' ');
for(let i = 0; i < resArr.length; i++) {
resObjs.push(JSON.parse(resArr[i]));
}
resolve(resObjs); // *** Resolve the promise
}
}).on('error', error => {
reject(error); // *** Reject the promise
});
});
Note I added a handler for errors.
And then:
'use strict';
const fs = require('fs');
const getObjs = require('./GetData');
getObjs.objectsPromise
.then(console.log)
.catch(error => {
// Do something
});
Again note the error handler.
The problem happens because you're trying to use the objects in AppendData.js before they are loaded on GetData.js due to fs.createReadStream being asynchronous. To fix this just make module.exports be a function that expect a callback in GetData.js like:
'use strict';
const fs = require('fs');
module.exports = function(callback) {
let res = '', resObjs = [];
let fin = fs.createReadStream('F:/RahulVerma/NodeJS/data.json', 'utf-8');
fin.on('data', data => {
if(data.length > 0) res += data;
}).on('end', () => {
if(res.length > 0) {
let resArr = res.trim().split(' ');
for(let i = 0; i < resArr.length; i++) {
resObjs.push(JSON.parse(resArr[i]));
}
callback(resObjs); // call the callback with the array of results
}
});
}
Which you can then use like this in AppendData.js:
'use strict';
const fs = require('fs');
const getObjs = require('./GetData'); // getObjs is now a function
getObjs(function(objects) {
console.log(objects);
});

NodeJS Waiting for asynchronous function to complete foreach

Hi so i have been trying to make this tried using async module didn't really know how to convert this to one tried promising it didn't really work well i think i did it wrong so i reverted the function to the way it was at first
Basically i want to wait till the ReadJson() function is done with reading all the json files that are in the array then do other functions like editjson etc
Code:
App.js
const Reader = require('./Reader');
Reader.ReadJson();
Reader.js
const fsp = require('fs-promise');
const JsonFiles = ['json1.json', 'json2.json', 'json3.json', 'json4.json'];
const JsonContents = [];
class Reader {
static ReadJson() {
JsonFiles.forEach(name => {
let FileDir = "D:\\Development\\Java\\" + name;
fsp.readJson(FileDir).then(contents => {
if (contents) {
JsonContents.push(contents);
console.log(`Loaded >> ${name} ${Reader.JsonContents.length}/${JsonFiles.length}`);
}
});
});
console.log('Done Reading Json Content!');
//Other functions
}
}
Reader.JsonContents = JsonContents;
module.exports = Reader;
So basically the output is:
Done Reading Json Content!
Loaded >> json1.json 1/4
Loaded >> json2.json 2/4
Loaded >> json3.json 3/4
Loaded >> json4.json 4/4
When i need it to be:
Loaded >> json1.json 1/4
Loaded >> json2.json 2/4
Loaded >> json3.json 3/4
Loaded >> json4.json 4/4
Done Reading Json Content!
Thank you :)
Return a promise, track your progress in the forEach and resolve it only when JsonContents length is the same as JsonFiles length.
const fsp = require('fs-promise');
const JsonFiles = ['json1.json', 'json2.json', 'json3.json', 'json4.json'];
const JsonContents = [];
class Reader {
static ReadJson() {
return new Promise((resolve, reject) => {
JsonFiles.forEach(name => {
let FileDir = "D:\\Development\\Java\\" + name;
fsp.readJson(FileDir).then(contents => {
if (contents) {
JsonContents.push(contents);
console.log(`Loaded >> ${name} ${Reader.JsonContents.length}/${JsonFiles.length}`);
}
if (JsonContents.length == JsonFile.length) {
return resolve(JsonContents);
}
}).catch(err => {
return reject(err);
});
});
});
}
}
Reader.JsonContents = JsonContents;
module.exports = Reader;
And then use it in your app:
const Reader = require('./Reader');
Reader.ReadJson().then(() => { console.log('Done Reading Json Content!'); });
Another option is using Promise.all, because you are using fs-promise, but although it can be done with forEach, a regular for loop is better here.
const fsp = require('fs-promise');
const JsonFiles = ['json1.json', 'json2.json', 'json3.json', 'json4.json'];
const JsonContents = [];
class Reader {
static ReadJson() {
var promises = [];
for (let i = 0; i < JsonFiles.length; i++) {
let FileDir = "D:\\Development\\Java\\" + JsonFiles[i];
promises.push(fsp.readJson(FileDir).then(contents => {
if (contents) {
JsonContents.push(contents);
console.log(`Loaded >> ${JsonFiles[i]} ${Reader.JsonContents.length}/${JsonFiles.length}`);
}
}));
}
return Promise.all(promises);
}
}
Reader.JsonContents = JsonContents;
module.exports = Reader;
As an addendum to Ron Dadon's Promise.all method....
The Bluebird promise library provides some helper functions like Promise.map and Promise.filter that can remove a lot of the boiler plate of Promise array processing code.
const Promise = require('bluebird');
const fsp = require('fs-promise');
const path = require('path');
class Reader {
static readFiles(jsonPath, jsonFiles){
let fileReadCount = 0;
return Promise.map(jsonFiles, name => {
let filePath = path.join(jsonPath, name);
return fsp.readJson(filePath);
})
.filter((content, index, length) => {
if (!content) return false;
console.log(`Loaded >> ${jsonFiles[index]} ${index+1} / ${length}`);
return true;
})
}
static readJson() {
return this.readFiles(this.jsonPath, this.jsonFiles).then(contents => {
console.log('Done Reading Json Content!', contents);
return this.jsonContents = contents;
})
}
}
Reader.jsonFiles = ['json1.json', 'json2.json', 'json3.json', 'json4.json'];
Reader.jsonPath = 'D:\\Development\\Java';
module.exports = Reader;

Categories

Resources