Downloading images off a website with node.js - javascript

I'm trying to download every image on a website using a node script.
I wrote it and for the most part it seems to be working; however, it only ever downloads the first image. It saves that same image once for every image on the site.
Here is my code.
const http = require('http'),
cheerio = require('cheerio'),
fs = require('fs');
var document = '';
var imageData = '';
// Fetch the page, buffer its HTML, then start a download for every <img> that has a src attribute.
http.get('http://www.wikihow.com/Start-a-Blog', function(res){
res.on('data', function(chunk){
// Append each chunk of the response body to the module-level `document` string.
document+=chunk;
})
res.on('end', function(){
let $ = cheerio.load(document);
var array = [];
var array = $("img").toArray();
var data = [];
// Keep only the src of the elements that actually have one.
array.forEach(function (ele) {
if (ele.attribs.src !== undefined)
data.push(ele.attribs.src);
})
// counter numbers the downloads; it is passed to ripImage as its second argument.
var counter = 0;
data.forEach(function (ele) {
ripImage(ele, counter);
counter ++;
})
})
});
// Downloads the image at URL `ele` and writes it to dump/file<counter>.jpg.
function ripImage(ele, counter){
http.get(ele, function(res){
console.log(res);
res.setEncoding('binary')
res.on('data', function(chunk){
// NOTE(review): imageData is the single module-level string declared at the top of the
// script; it is never reset in this function, so every call appends to the same string.
imageData += chunk;
})
res.on('end', function(){
//console.log(ele);
// Writes whatever has accumulated in the shared imageData string, using the 'binary' encoding.
fs.writeFile("dump/file" + counter + ".jpg", imageData, 'binary', function(err){
if (err) throw err
//console.log('File saved.')
});
//res.pipe(file);
})
});
}
I think the problem lies somewhere in the ripImage() function. If you guys can see the problem, and help me fix it, that'd be really appreciated.
Thanks guys.

@Mr.Phoenix is right, the async library is meant for this type of thing. It allows you to iterate over a collection with an asynchronous function, and fire a callback when all of the async functions have completed. Working code:
const http = require('http')
const cheerio = require('cheerio')
const fs = require('fs')
const async = require('async')
let document = ''
// Fetch the page HTML, collect the src of every <img>, then download each one through ripImage.
http.get('http://www.wikihow.com/Start-a-Blog', function (res) {
  res.on('data', function (chunk) {
    document += chunk
  })
  res.on('end', function () {
    const $ = cheerio.load(document)
    // Gather the truthy src attributes in document order.
    const sources = []
    for (const img of $('img').toArray()) {
      const src = img.attribs.src
      if (src) sources.push(src)
    }
    // ripImage is invoked as (src, index, callback); the last function runs when every
    // download has called back, or as soon as one of them reports an error.
    async.eachOf(sources, ripImage, function (err) {
      if (err) throw err
      console.log('all done!')
    })
  })
})
/**
 * Download one image and save it as ./dump/file<i>.jpg.
 *
 * @param {string} ele - URL of the image to fetch.
 * @param {number|string} i - index used to build the output file name.
 * @param {function(?Error=)} callback - called exactly once: with an error if the
 *   request, the response stream or the file write fails, otherwise with the
 *   (null) result of fs.writeFile.
 */
function ripImage (ele, i, callback) {
  // Several failure events can arrive for one request; make sure `callback` fires only once.
  let finished = false
  const done = (err) => {
    if (finished) return
    finished = true
    callback(err)
  }

  let req
  try {
    req = http.get(ele, (res) => {
      // Keep the raw Buffer chunks instead of round-tripping the bytes through a 'binary' string.
      const chunks = []
      res.on('data', (chunk) => {
        chunks.push(chunk)
      })
      res.on('error', done)
      res.on('end', () => {
        fs.writeFile('./dump/file' + i + '.jpg', Buffer.concat(chunks), done)
      })
    })
  } catch (err) {
    // http.get throws synchronously for URLs it cannot parse (e.g. a relative src).
    done(err)
    return
  }
  // Without this handler a failed request would never call back, so the caller would wait forever.
  req.on('error', done)
}

Related

How to return data from http.get to parent function in nodejs

Task is to return data from getData function to main function.
// Requests the movies endpoint, parses the JSON body and logs it.
// NOTE(review): this function contains no return statement of its own, so a caller
// such as console.log(getData()) receives undefined.
function getData(){
const https = require('https')
const url = "https://...../api/movies";
https.get(url, res => {
let data = '';
// Collect the response body chunk by chunk.
res.on('data', chunk => {
data += chunk;
});
res.on('end', () => {
data = JSON.parse(data);
console.log(data);
//How can I return this data to main function?
})
}).on('error', err => {
// Request failures are only logged.
console.log(err.message);
})
}
function main(){
console.log(getData());
}
I am not able to access data or print data in main function
I think you already have the answer in your comment '//How can I return this data to main function?'.
...
res.on('end', () => {
data = JSON.parse(data);
console.log(data);
return data;
})
...
So the return-value of your getData-function should now be the parsed json-data and be accessible in the main-function.
I would create a variable to save value which you want to return
const https = require('https')
// Starts the request and returns the current value of `ret`.
// NOTE(review): `ret` is only assigned inside the 'end' handler, which runs after the
// response has been received; `return ret` executes right after https.get() has been
// called, so callers get the initial null rather than the parsed data.
function getData() {
let ret = null;
const url = "https://...../api/movies";
https.get(url, res => {
let data = '';
res.on('data', chunk => {
data += chunk;
});
res.on('end', () => {
ret = JSON.parse(data);
})
}).on('error', err => {
console.log(err.message);
})
return ret;
}
function main() {
console.log(getData());
}
What you could do is wrap your request in a Promise. You would return this promise and wait for it to be fulfilled:
/**
 * Request the movies endpoint and resolve with the parsed JSON body.
 *
 * @returns {Promise<*>} resolves with the value produced by JSON.parse; rejects
 *   with the request/response error, or with the SyntaxError thrown by
 *   JSON.parse when the body is not valid JSON.
 */
function getData() {
  const url = "https://...../api/movies";
  return new Promise((resolve, reject) => {
    const req = https.get(url, (res) => {
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        // JSON.parse throws on a malformed body. Inside an event handler that exception
        // would escape the promise and leave it pending forever, so convert it to a rejection.
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(err);
        }
      });
    });
    req.on('error', reject);
    // No req.end() here: https.get() ends the request itself.
  });
}
function main() {
getData().then(data => {
console.log("Response:", data);
}, error => {
console.error(error);
});
}
Moreover you can make your main function async and use await to get the response:
async function main() {
const data = await getData();
console.log(data);
}
// Start an IIFE to use `await` at the top level. See: https://stackoverflow.com/a/14220323/7661119
(async () => {
await main()
})();

I wanna send string that is from s3 object to a client

I have a problem sending a string that comes from an S3 object body to a client.
I'm using the AWS SDK for Node, Apollo Server (Express), Express and React.
I got the object from S3 and created a readable stream, and then I listen for the data event so that I can send the string to a client.
let data = '';
s3.getObject(params).createReadStream().on('data', function(chunk) {
data += chunk;
});
return { data }
I expected data not to be an empty string, but it is an empty string.
What can I do to solve the problem?
Edit:
// Holds the object body as text. It lives at module level, so it keeps growing
// across calls to promiseBasedRequest.
let data = '';

/**
 * Read the S3 object described by `params` as a stream.
 * Resolves with the accumulated `data` string when the stream ends and
 * rejects with the stream's error.
 */
function promiseBasedRequest (params) {
  return new Promise((resolve, reject) => {
    const bodyStream = s3.getObject(params).createReadStream();
    bodyStream.on('data', (piece) => {
      data += piece;
    });
    bodyStream.on('end', () => resolve(data));
    bodyStream.on('error', (failure) => reject(failure));
  });
}
await promiseBasedRequest(params);
This works as I intended.
You are not waiting for the stream to finish being read. First, the function needs to be asynchronous: it has to return a promise or accept a callback.
/**
 * Read an S3 object as text.
 *
 * @param {object} params - passed straight to s3.getObject().
 * @returns {Promise<string>} resolves with the concatenated stream chunks once
 *   the stream ends; rejects with the error emitted by the stream.
 */
function getData(params) {
  return new Promise((res, rej) => {
    // One accumulator per call (the original also declared an unused outer `data`).
    let data = '';
    s3.getObject(params).createReadStream()
      .on('data', function (chunk) {
        data += chunk;
      })
      .on('end', function () {
        res(data);
      })
      .on('error', function (err) {
        // Forward the error so callers can see why the read failed.
        rej(err);
      });
  });
}
You can use the function by:
// Wrap the call in an async arrow-function IIFE so `await` can be used from a plain script.
// (`(async(){` is a syntax error; the arrow `=>` is required.)
(async () => {
  // getData needs the same params object that is handed to s3.getObject.
  const data = await getData(params)
})();
Or getData().then(..)
EDIT: Also, getObject has one promise method as well.
s3.getObject(params).promise().then(...).catch(...)

Nested async queries

//express is the framework we're going to use to handle requests
const express = require('express');
//Create a new instance of express
const app = express();
const FormData = require("form-data");
const bodyParser = require("body-parser");
const http = require('http');
const async = require('async');
//This allows parsing of the body of POST requests, that are encoded in JSON
app.use(bodyParser.json());
var router = express.Router();
//AccuWeather API key
const weatherKey = process.env.WEATHER_KEY_TWO;
cityCode = ""; //City code
cityName = "";
//Current Conditions Vars
var ccWeatherText = ""; //Text for weather at location
var ccTemp = 0; //Degrees Farenheit
var ccIcon = 0; //weather icon number https://developer.accuweather.com/weather-icons
var ccURL = "test"; //URL for get
var hourlyData = [];
var fiveDayData = [];
router.post('/', (req, res) => {
let lat = req.body['lat'];
let lon = req.body['lon'];
var latLongCityCodeURL = ("http://dataservice.accuweather.com/locations/v1/cities/geoposition/search?apikey=" + weatherKey + "&q=" + lat + "," + lon);
//Get city code
//Fetch `url`, parse the JSON response and resolve with { code, name } taken from
//the body's Key and EnglishName fields.
//Rejects on a request error, on invalid JSON, or when the parsed body has no fields to read.
const httpGet = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => body += chunk);
            res.on('end', () => {
                //Parse and extract inside one try block: a failure at either step becomes a
                //rejection, and nothing runs after a failed parse.
                try {
                    const parsed = JSON.parse(body);
                    resolve({
                        code: parsed.Key,
                        name: parsed.EnglishName
                    });
                } catch (err) {
                    //Pass the original error on so its type and stack are preserved.
                    reject(err);
                }
            });
        }).on('error', reject);
    });
};
//Current Conditions
//Fetch `url`, parse the JSON response and resolve with { text, temp, icon } read from
//the first element of the body.
//Rejects on a request error, on invalid JSON, or when the body does not have that shape.
const ccGet = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => body += chunk);
            res.on('end', () => {
                //Keep the field access inside the try block: reading body[0].Temperature.Imperial
                //off an unparsed or unexpected body throws, and an exception thrown inside this
                //event handler would otherwise be uncaught.
                try {
                    const current = JSON.parse(body)[0];
                    resolve({
                        text: current.WeatherText,
                        temp: current.Temperature.Imperial.Value,
                        icon: current.WeatherIcon
                    });
                } catch (err) {
                    //Pass the original error on so its type and stack are preserved.
                    reject(err);
                }
            });
        }).on('error', reject);
    });
};
//12 hour
//Fetch `url`, parse the JSON response and resolve with { body: <parsed JSON> }.
//Rejects on a request error or when the response is not valid JSON.
const twelveGet = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => body += chunk);
            res.on('end', () => {
                let parsed;
                try {
                    parsed = JSON.parse(body);
                } catch (err) {
                    //Reject with the original SyntaxError and stop; previously the code carried
                    //on to resolve() with the unparsed text after rejecting.
                    return reject(err);
                }
                resolve({
                    body: parsed
                });
            });
        }).on('error', reject);
    });
};
//5 day
//Fetch `url`, parse the JSON response and resolve with { body: <parsed JSON> }.
//Rejects on a request error or when the response is not valid JSON.
const fiveGet = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => body += chunk);
            res.on('end', () => {
                let parsed;
                try {
                    parsed = JSON.parse(body);
                } catch (err) {
                    //Reject with the original SyntaxError and stop; previously the code carried
                    //on to resolve() with the unparsed text after rejecting.
                    return reject(err);
                }
                resolve({
                    body: parsed
                });
            });
        }).on('error', reject);
    });
};
//Get city code from lat lon
//Look up the city for the posted coordinates, then request current conditions and, from
//inside that callback, the 12 hour and 5 day forecasts.
httpGet(latLongCityCodeURL).then(data => {
//cityCode, cityName and the URLs below are assigned without a local declaration, so they
//are shared by every request handled by this router.
cityCode = data.code;
cityName = data.name;
ccURL = ("http://dataservice.accuweather.com/currentconditions/v1/" + cityCode + "?apikey=" + weatherKey);
twelveURL = ("http://dataservice.accuweather.com/forecasts/v1/hourly/12hour/" + cityCode + "?apikey=" + weatherKey);
fiveURL = ("http://dataservice.accuweather.com/forecasts/v1/daily/5day/" + cityCode + "?apikey=" + weatherKey);
//Get Current Conditions
ccGet(ccURL).then(dataCC => {
ccTemp = dataCC.temp;
ccWeatherText = dataCC.text;
ccIcon = dataCC.icon;
//Get 12 hour forecast
//NOTE(review): this promise is neither returned nor awaited, so the fiveGet call below
//starts immediately and res.send() does not wait for hourlyData to be filled.
twelveGet(twelveURL).then(dataTwelve => {
//Generate hourly data
//NOTE(review): twelveGet resolves with { body: ... }, so dataTwelve.length is undefined and
//this loop body never runs; presumably the array is dataTwelve.body - confirm.
for (i = 0; i < dataTwelve.length; i++) {
hourlyData[i] = {
time: dataTwelve[i].EpochDateTime,
temp: dataTwelve[i].Temperature.Value,
text: dataTwelve[i].IconPhrase,
icon: dataTwelve[i].WeatherIcon
};
}
console.log("Hourly Data: " + hourlyData);
}).catch(err => console.log(err));
fiveGet(fiveURL).then(dataFive => {
//Generate five day data
//NOTE(review): fiveGet also resolves with { body: ... }, so dataFive.length is undefined
//here as well; where the forecast array sits inside the body needs confirming.
for (i = 0; i < dataFive.length; i++) {
fiveDayData[i] = {
time: dataFive[i].EpochDate,
min: dataFive[i].Temperature.Minimum.Value,
max: dataFive[i].Temperature.Maximum.Value,
iconDay: dataFive[i].Day.Icon,
iconNight: dataFive[i].Night.Icon,
dayPhrase: dataFive[i].Day.IconPhrase,
nightPhrase: dataFive[i].Night.IconPhrase
};
console.log("5 Day Data:" + fiveDayData);
}
//The response is sent from the 5 day callback only.
res.send({
success: true,
cityName: cityName,
cityCode: cityCode,
currentConditions: {
temp: ccTemp,
icon: ccIcon,
text: ccWeatherText
},
hourlyData: hourlyData,
fiveDayData: fiveDayData
});
//Each catch below only logs; none of them sends a response to the client.
}).catch(err => console.log(err));
}).catch(err => console.log(err));
}).catch(err => console.log('Got error ', err));
});
module.exports = router;
Ok so right now I'm creating an endpoint in NodeJS that is POST method which gets the arguments for latitude and longitude. When it gets those it makes calls to Accuweather's API. I got all the accuweather stuff working and returning proper results, but then I cut and pasted that code into my POST method router.post... and now it isn't working. I know it is an ASYNC issue, and I am just getting really lost with async, since I have like 3 or 4 nested async calls inside the router.post, which is another async call. So I'm thinking there is some way to maybe wrap the router.post into its own async call, which waits on the weather calls before returning results?
My end goal: For the user to send a POST with lat and lon, my code does all the weather calls, and returns the data for the POST.
What you want to do is called promise chaining:
new Promise(function(resolve, reject) {
setTimeout(() => resolve(1), 1000); // (*)
}).then(function(result) { // (**)
alert(result); // 1
return result * 2;
}).then(function(result) { // (***)
alert(result); // 2
return result * 2;
}).then(function(result) {
alert(result); // 4
return result * 2;
});
Also, check out MDN's page on using promises

Save JSON data from Node.js Get request to global variable or file

I need to save data from GET requests to a variable and then save it in a file. However, in some cases GET request does not save data to global variables.
var fs = require("fs");
var http = require("http");
var request = require('request');
var tmp_json = {};
var g_last = 0;
var data = {};
//request 1
http.get('server:api', (resp) => {
let data = '';
resp.on('data', (chunk) => {
data += chunk;
});
resp.on('end', () => {
tmp_json.server1 = {};
tmp_json.server1 = JSON.parse(data);
g_last = tmp_json.height; // 100500
console.log(g_last); // 100500
});
}).on("error", (err) => {
console.log("Error: " + err.message);
});
//request 2
http.get('server2:api', (resp) => {
let data = '';
resp.on('data', (chunk) => {
data += chunk;
});
resp.on('end', () => {
tmp_json.server2 = {};
tmp_json.server2 = JSON.parse(data);
g_last = tmp_json.height; // 256
console.log(g_last); // 256
});
}).on("error", (err) => {
console.log("Error: " + err.message);
});
// These statements run synchronously, right after both requests have been started and
// before either 'end' handler has assigned anything, so g_last and tmp_json still hold
// their initial values here.
console.log(g_last); // 0
data = JSON.stringify(tmp_json);
fs.writeFile('data.json', data, 'utf8'); // empty file
Also, I tried to do it with fs.createWriteStream, but again I can only save one request to the file; if there is more than one, I only get buffer data.
Your problem is that request1 and request2 are happening while you are writing the file. This is because of the async nature of node. The execution order looks something like this:
Declare empty variables
Request1 goes out
Request2 goes out
Write empty variables to file
Request1 comes back and writes to variables
Request2 comes back and writes to variables
One way to fix this would be with Promises. The following allows for the function in then to be executed after the promises in Promise.all([ ... ]) have resolved:
var fs = require("fs");
var http = require("http");
var tmp_json = {};
var g_last = 0;
var data = {};
//request 1
// Resolves once the first server's JSON response has been stored in tmp_json.server1;
// rejects if the request fails or the body cannot be parsed.
var req1 = new Promise((resolve, reject) => {
  http.get('server:api', (resp) => {
    let data = '';
    resp.on('data', (chunk) => {
      data += chunk;
    });
    resp.on('end', () => {
      // A throw inside this handler would leave the promise pending, so turn it into a rejection.
      try {
        tmp_json.server1 = JSON.parse(data);
        // The response is stored under server1, so height has to be read from there
        // (tmp_json.height itself is never set).
        g_last = tmp_json.server1.height; // 100500
        console.log(g_last); // 100500
        resolve();
      } catch (err) {
        reject(err);
      }
    });
  }).on("error", (err) => {
    console.log("Error: " + err.message);
    // The handler's parameter is `err`; `error` is not defined here and would throw a ReferenceError.
    reject(err);
  });
});
//request 2
// Resolves once the second server's JSON response has been stored in tmp_json.server2;
// rejects if the request fails or the body cannot be parsed.
var req2 = new Promise((resolve, reject) => {
  http.get('server2:api', (resp) => {
    let data = '';
    resp.on('data', (chunk) => {
      data += chunk;
    });
    resp.on('end', () => {
      // A throw inside this handler would leave the promise pending, so turn it into a rejection.
      try {
        tmp_json.server2 = JSON.parse(data);
        // The response is stored under server2, so height has to be read from there
        // (tmp_json.height itself is never set).
        g_last = tmp_json.server2.height; // 256
        console.log(g_last); // 256
        resolve();
      } catch (err) {
        reject(err);
      }
    });
  }).on("error", (err) => {
    console.log("Error: " + err.message);
    // The handler's parameter is `err`; `error` is not defined here and would throw a ReferenceError.
    reject(err);
  });
});
// Write the collected responses to data.json once both requests have finished.
Promise.all([ req1, req2 ]).then(() => {
  console.log(g_last);
  data = JSON.stringify(tmp_json);
  // fs.writeFile requires a completion callback; report a failed write instead of ignoring it.
  fs.writeFile('data.json', data, 'utf8', (err) => {
    if (err) console.log("Error: " + err.message);
  });
}).catch((err) => {
  // Either request rejected: nothing is written, and the rejection is not left unhandled.
  console.log("Error: " + err.message);
});
Edit:
/**
 * GET `url`, parse the JSON body and store it in tmp_json.
 *
 * @param {string} url - address to request.
 * @param {string} [key='server1'] - property of tmp_json that receives the parsed
 *   body; pass a different key per server so the responses do not overwrite each other.
 * @returns {Promise<*>} resolves with the parsed body; rejects if the request
 *   fails or the body cannot be parsed.
 */
function handleGet (url, key = 'server1') {
  return new Promise((resolve, reject) => {
    http.get(url, (resp) => {
      let data = '';
      resp.on('data', (chunk) => {
        data += chunk;
      });
      resp.on('end', () => {
        // A throw inside this handler would leave the promise pending, so turn it into a rejection.
        try {
          const parsed = JSON.parse(data);
          tmp_json[key] = parsed;
          g_last = parsed.height;
          console.log(g_last);
          resolve(parsed);
        } catch (err) {
          reject(err);
        }
      });
    }).on("error", (err) => {
      console.log("Error: " + err.message);
      // The handler's parameter is `err`; `error` is not defined here and would throw a ReferenceError.
      reject(err);
    });
  })
}
// Then use it
Promise.all([
handleGet('http://google.ca'),
handleGet('http://somesite.com')
])

Node JS for loop and array push

I have 1,211,434 IP addresses that need to be converted into geolocations. I found an API that does this through a GET request. The problem is that, when using a for loop, I cannot send the IP address and receive the description correctly.
Majorly I have two questions:
I just can not output the ip_and_info array, and can't find the reason. Can anybody tell me what went wrong?
Now, the code I wrote can retrieve all the information that I need, there are around 200 ip addresses in the test_ip.txt. Would there be a potential problem if I try to send all those 1M IP addresses?
Is there anyone can give me some advice?
Much Appreciated.
My code is as below:
fs = require('fs')
async = require("async")
http = require('http')
ip_and_info = []
// getIPInfo("1.171.58.24")
// Read the list of IPs (one per line) and start a lookup for each of them.
fs.readFile("../test_ips.txt", "utf-8", (err, content, printArr) => {
content = content.split("\n")
// NOTE(review): the iteratee below takes only `ip` and never signals completion;
// async.each presumably needs a per-item callback to be invoked before the final
// callback runs - confirm against the async documentation.
async.each(content, (ip) => {
// getIPInfo has no return statement of its own, so this assigns undefined to content.
content = getIPInfo(ip)
// console.log(ip)
}, (err) => {
if (err) {
console.log(err)
} else {
console.log(ip_and_info)
}
})
// for (i in content) {
// ((ip) => {
// getIPInfo(ip)
// })(content[i])
// }
});
// Requests /csv/<ipAddress> from freegeoip.net, logs the response body and appends it to
// ip_and_info.
// NOTE(review): options, request, bodyChunks, body and content are assigned without a
// declaration, so they are globals shared by every call that is in flight at the same time.
function getIPInfo(ipAddress) {
options = {
host: 'freegeoip.net',
path: '/csv/' + ipAddress
}
request = http.get(options, function(response) {
// console.log('STATUS: ' + response.statusCode)
// console.log('HEADERS: ' + JSON.stringify(response.headers))
// Buffer the body entirely for processing as a whole.
bodyChunks = []
response.on('data', function(chunk) {
bodyChunks.push(chunk)
}).on('end', function() {
body = Buffer.concat(bodyChunks)
content = body.toString('ascii')
ip_and_info.push(content)
console.log(content)
// This return belongs to the 'end' handler, not to getIPInfo itself.
return content
})
})
request.on('error', function(e) {
// Request failures are only logged.
console.log('ERROR: ' + e.message)
})
}
Much Appreciated!
The problem lies in this line
content = getIPInfo(ip)
getIPInfo should be an async function. One way of doing it would be to send a callback to the function and in the function return the output in the callback.
async.each(content, getIPInfo, (err) => {
if (err) {
console.log(err)
} else {
console.log(ip_and_info)
}
})
And in the getIPInfo function
function getIPInfo(ipAddress, callback) {
.....
.....
ip_and_info.push(content)
callback();
}
Also, instead of using async.each use async.eachSeries or async.eachLimit else it will try to send request for all 1,211,434 ips .
Use Promise.
Use the let and const keywords. Seriously, implicit global aren't fun.
Decide whether to use ' or " and stick with it, it is way more readable.
With Promise, no need for async or your ip_and_info variable.
'use strict';
const fs = require('fs'),
http = require('http');
// Read the IP list (one address per line), look every address up and print the results.
fs.readFile('../test_ips.txt', 'utf-8', (err, content) => {
  if (err) {
    // Without this guard a missing or unreadable file would crash on content.split below.
    console.error('Error: ' + err);
    return;
  }
  // Trim each line and drop empty ones so that a trailing newline or Windows line endings
  // do not turn into bogus lookups.
  const ips = content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line !== '');
  getAllIPInfo(ips).then((ipsInfos) => {
    console.log('Info:' + ipsInfos);
  }).catch((error) => {
    console.error('Error: ' + error);
  });
});
/**
 * Look up every address in `ipsAddress`, one request at a time.
 *
 * @param {string[]} ipsAddress - addresses to look up, in order.
 * @returns {Promise<string[]>} resolves with the getIPInfo result for each
 *   address, in input order; rejects with the first lookup error, after which
 *   no further lookups are started.
 */
async function getAllIPInfo(ipsAddress) {
  const ipsInfo = [];
  // Awaiting inside the loop keeps the requests strictly sequential, which is what the
  // reduce()-built promise chain did, without wrapping it in an extra `new Promise`.
  for (const ip of ipsAddress) {
    ipsInfo.push(await getIPInfo(ip));
  }
  return ipsInfo;
}
/**
 * Request /csv/<ipAddress> from freegeoip.net.
 * Resolves with the whole response body decoded as ASCII; on a request error
 * the message is logged and the promise rejects with that error.
 */
function getIPInfo(ipAddress) {
  return new Promise((resolve, reject) => {
    const requestOptions = {
      host: 'freegeoip.net',
      path: '/csv/' + ipAddress
    };
    const request = http.get(requestOptions, (response) => {
      // Collect the raw chunks and decode them in one go once the response has ended.
      const received = [];
      response.on('data', (piece) => {
        received.push(piece);
      });
      response.on('end', () => {
        resolve(Buffer.concat(received).toString('ascii'));
      });
    });
    request.on('error', (e) => {
      console.log('ERROR: ' + e.message);
      reject(e);
    });
  });
}
I think your problem might be that you are re-declaring the 'content' variable each loop you make.
So perhaps change the loop to this so you don't reset the variable each time the loop executes. I hope that fixes your issue:
IPList = content.split("\n")
async.each(IPList, (ip) => {
IPGeoLocation = getIPInfo(ip)
console.log(IPGeoLocation)
}, (err) => {
As for doing this with a million IPs, I can't see a major problem as long as you have a decent amount of memory on your computer. You might like to add a 'wait' call so you don't hammer the server so consistently. They might block you!
I would wait 1 second between each call by adding
sleep(1000);
after getting the IP.

Categories

Resources