I got a simple async function where I "scrape" site from URLs.
Everything works fine, but now I want to save results into my txt file.
I tried to do simply array where I able to push every result also errors;
Now I got a problem where should I do write to file.
I tried putting it to a separated function then do await function inside my async function but function with write to file i always fired first.
There is full code
const https = require("https");
const fs = require("fs");
const readline = require("readline");
const path = require("path");
let urls = [];
let results = [];
(async function readUrls() {
const fileStream = fs.createReadStream("urls.txt");
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
});
for await (let line of rl) {
urls.push(line);
}
for await (let url of urls) {
https
.get(url, (res) => {
const {
statusCode
} = res;
const contentType = res.headers["content-type"];
let error;
if (statusCode !== 200) {
error = new Error("Request Failed.\n" + `Status Code: ${statusCode}`);
}
if (error) {
const firstPath = url.split("/")[7];
//there is array
results.push(firstPath);
//--------------
console.error("data : " + firstPath + " - " + " nothing found");
res.resume();
return;
}
res.setEncoding("utf8");
let rawData = "";
res.on("data", (chunk) => {
rawData += chunk;
});
(async () => {
await res.on("end", () => {
try {
const parsedData = JSON.parse(rawData);
const parsedResult = parsedData["data"]["id"] + " - " + parsedData["data"]["price"];
//there is array
results.push(parsedResult);
//--------------
console.log("data : " + parsedData["data"]["id"] + " - " + parsedData["data"]["price"]);
} catch (e) {
console.error(e.message);
}
});
})();
})
.on("error", (e) => {
console.error(`Got error: ${e.message}`);
});
}
})();
There is my simple function to write into file
fs.writeFile('result.txt', results, +(new Date()), function (err) {
if (err) {
console.log("Error occurred", err);
}
console.log("File write successfull");
});
I tried do something
async function secondFunction(){
await firstFunction();
// wait for firstFunction...
};
What I want to achive? I want to scrape every url from my text file and get ID and Price
( this is simple JSON response into browser no html - it works )
At the end I want to save everything into text file.
I made a version of your code that uses node-fetch to call the urls. I prefer this one as it is similar to what one can use on the web
To use it you should install it:
npm install node-fetch
const fetch = require("node-fetch"); // I prefer to use node-fetch for my calls
const fs = require("fs");
const readline = require("readline");
const path = require("path");
let urls = [];
let results = [];
(async function readUrls() {
const fileStream = fs.createReadStream("urls.txt");
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
});
for await (let line of rl) {
urls.push(line);
}
// Make the calls one after the other
for (let url of urls) {
try {
// We can call the urls with node-fetch and await the response
const res = await fetch(url);
const { status } = res;
let error;
if (status !== 200)
error = new Error("Request Failed.\n" + `Status Code: ${statusCode}`);
if (error) {
const firstPath = url.split('/')[7];
results.push(firstPath);
console.error("data : " + firstPath + " - " + " nothing found");
// As we are inside a loop here, we use continue instead of return
continue;
}
try {
// Here we try to take the response as json
const parsedData = await res.json();
const parsedResult = parsedData["data"]["id"] + " - " + parsedData["data"]["price"];
//there is array
results.push(parsedResult);
//--------------
console.log(`Data: ${parsedResult}`);
} catch (e) {
// In case we can't get the response as json we log the error
console.error(e.message);
}
} catch (httpError) {
//This is for when the call to fetch fails for some reason
console.error(httpError.message);
}
}
// Here we join the results to a string so that we can save it properly to the file
const resultAsText = results.join("\n");
// Then after all the urls are processed we can write them to a file
fs.writeFile('result.txt', resultAsText, 'utf8', function (err) {
if (err) {
console.log("Error occurred", err);
} else {
console.log("File write successfull");
}
});
})();
Related
am trying to build a web scraper that downloads all the pdfs in a website. i've written all the logic necessary to do this but for some reason it downloads an empty pdf file which is not suppose to be so, the problem seems to be coming from the downloadFile function when i try to pipe the data which for some reason seems not to be working because i get an empty pdf file after the function is ran. i'll would appreciate it if someone can help me out with this problem, thanks.
here's a sample of my code:
app.js
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const axiosInstance = require("./getAxios");
const axios = axiosInstance();
const Surl = "https://www.health.gov.ng/";
// linkList sample: "https://www.health.gov.ng/index.php?option=com_content&view=article&id=143&Itemid=512";
let = connectionFailCount = 0;
let linkList = [];
let dlinkList = [];
const getWebsiteLinks = async (Surl) => {
try {
console.log(`Crawling all links from: ${Surl}`);
const response = await axios.get(Surl);
const $ = cheerio.load(response.data);
const ranges = $("a").each(function (idx, el) {
if ($(el).attr("href")) {
return $(el).attr("href");
}
});
for (let index = 0; index < ranges.length; index++) {
let raw_links = $("a")[index].attribs.href;
if (raw_links.startsWith("/")) {
linkList.push(Surl + raw_links);
}
}
if (linkList.length > 0) {
console.log(`Finished crawling links: Found ${linkList.length} links`);
console.log(
"--------------------------------------------------------\n\n"
);
}
return;
} catch (error) {
if (connectionFailCount === 0) {
connectionFailCount += 1;
getWebsiteLinks(Surl);
console.log(`Connection error. \n
Reconnecting to server....`);
} else if (connectionFailCount === 5) {
console.error(`Can not connect to server. Try again later.`);
}
}
};
const downloadLinks = async (linkList) => {
try {
console.log("Crawling links to find pdf links. this may take a while...");
for (const link of linkList) {
const response = await axios.get(link);
// Skip where there's delayed server response
if (response.code === "ECONNRESET") continue;
const $ = cheerio.load(response.data);
$("a").each(function (idx, el) {
if ($(el)?.attr("href")?.endsWith(".pdf")) {
let addr = $(el).attr("href");
let dlink = Surl + addr;
dlinkList.push({
pathName: addr,
url: dlink,
});
}
});
}
console.log(dlinkList);
if (dlinkList.length > 0) {
console.log(`Crawling Finish: Found ${dlinkList.length} pdf links`);
console.log(
"--------------------------------------------------------\n\n"
);
}
} catch (error) {
if (connectionFailCount === 0) {
connectionFailCount += 1;
console.log(`Connection error. \n
Reconnecting to server: ${connectionFailCount} count`);
downloadLinks(linkList);
}
if (connectionFailCount === 3) {
console.error(`Can not connect to server. Try again later.`);
return;
}
// console.error("downloadLinksError: ", error);
}
};
const downloadFiles = async (dlinkList) => {
console.log("Creating directory to save PDF files");
const appRoot = path.dirname(path.resolve(__dirname));
// Had to change and restructure code due to error
const folderName = `PDF/${Surl.split("/").pop()}`;
const subFolderName = Surl.split("/").pop();
try {
if (!fs.existsSync(path.join(appRoot, folderName))) {
fs.mkdirSync(path.join(appRoot, "PDF"));
fs.mkdirSync(path.join(`${appRoot}/PDF`, subFolderName));
}
dlinkList.forEach(async (link) => {
let name = link.pathName;
let url = link.url;
let file = fs
.createWriteStream(
`${appRoot}/${folderName}/${name.split("/").pop()}`,
"utf-8"
)
.on("error", (err) => {
console.error("createWriteStreamError: ", err);
});
try {
console.log("Downloading PDF file...");
const { data } = await axios({
url,
method: "GET",
responseType: "stream",
});
if (data) {
console.log("PDF file Downloaded");
data.pipe(file);
}
} catch (error) {
console.error(error);
}
});
return;
} catch (error) {
console.error("downloadFilesError: ", error);
}
};
(async () => {
await getWebsiteLinks(Surl);
await downloadLinks(linkList);
await downloadFiles(dlinkList);
})();
getAxios.js
const axios = require("axios");
const https = require("https");
module.exports = function () {
const domain = "https://www.health.gov.ng/";
let instance;
if (!instance) {
//create axios instance
instance = axios.create({
baseURL: domain,
timeout: 60000, // Increase time out incase of network delay or delayed server response
maxContentLength: 500 * 1000 * 1000, // Increase maximum response ata length
httpsAgent: new https.Agent({ keepAlive: true }),
headers: { "Content-Type": "application/xml" },
});
}
return instance;
};
I am trying to get a value from redis which I have set. When I call the checkCache function I get "CACHE: null" and afterwards it logs "NO ERROR + the data". I dont understand why since I used await.
const redis = require("redis");
const client = redis.createClient({
port : procces.env.PORT,
host : procces.env.HOST,
password : procces.env.PASSWORD,
});
const checkCache = async (key) => {
await client.get(key, (err, data) => {
if(err) throw err;
console.log("NO ERROR + " + data);
if(data !== null) { return JSON.parse(data); }
});
return null;
};
I call the methode like this:
const findAll = async(user) => {
const cache = await checkCache(user);
console.log('CACHE: ' + cache); // returns null
};
Module redis doesn't support Promise.
For this you have promisify it
const Promise = require('bluebird');
const redis = Promise.promisifyAll(require("redis"));
EDIT:
const Promise = require('bluebird');
const redis = Promise.promisifyAll(require("redis"));
const client = redis.createClient({
port : process.env.PORT,
host : process.env.HOST,
password : process.env.PASSWORD,
});
const checkCache = async (key) => {
const data = await client.getAsync(key);
console.log("NO ERROR + " + data);
if(data !== null) { return JSON.parse(data); }
return null;
};
I have a script that reads an excel file and gets data from a specific column to perform a search on the Google Maps API where I use axios. For each request made, I need to save it in the newFileList variable. After completing all the requests, I must save the contents of this variable in a file. However, whenever I run my code, the file is being saved without the content of the newFileList variable. How do I wait for all requests to finish before being able to save the content in the file?
Note: the reading, writing and requesting data are working. I just need the rescue to happen only after all the loop requests are finished. I tried to solve by placing the loop inside a promisse and at the end of the execution of this loop I used resolve.
const xlsx = require("node-xlsx");
const fs = require("fs");
const coordinate = require("./coordinate");
const resourcePath = `${__dirname}/resources`;
const contentFile = xlsx.parse(`${resourcePath}/file-2.xlsx`)[0].data;
const newFile = [[...contentFile, ...["Latitude", "Longitude"]]];
for (let i = 1; i < contentFile.length; i++) {
const data = contentFile[i];
const address = data[2];
coordinate
.loadCoordinates(address)
.then((response) => {
const { lat, lng } = response.data.results[0].geometry.location;
newFile.push([...data, ...[lat.toString(), lng.toString()]]);
})
.catch((err) => {
console.log(err);
});
}
console.log(newFile);
//The code below should only be executed when the previous loop ends completely
var buffer = xlsx.build([{ name: "mySheetName", data: newFile }]); // Returns a buffer
fs.writeFile(`${resourcePath}/file-3.xlsx`, buffer, function (err) {
if (err) {
return console.log(err);
}
console.log("The file was saved!");
});
The coordinate file:
const axios = require("axios");
module.exports = {
loadCoordinates(address) {
const key = "abc";
return axios
.get(`https://maps.googleapis.com/maps/api/geocode/json`, {
params: {
address,
key,
},
})
},
};
Will using an async IIFE help?
const xlsx = require("node-xlsx");
const fs = require("fs");
const coordinate = require("./coordinate");
const resourcePath = `${__dirname}/resources`;
const contentFile = xlsx.parse(`${resourcePath}/file-2.xlsx`)[0].data;
const newFile = [[...contentFile, ...["Latitude", "Longitude"]]];
(async() => {
try{
for (let i = 1; i < contentFile.length; i++) {
const data = contentFile[i];
const address = data[2];
await coordinate
.loadCoordinates(address)
.then((response) => {
const { lat, lng } = response.data.results[0].geometry.location;
newFile.push([...data, ...[lat.toString(), lng.toString()]]);
})
.catch((err) => {
console.log(err);
});
}
console.log(newFile);
//The code below should only be executed when the previous loop ends completely
var buffer = xlsx.build([{ name: "mySheetName", data: newFile }]); // Returns a buffer
fs.writeFile(`${resourcePath}/file-3.xlsx`, buffer, function (err) {
if (err) {
return console.log(err);
}
console.log("The file was saved!");
});
} catch(e) {
console.log(e)
}
})();
Do note that I added await before coordinate.loadCoordinates, in order to make sure the first axios request is finished before we proceed to the next one.
You need to use Promise.all() to wait until all the promises are resolved. After that execute the writeToFile part. For more info on Promise.all(), you can refer https://www.javascripttutorial.net/es6/javascript-promise-all/
const requestPromiseArray = [];
for (let i = 1; i < contentFile.length; i++) {
const data = contentFile[i];
const address = data[2];
requestPromiseArray.push(coordinate
.loadCoordinates(address))
}
Promise.all(requestPromiseaArray).then(results=>{
// Handle "results" which contains the resolved values.
// Implement logic to write them onto a file
var buffer = xlsx.build([{ name: "mySheetName", data: results }]);
fs.writeFile(`${resourcePath}/file-3.xlsx`, buffer, function (err) {
if (err) {
return console.log(err);
}
console.log("The file was saved!");
});
})
So i am having a few issues trying to figure out how to fix the snip below. As of right now it is returning the values before the 'request(scanurl,.....' section of the for of loop is running. The loop does run. Also i do not think the 'lines200' var is being updated via the counter. I am a learner so any explanation would be greatly appreciated.
async function processLineByLine(baseData) {
console.log('enter async func')
try {
const fileStream = fs.createReadStream('./file.txt');
let linesTotal = 0;
let lines200 = 0;
const rl = readline.createInterface({
input: fileStream
});
for await (let line of rl) {
console.log('in loop')
const scanurl = (baseData.match(/http/gi)) ? (baseData) : ('https://' + baseData + line);
linesTotal++;
request(scanurl, {json: true}, function (error, response, body) {
let statusCode = response.statusCode;
let htmlBody = body;
//console.log(htmlBody)
//console.log(statusCode)
if (statusCode == "200") {
console.log('in 2nd if')
let $ = cheerio.load(htmlBody);
let titleScrape = $('title').html();
console.log(titleScrape)
if (titleScrape.match(/404 | test/gi)) {
console.log('Matched')
} else {
lines200++;
console.log(lines200)
}
} else {
// Do nothing
}
});
}
return {
total: linesTotal,
count200: lines200,
};
} catch (error) {
console.error(error)
}
}
router.get('/:reqTarget', async (req, res) => {
console.log('starting')
var baseUrl = req.params.reqTarget;
try {
console.log('in the try')
const initTest = await processLineByLine(baseUrl);
const {total, count200} = initTest;
console.log(total, count200)
if (initTest) return res.status(200).send({
message: 'STATUS 200 COUNT: ' + count200 + ' ' + 'TOTAL: ' + total });
} catch (error) {
console.log(error)
}
});
Current Output:
starting
in the try
enter async func
in loop
in loop
in loop
in loop
in loop
in loop
in loop
33 0 //this is the return that is two early
in 2nd if
404 | test
Matched
in 2nd if
404 | test
Matched
When you have a loop containing asynchronous operations, you have one of two options. You can run them all in parallel and somehow track when they are all done. Or, you can run them sequentially one after the other. It appears your loop could be constructed either way, but I'll illustrate the sequential option.
The advent of async/await allows us to "pause" a for loop in the middle with an appropriate await. But, in order to do that, all asynchronous operations have to be promise-based so you can await those promises. To that end, I've switched from the request() library to the request-promise-native library which is a promise wrapper around the request library that uses native, built-in promises. It also has another nice feature in that it automatically checks for a 2xx status code so you don't have to do that yourself.
Here's what that code would look like:
const rp = require('request-promise-native');
async function processLineByLine(baseData) {
console.log('enter async func')
try {
const fileStream = fs.createReadStream('./file.txt');
let linesTotal = 0;
let lines200 = 0;
const rl = readline.createInterface({
input: fileStream
});
for await (let line of rl) {
console.log('in loop')
const scanurl = (baseData.match(/http/gi)) ? (baseData) : ('https://' + baseData + line);
linesTotal++;
try {
let htmlBody = await rp(scanurl, {json: true});
let $ = cheerio.load(htmlBody);
let titleScrape = $('title').html();
console.log(titleScrape);
if (titleScrape.match(/404 | test/gi)) {
console.log('Matched')
} else {
lines200++;
console.log(lines200)
}
} catch(e) {
console.log(`error on request(${scanurl})`, e);
// like your original code, this will only log the error
// and then continue with the rest of the URLs
}
}
return {
total: linesTotal,
count200: lines200,
};
} catch (error) {
console.error(error)
}
}
router.get('/:reqTarget', async (req, res) => {
console.log('starting')
var baseUrl = req.params.reqTarget;
try {
console.log('in the try')
const initTest = await processLineByLine(baseUrl);
const {total, count200} = initTest;
console.log(total, count200)
res.status(200).send({message: 'STATUS 200 COUNT: ' + count200 + ' ' + 'TOTAL: ' + total});
} catch (error) {
console.log(error)
res.sendStatus(500);
}
});
I am new to node.js and Javascript.
I have two javascript files "FabricUserController.js" and "UserController.js". So I have create the class in "FabricUserController.js" and export it into "UserController.js".
I am integrate the GetAll fucntion of "FabricUserController.js" to "UserController.js" in GetAllProduce fucntion.
I am trying run the below code however its giving me "TypeError: FabricUserControllers is not a constructor" error which is not handle in try catch{} block
Please see below code
let FabricUserControllers3 = require("./FabricUserController");
GetAllProduce: function (req, res, next) {
try{
let output = {};
var resArray = new Array();
let VZID = req.body.username;
console.log('test', 'GetAllProduce')
console.log('USername', VZID)
MongoClient.connect(config.Database.TEST.connectString, function (err, client) {
if (err) {
let connError = new Error(500, "Error connecting to TEST database", err);
res.status(connError.status).json(connError);
} else {
let query = {};
client.db(config.Database.TEST.dbName).collection("Produce").find(query).toArray(function (err, response) {
console.log(response);
if (err) {
let roleError = new Error(500, "Error getting Produce information", err);
res.status(500).json(roleError);
} else if (response.length > 0) {
//DO someting here
//FabricUserControllers3 = {};
FabricUserControllers3 = new FabricUserControllers();// getting issue here
FabricUserControllers3.GetAll((VZID), (response) => {
console.log("data result", result)
res.status(200).json(response);
client.close();
})
} else {
output.message = "Produce doesn't exist";
res.status(409).json(output);
client.close();
}
});
}
});
}catch(e){
if (e instanceof TypeError){
console.log('error1', e.message);
printError(e,true);
}else{
console.log("error2", e.message);
printError(e, false);
}
}
},
FabricUserController.js
'use strict';
const {
FileSystemWallet,
Gateway
} = require('fabric-network');
const fs = require('fs');
const path = require('path');
var MongoClient = require('mongodb').MongoClient;
var Client = require('node-rest-client').Client;
var client = new Client();
const configPath = path.resolve(__dirname, '..', 'config', 'Config.json');
const configJSON = fs.readFileSync(configPath, 'utf8');
const config1 = JSON.parse(configJSON);
var connection_file = config1.connection_file;
var appAdmin = config1.appAdmin;
var gatewayDiscovery = config1.gatewayDiscovery;
var appAdminSecret = config1.appAdminSecret;
var orgMSPID = config1.orgMSPID;
var caName = config1.caName;
const ccpPath = path.resolve(__dirname, '..', 'config', 'connection.json');
const ccpJSON = fs.readFileSync(ccpPath, 'utf8');
const ccp = JSON.parse(ccpJSON);
let response = {};
class FabricUserControllers {
constructor() {
console.log("constructer called")
}
async ProduceRegistration(Username, produceid, callback) {
// Create a new file system based wallet for managing identities.
try {
const setAsyncTimeout = (cb, timeout = 0) => new Promise(resolve => {
setTimeout(() => {
cb();
resolve();
}, timeout);
});
let query2 = {}
query2.PRODUCEID = produceid;
// console.log('PRODUCEID',produceid)
var PRODUCE = {};
const walletPath = path.join(process.cwd(), 'wallet');
const wallet = new FileSystemWallet(walletPath);
console.log(`Wallet path: ${walletPath}`);
console.log('Username', Username)
// Check to see if we've already enrolled the user.
const userExists = await wallet.exists(Username);
if (!userExists) {
console.log('An identity for the user: ' + Username + ' does not exist in the wallet');
console.log('call the registerUser before retrying');
response.data = null;
response.httpstatus = 400;
response.message = `An identity for the ${Username} does not exist in the wallet`;
return response;
}
// Create a new gateway for connecting to our peer node.
const gateway = new Gateway();
await gateway.connect(ccpPath, {
wallet,
identity: Username,
discovery: {
enabled: false,
asLocalhost: true
}
});
///
MongoClient.connect(config.Database.TEST.connectString, function (err, client) {
if (err) {
// let connError = new Error(500, "Error connecting to TEST database", err);
response.data=null;
response.httpstatus = 500;
response.message = "Error connecting to TEST database :" + err;
// res.status(connError.status).json(connError);
return response;
} else {
client.db(config.Database.TEST.dbName).collection("Produce").find(query2).toArray(function (err, docs) {
if (err) {
response.httpstatus = 500;
response.message = "Error with DB :" + err;
return response;
}
else{
console.log("blockchain_status", docs[0].blockchain_status)
console.log('Role name DB',docs);
console.log('Role name DB1',docs[0]);
if(docs[0].STATUS)
PRODUCE.produceid = docs[0].PRODUCEID;
PRODUCE.produceName = docs[0].PRODUCE;
PRODUCE.farmLocation = docs[0].FARMLOCATION;
PRODUCE.plantingDate = docs[0].PLANTINGDATE;
PRODUCE.harvestDate = docs[0].HARVESTDATE;
PRODUCE.status = docs[0].STATUS;
PRODUCE.produceQuantites = docs[0].VARIETY;
PRODUCE.gapInfo = docs[0].GAP;
PRODUCE.farmerID = docs[0].farmerID;
console.log('Produce', PRODUCE);
const doStuffAsync = async () => {
setAsyncTimeout(async () => {
// Get the network (channel) our contract is deployed to.
const network = await gateway.getNetwork('dfarmchannel');
// Get the contract from the network.
const contract = network.getContract(config1.chaincodeName);
var args = JSON.stringify(PRODUCE)
console.log("type of arg", typeof (args));
// Submit the specified transaction.
// console.log('produceID', args.produceID);
if(args==null || args==''){
console.log('Server not responding please try again');
}else
{
const result = await contract.submitTransaction('ProduceRegistration', args);
var argsJson = JSON.parse(result)
// console.log('result', argsJson)
// console.log('result1', result)
if(argsJson.produceID !="" && argsJson.produceID !=null && argsJson.produceID !="undefined" && argsJson.produceID !=undefined){
// // return false;
response.data = result
response.httpstatus = 200;
response.message = `Transaction has been submitted ansd successfull with Result :${result}`;
return callback(response);
// console.log('result before', response);
// console.log('Transaction has been submitted ansd successfull with Result :' + result);
}else{
console.log('blockchain server not responed')
// return false
response.httpstatus = 500;
response.message = `Please enter produce ID :`;
return response;
}
}
}, 4000);
};
doStuffAsync();
}
client.close();
})
}
})
await gateway.disconnect();
}
catch (error) {
// if(error) throw error;
response.error = error;
response.httpstatus = 500;
response.message = "Failed to enroll admin due to above error";
return response;
}
};
}
module.exports = FabricUserControllers;
#Abhirock, on your main file you have:
let FabricUserControllers3 = require("./FabricUserController");
FabricUserControllers3 = new FabricUserControllers();// getting issue here
You are trying to override FabricUserControllers3 creating a new object FabricUserControllers but you are not importing it. Try next solution to see if it solves your problem:
const FabricUserController = require("./FabricUserController");
const fabricUserControllers3 = new FabricUserController();
Hope it helps :))