I am new to Scraping.
This is my PDF downloading code.
I want to use Async Await in this code.
I don't know where I have to use async await in my code.
function scrapPdf(config, search_url, message) {
console.log('PDF downloading');
got(search_url).then(response => {
const $ = cheerio.load(response.body);
$('.search-result').find('li > a').each((idx, elem) => {
if($(elem).text().trim() == 'PDF'){
const item = $(elem).attr('href');
pdf_lists.push(item);
}
})
$('ul.pagination').find('li.page-item').each((idx, elem) => {
if($(elem).attr('class').includes('page-item active navigation')){
if($(elem).next().hasClass('page-item navigation')){
scrapPdf(config, $(elem).next('li').find('a').attr('href'), message);
} else {
const search_result_dir = `./${message.date_ini}-${message.date_end}`;
if(!fs.existsSync(search_result_dir)){
fs.mkdirSync(search_result_dir)
}
for(let i = 0;i < pdf_lists.length; i++){
const download = new DownloaderHelper(pdf_lists[i], search_result_dir);
download.on('end', () => console.log('Download Completed'))
download.start();
}
console.log(`${pdf_lists.length} files Downloaded!`);
uploadFile(search_result_dir);
return ;
}
console.log($(elem).next('li').find('a').attr('href'));
}
});
}).catch(err => {
console.log(err);
});
}
Here's one possibility assuming you only want to use async/await on the Promise returned from the .then:
async function scrapPdf(config, search_url, message) {
let response = null
console.log('PDF downloading');
try {
response = await got(search_url);
} catch (err) {
console.log(err);
}
if (response) {
const $ = cheerio.load(response.body);
$('.search-result').find('li > a').each((idx, elem) => {
if($(elem).text().trim() == 'PDF'){
const item = $(elem).attr('href');
pdf_lists.push(item);
}
})
$('ul.pagination').find('li.page-item').each((idx, elem) => {
if($(elem).attr('class').includes('page-item active navigation')){
if($(elem).next().hasClass('page-item navigation')){
scrapPdf(config, $(elem).next('li').find('a').attr('href'), message);
} else {
const search_result_dir = `./${message.date_ini}-${message.date_end}`;
if(!fs.existsSync(search_result_dir)){
fs.mkdirSync(search_result_dir)
}
for(let i = 0;i < pdf_lists.length; i++){
const download = new DownloaderHelper(pdf_lists[i], search_result_dir);
download.on('end', () => console.log('Download Completed'))
download.start();
}
console.log(`${pdf_lists.length} files Downloaded!`);
uploadFile(search_result_dir);
return ;
}
console.log($(elem).next('li').find('a').attr('href'));
}
});
}
}
Related
I tried to make the code asynchronous but I couldn't. What i need to do?
This is my functions:
1.
router.post('/urls', (req, response) => {
count = 2;
webUrl = req.body.url;
depth = req.body.depth;
letstart(webUrl, response);
});
function letstart(urlLink, response) {
request(urlLink, function (error, res, body) {
console.error('error:', error); // Print the error if one occurred
console.log('statusCode:', res && res.statusCode); // Print the response status code if a response was received
//console.log('body:', body); // Print the HTML for the Google homepage.
if (!error) {
getLinks(body);
if (!ifFinishAll) {
GetinsideLinks(linkslinst, response);
}
else {
console.log("Finish crawl");
}
}
else {
console.log("sorry");
return "sorry";
}
});
}
function GetinsideLinks(list, response) {
count++;
if (count <= depth) {
for (let i = 0; i < list.length; i++) {
const link = list[i].toString();
var includeUrl = link.includes(webUrl);
if (!includeUrl) {
request(link, function (error, res, body) {
console.error('error2:', error); // Print the error if one occurred
console.log('statusCode2:', res && res.statusCode); // Print the response status code if a response was received
if (!error) {
getLinks(body);
}
else {
console.log("sorry2");
}
});
}
}
ifFinishAll = true;
}
else {
console.log("finish");
ifFinishAll = true;
response.status(200).send(resArray);
};
return resArray;
}
function getLinks(body) {
const html = body;
const $ = cheerio.load(html);
const linkObjects = $('a');
const links = [];
linkObjects.each((index, element) => {
countLinks = linkObjects.length;
var strHref = $(element).attr('href');
var strText = $(element).text();
var existUrl = linkslinst.includes(strHref);
var existText = textslist.includes(strText);
if (strText !== '' && strText !== "" && strText !== null && strHref !== '' && strHref !== "" && strHref !== null && strHref !== undefined && !existUrl && !existText) {
var tel = strHref.startsWith("tel");
var mail = strHref.startsWith("mailto");
var linkInStart = isUrlValid(strHref);
if (!tel && !mail) {
if (linkInStart) {
links.push({
text: $(element).text(), // get the text
href: $(element).attr('href'), // get the href attribute
});
linkslinst.push($(element).attr('href'));
textslist.push($(element).text());
}
else {
links.push({
text: $(element).text(), // get the text
href: webUrl.toString() + $(element).attr('href'), // get the href attribute
});
linkslinst.push(webUrl.toString() + $(element).attr('href'))
textslist.push($(element).text());
}
}
}
});
const result = [];
const map = new Map();
for (const item of links) {
if (!map.has(item.text)) {
map.set(item.text, true); // set any value to Map
result.push({
text: item.text,
href: item.href
});
}
}
if (result.length > 0) {
resArray.push({ list: result, depth: count - 1 });
}
console.log('res', resArray);
return resArray;
}
I want to return/response finally to the "resArray". I tried to add async and await to function number 1 and number 2 but it didn't succeed. Maybe I need to add async/await to all functions? How can I fix that?
You can achieve your goal by using async-await.
An async function is a function declared with the async keyword, and the await keyword is permitted within them. The async and await keywords enable asynchronous, promise-based behavior to be written in a cleaner style, avoiding the need to explicitly configure promise chains.
Basic example:
function resolveImmediately() {
return new Promise(resolve => {
resolve(true);
});
}
function resolveAfter2Seconds() {
return new Promise(resolve => {
setTimeout(() => {
resolve('resolved');
}, 2000);
});
}
async function asyncCall() {
console.log('calling');
const result = await resolveImmediately();
console.log(result);
if(result) {
const anotherResult = await resolveAfter2Seconds();
console.log(anotherResult);
}
}
asyncCall();
Note: Your code is too long to debug. As a result, to make you understand about the approach (what & how to do), i have added a simple example into my answer.
I can't figure this one out. I have one function that connects to an SFTP server and downloads files. Then, I have a second function that reads the contents, puts the data in an array, and returns the array.
The problem is that the second function always runs first. I tried different methods but I can't get it to work. That connection to SFTP is quite slow, it can take like 10+ seconds to finish. But I need to somehow wait for it to finish before doing anything else.
const SFTPConfig = require('../config/keys').sftpconfig;
const getCSATFiles = async function(targetDate) {
try {
let Client = require('ssh2-sftp-client');
let sftp = new Client();
const date = moment(targetDate);
var dir = `../csv/${targetDate}/`;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}
sftp
.connect(SFTPConfig, 'once')
.then(() => {
return sftp.list('/In/Archives/');
})
.then(data => {
data.forEach(item => {
const fileName = item.name;
const remotePath = '/In/Archives/' + fileName;
const localePath = path.join(dir + fileName);
if (
moment(item.modifyTime)
.format('YYYY-MM-DD hh:mm')
.toString()
.slice(0, 10) ===
date
.format('YYYY-MM-DD hh:mm')
.toString()
.slice(0, 10)
) {
sftp
.fastGet(remotePath, localePath, {})
.then(() => {
console.log('finished getting the files!');
sftp.end();
})
.catch(err => {
sftp.end();
console.log(err, 'fastGet method error');
});
}
});
});
} catch (error) {
console.log(error);
}
};
const readCSVFiles = async function(targetDate) {
try {
const casesBO = [];
var dir = `../csv/${targetDate}/`;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}
const allLocalFiles = path.join(__dirname, dir);
const readDir = util.promisify(fs.readdir);
const files = await readDir(allLocalFiles);
for (let file of files) {
fs.createReadStream(allLocalFiles + file)
.pipe(csv.parse({ headers: true, delimiter: ';' }))
.on('error', error => console.error(error))
.on('data', row => {
if (row['[REGION2]'] !== 'FR') {
casesBO.push(row['[CALLERNO_EMAIL_SOCIAL]']);
console.log(
`${row['[AGENT]']} is ${row['[REGION2]']} and case = ${
row['[CALLERNO_EMAIL_SOCIAL]']
}`
);
}
})
.on('end', rowCount => {
console.log(`Parsed ${rowCount} rows`);
});
}
return casesBO;
} catch (error) {
console.log(error);
}
};
const testFunc = async () => {
const csatfiles = await getCSATFiles('2021-02-03');
const boData = await readCSVFiles('2021-02-03');
console.log(boData);
};
testFunc();
#1 as #messerbill suggested, you need to return the promise from your function.
#2 your promise has a loop inside of it that have more promises. In this case, you need to collect those promises and use Promise.all to resolve them before you second function runs. I put comments on the lines you need to change below. Try this:
const SFTPConfig = require('../config/keys').sftpconfig;
const getCSATFiles = function(targetDate) {
try {
let Client = require('ssh2-sftp-client');
let sftp = new Client();
const date = moment(targetDate);
var dir = `../csv/${targetDate}/`;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}
// I return the promise here
return sftp
.connect(SFTPConfig, 'once')
.then(() => {
return sftp.list('/In/Archives/');
})
.then(data => {
// I set up my promises as a blank array
const promises = [];
data.forEach(item => {
const fileName = item.name;
const remotePath = '/In/Archives/' + fileName;
const localePath = path.join(dir + fileName);
if (
moment(item.modifyTime)
.format('YYYY-MM-DD hh:mm')
.toString()
.slice(0, 10) ===
date
.format('YYYY-MM-DD hh:mm')
.toString()
.slice(0, 10)
) {
// I collect the promises here
promises.push(sftp
.fastGet(remotePath, localePath, {})
.then(() => {
console.log('finished getting the files!');
sftp.end();
})
.catch(err => {
sftp.end();
console.log(err, 'fastGet method error');
}));
}
});
// I resolve them here
return Promise.all(promises);
});
} catch (error) {
console.log(error);
}
};
const readCSVFiles = async function(targetDate) {
try {
const casesBO = [];
var dir = `../csv/${targetDate}/`;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}
const allLocalFiles = path.join(__dirname, dir);
const readDir = util.promisify(fs.readdir);
const files = await readDir(allLocalFiles);
for (let file of files) {
fs.createReadStream(allLocalFiles + file)
.pipe(csv.parse({ headers: true, delimiter: ';' }))
.on('error', error => console.error(error))
.on('data', row => {
if (row['[REGION2]'] !== 'FR') {
casesBO.push(row['[CALLERNO_EMAIL_SOCIAL]']);
console.log(
`${row['[AGENT]']} is ${row['[REGION2]']} and case = ${
row['[CALLERNO_EMAIL_SOCIAL]']
}`
);
}
})
.on('end', rowCount => {
console.log(`Parsed ${rowCount} rows`);
});
}
return casesBO;
} catch (error) {
console.log(error);
}
};
const testFunc = async () => {
const csatfiles = await getCSATFiles('2021-02-03');
const boData = await readCSVFiles('2021-02-03');
console.log(boData);
};
testFunc();
You need to take care, that you return the promise you want to resolve inside the function body in order to get the promise resolved at the right time.
async function promiseNotReturned() {
new Promise(resolve => setTimeout(resolve.bind(null), 5000))
}
async function promiseReturned() {
return new Promise(resolve => setTimeout(resolve.bind(null), 5000))
}
async function run() {
await promiseNotReturned()
console.log("does not wait for 5 seconds")
await promiseReturned()
console.log("waits for 5 seconds")
}
run()
I know that old school for loop works in the traditional way - that it waits for the await to finish getting results.
But in my use case, I need to read a file from local/s3 and process it line by line, and for each line I need to call an External API.
Generally I use await inside the loop because all are running inside a lambda and I don't want to use all memory for running it parallelly.
Here I am reading the file using a stream.on() method, and in order to use await inside that, I need to add async in read method, like so:
stream.on('data',async () =>{
while(data=stream.read()!==null){
console.log('line');
const requests = getRequests(); // sync code,no pblms
for(let i=0;i<requests.length;i++){
const result = await apiCall(request[i);
console.log('result from api')
const finalResult = await anotherapiCall(result.data);
}
}
});
This is working but order in which the lines are processed is not guaranteed. I need all in a sync manner. Any help?
Complete Code
async function processSOIFileLocal (options, params) {
console.log('Process SOI file');
const readStream = byline.createStream(fs.createReadStream(key));
readStream.setEncoding('utf8');
const pattern = /^UHL\s|^UTL\s/;
const regExp = new RegExp(pattern);
readStream.on('readable', () => {
let line;
while (null !== (line = readStream.read())) {
if (!regExp.test(line.toString())) {
totalRecordsCount++;
dataObject = soiParser(line);
const { id } = dataObject;
const XMLRequests = createLoSTRequestXML(
options,
{ mapping: event.mapping, row: dataObject }
);
console.log('Read line');
console.log(id);
try {
for (let i = 0;i < XMLRequests.length;i++) {
totalRequestsCount++;
console.log('Sending request');
const response = await sendLoSTRequest(
options,
{ data: XMLRequests[i],
url: LOST_URL }
);
console.log("got response");
const responseObj = await xml2js.
parseStringPromise(response.data);
if (Object.keys(responseObj).indexOf('errors') !== -1) {
fs.writeFileSync(`${ERR_DIR}/${generateKey()}-${id}.xml`, response.data);
failedRequestsCount++;
} else {
successRequestsCount++;
console.log('Response from the Lost Server');
console.log(response[i].data);
}
}
} catch (err) {
console.log(err);
}
}
}
})
.on('end', () => {
console.log('file processed');
console.log(`
************************************************
Total Records Processed:${totalRecordsCount}
Total Requests Sent: ${totalRequestsCount}
Success Requests: ${successRequestsCount}
Failed Requests: ${failedRequestsCount}
************************************************
`);
});
}
async function sendLoSTRequest (options, params) {
const { axios } = options;
const { url, data } = params;
if (url) {
return axios.post(url, data);
// eslint-disable-next-line no-else-return
} else {
console.log('URL is not found');
return null;
}
}
Code needs to flow like so:
read a line in a sync way
process the line and transform the line into an array of two members
for every member call API and do stuff
once line is complete, look for another line, all done in order
UPDATE: Got a workaround..but it fires stream.end() without waiting stream to finish read
async function processSOIFileLocal (options, params) {
console.log('Process SOI file');
const { ERR_DIR, fs, xml2js, LOST_URL, byline, event } = options;
const { key } = params;
const responseObject = {};
let totalRecordsCount = 0;
let totalRequestsCount = 0;
let failedRequestsCount = 0;
let successRequestsCount = 0;
let dataObject = {};
const queue = (() => {
let q = Promise.resolve();
return fn => (q = q.then(fn));
})();
const readStream = byline.createStream(fs.createReadStream(key));
readStream.setEncoding('utf8');
const pattern = /^UHL\s|^UTL\s/;
const regExp = new RegExp(pattern);
readStream.on('readable', () => {
let line;
while (null !== (line = readStream.read())) {
if (!regExp.test(line.toString())) {
totalRecordsCount++;
dataObject = soiParser(line);
const { id } = dataObject;
const XMLRequests = createLoSTRequestXML(
options,
{ mapping: event.mapping, row: dataObject }
);
// eslint-disable-next-line no-loop-func
queue(async () => {
try {
for (let i = 0;i < XMLRequests.length;i++) {
console.log('Sending request');
console.log(id);
totalRequestsCount++;
const response = await sendLoSTRequest(
options,
{ data: XMLRequests[i],
url: LOST_URL }
);
console.log('got response');
const responseObj = await xml2js.
parseStringPromise(response.data);
if (Object.keys(responseObj).indexOf('errors') !== -1) {
// console.log('Response have the error:');
// await handleError(options, { err: responseObj, id });
failedRequestsCount++;
fs.writeFileSync(`${ERR_DIR}/${generateKey()}-${id}.xml`, response.data);
} else {
console.log('Response from the Lost Server');
console.log(response[i].data);
successRequestsCount++;
}
}
} catch (err) {
console.log(err);
}
});
}
}
})
.on('end', () => {
console.log('file processed');
console.log(`
************************************************
Total Records Processed:${totalRecordsCount}
Total Requests Sent: ${totalRequestsCount}
Success Requests: ${successRequestsCount}
Failed Requests: ${failedRequestsCount}
************************************************
`);
Object.assign(responseObject, {
failedRequestsCount,
successRequestsCount,
totalRecordsCount,
totalRequestsCount
});
});
}
Thank You
The sample code at the top of your question could be rewritten like
const queue = (() => {
let q = Promise.resolve();
return (fn) => (q = q.then(fn));
})();
stream.on('data', async() => {
while (data = stream.read() !== null) {
console.log('line');
const requests = getRequests(); // sync code,no pblms
queue(async () => {
for (let i = 0; i < requests.length; i++) {
const result = await apiCall(request[i]);
console.log('result from api');
const finalResult = await anotherapiCall(result.data);
}
});
}
});
Hopefully that will be useful for the complete code
If anyone want a solution for synchronisely process the file, ie, linebyline read and execute some Async call, it's recommended to use inbuilt stream transform. There we can create a transform function and return a callback when finishes.
That's will help of any one face this issues.
Through2 is a small npm library that also can be used for the same.
I have a function that accesses a database to determine total emissions for each ingredients. This value is then saved into an object and into the database. However, the object is being saved before the emission totals can be calculated.
var emission_total = 0;
async function getEmissionTotal() {
for (var x in ingredients) {
Emission.findOne({food_name:ingredients[x]}).then(result => {
console.log(result);
if (result == null) {
emission_total += 0;
console.log(emission_total);
}
else {
emission_total += 0.7;
console.log(emission_total);
}
})
.catch(err => console.log(err));
}
return;
}
async function next() {
await getEmissionTotal();
const date = req.body.date;
const description = req.body.description;
const food_list = ingredients;
const photo = photo_link;
const nutrients = {
'calories': req.body.calories,
'fat': req.body.fat,
'sugar': req.body.sugar,
'carbs': req.body.carbs,
'cholesterol': req.body.cholesterol,
'protein': req.body.protein,
'emissions': emission_total
}
console.log(nutrients);
const newNutrition = new Nutrition({
date,
description,
photo,
nutrients,
food_list
});
console.log(newNutrition)
newNutrition.save()
.then(() => res.json('Nutrition added!'))
.catch(err => res.status(400).json('Error: ' + err));
}
next();
In essence, the next() function should be executed after the async function getEmissionTotal()
emission_total need to be brought inside getEmissionTotal method. Also you are returning nothing and not waiting for the result.
async function getEmissionTotal() {
let emission_total = 0;
for (var x in ingredients) {
try {
const result = await Emission.findOne({food_name:ingredients[x]});
console.log(result);
if (result == null) {
emission_total += 0;
console.log(emission_total);
} else {
emission_total += 0.7;
console.log(emission_total);
}
} catch(err) {
console.log(err);
}
}
return emission_total;
}
I'm trying to use await on var application = await SchedulerService().getIssues(issueId)
And it returns the error: SyntaxError: await is only valid in async function
I'm starting in node.js. What can I do to fix it?
I've tried already
Add async before initial function const SchedulerService = await function(){ at line 1
Add async on first return return async () => { where's return { at line 3
import schedulerConf from '../../config/scheduler';
import authConf from '../../config/auth';
import applicationConf from '../../config/application';
import request from 'request';
import schedule from 'node-schedule';
import dateformat from 'dateformat';
let interations = 0;
var serviceRecords = [];
var issueRecords = [];
const SchedulerService = function(){
return {
initialize: async () => {
console.log(`***** Starting Scheduler on ${dateformat(new Date(), "dd/mm/yyyy HH:MM:ss")}`);
var j = schedule.scheduleJob('*/1 * * * *', function(){
console.time('└─ Scheduler execution time');
if(interations === 0){
console.log(`Setting scheduler runtime to full time.`);
}else{
console.log(`------------------------`);
}
interations++;
console.log(`Job execution number: ${interations}.`);
SchedulerService().execute()
.then(response => {
console.log(`└─ Job ${interations} was successfully executed.`);
console.log(`└─ Execution date ${dateformat(new Date(), "dd/mm/yyyy HH:MM:ss")}`);
console.timeEnd('└─ Scheduler execution time');
}).catch(error => {
console.log(`└─ Job ${interations} returned error while executing.`);
});
});
},
execute: async () => {
return SchedulerService().getRecords2Sync()
.then(() => {
SchedulerService().sync().then(() => {
}).catch(error => {console.log({error})});
}).catch(error => {console.log({error})});
},
getRecords2Sync: async () => {
serviceRecords = [];
var options = {
url: `http://localhost:${authConf.serverPort}/application`,
method: 'GET',
headers: {
authorization: `${authConf.secret}`
}
}
return new Promise(function (resolve, reject) {
request(options, function (error, res, body) {
if (!error && res.statusCode == 200) {
const srs = JSON.parse(body);
const response = srs['response'];
for(let i =0;i < response.length;i++){
const { id, info } = response[i];
var status = "";
var issueId = "";
var statusClass = "";
for(let x = 0; x < info.length; x++){
if(info[x].key === "status"){
status = info[x].value;
statusClass = info[x].valueClass;
}
if(info[x].key === applicationConf.fields.issueId){
issueId = info[x].value;
}
}
if(statusClass === 0){
if(issueId !== null && issueId !== ""){
serviceRecords.push({id, status, issueId});
}
}
}
//console.log(serviceRecords);
resolve(serviceRecords);
} else {
//console.log(error);
reject(error);
}
});
});
},
getIssues : async (issueId) => {
issueRecords = [];
return new Promise(function(resolve, reject) {
var options = {
url: `http://localhost:${authConf.serverPort}/application2/${issueId}`,
method: 'GET',
headers: {
authorization: `${authConf.secret}`
}
}
request(options, function(error, res, body) {
if (!error && res.statusCode == 200) {
const issues = JSON.parse(body);
const { issue } = issues.response;
const { id, status, custom_fields } = issue;
issueRecords.push({id, status, custom_fields});
resolve(issueRecords);
} else {
reject(error);
}
});
});
},
sync : async () => {
return new Promise(function(resolve, reject) {
for (let i = 0; i < serviceRecords.length; i++) {
const application_id = serviceRecords[i].id;
const application_status = serviceRecords[i].status;
const application_issueId = serviceRecords[i].issueId;
//console.log('issueRecords.length: ', issueRecords);
//console.log('issueRecords.length: ', issueRecords.length);
console.log(`********** application`);
console.log(`└─ id ${application_id}`);
console.log(`└─ status ${application_status}`);
console.log(`└─ issueId ${application_issueId}`);
var application2 = await SchedulerService().getIssues(application_issueId)
.then(response => {
resolve(() => {
console.log(`i've found a record by issue_id ${application_issueId}`);
});
}).catch(error => {
reject(error);
});
}
});
}
}
}
export default new SchedulerService();
Thank you so much!
If you had getIssues resolve with issueId and issueRecords you might do something like this in sync:
sync: async () => {
// `map` over the serviceRecords and return a getIssues promise for each issueId
const promises = serviceRecords.map(({ issueId }) => SchedulerService().getIssues(issueId));
// Wait for all the promises to resolve
const data = await Promise.all(promises);
// Loop over the resolved promises and log the issueId
data.forEach((issueId, issueRecords) => console.log(issueId));
}