How to write a file using fs.createWriteStream - javascript

I am trying to build a web scraper that downloads all the PDFs on a website. I've written all the logic necessary to do this, but for some reason it downloads an empty PDF file, which is not supposed to happen. The problem seems to come from the downloadFiles function: when I try to pipe the data, it doesn't seem to work, because I get an empty PDF file after the function has run. I would appreciate it if someone could help me out with this problem, thanks.
Here's a sample of my code:
app.js
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const axiosInstance = require("./getAxios");
const axios = axiosInstance();
const Surl = "https://www.health.gov.ng/";
// linkList sample: "https://www.health.gov.ng/index.php?option=com_content&view=article&id=143&Itemid=512";
let connectionFailCount = 0;
let linkList = [];
let dlinkList = [];
const getWebsiteLinks = async (Surl) => {
try {
console.log(`Crawling all links from: ${Surl}`);
const response = await axios.get(Surl);
const $ = cheerio.load(response.data);
const ranges = $("a").each(function (idx, el) {
if ($(el).attr("href")) {
return $(el).attr("href");
}
});
for (let index = 0; index < ranges.length; index++) {
let raw_links = $("a")[index].attribs.href;
if (raw_links.startsWith("/")) {
linkList.push(Surl + raw_links);
}
}
if (linkList.length > 0) {
console.log(`Finished crawling links: Found ${linkList.length} links`);
console.log(
"--------------------------------------------------------\n\n"
);
}
return;
} catch (error) {
if (connectionFailCount === 0) {
connectionFailCount += 1;
getWebsiteLinks(Surl);
console.log(`Connection error. \n
Reconnecting to server....`);
} else if (connectionFailCount === 5) {
console.error(`Can not connect to server. Try again later.`);
}
}
};
const downloadLinks = async (linkList) => {
try {
console.log("Crawling links to find pdf links. this may take a while...");
for (const link of linkList) {
const response = await axios.get(link);
// Skip where there's delayed server response
if (response.code === "ECONNRESET") continue;
const $ = cheerio.load(response.data);
$("a").each(function (idx, el) {
if ($(el)?.attr("href")?.endsWith(".pdf")) {
let addr = $(el).attr("href");
let dlink = Surl + addr;
dlinkList.push({
pathName: addr,
url: dlink,
});
}
});
}
console.log(dlinkList);
if (dlinkList.length > 0) {
console.log(`Crawling Finish: Found ${dlinkList.length} pdf links`);
console.log(
"--------------------------------------------------------\n\n"
);
}
} catch (error) {
if (connectionFailCount === 0) {
connectionFailCount += 1;
console.log(`Connection error. \n
Reconnecting to server: ${connectionFailCount} count`);
downloadLinks(linkList);
}
if (connectionFailCount === 3) {
console.error(`Can not connect to server. Try again later.`);
return;
}
// console.error("downloadLinksError: ", error);
}
};
const downloadFiles = async (dlinkList) => {
console.log("Creating directory to save PDF files");
const appRoot = path.dirname(path.resolve(__dirname));
// Had to change and restructure code due to error
const folderName = `PDF/${Surl.split("/").pop()}`;
const subFolderName = Surl.split("/").pop();
try {
if (!fs.existsSync(path.join(appRoot, folderName))) {
fs.mkdirSync(path.join(appRoot, "PDF"));
fs.mkdirSync(path.join(`${appRoot}/PDF`, subFolderName));
}
dlinkList.forEach(async (link) => {
let name = link.pathName;
let url = link.url;
let file = fs
.createWriteStream(
`${appRoot}/${folderName}/${name.split("/").pop()}`,
"utf-8"
)
.on("error", (err) => {
console.error("createWriteStreamError: ", err);
});
try {
console.log("Downloading PDF file...");
const { data } = await axios({
url,
method: "GET",
responseType: "stream",
});
if (data) {
console.log("PDF file Downloaded");
data.pipe(file);
}
} catch (error) {
console.error(error);
}
});
return;
} catch (error) {
console.error("downloadFilesError: ", error);
}
};
(async () => {
await getWebsiteLinks(Surl);
await downloadLinks(linkList);
await downloadFiles(dlinkList);
})();
getAxios.js
const axios = require("axios");
const https = require("https");
module.exports = function () {
const domain = "https://www.health.gov.ng/";
let instance;
if (!instance) {
//create axios instance
instance = axios.create({
baseURL: domain,
timeout: 60000, // Increase the timeout in case of network delay or a slow server response
maxContentLength: 500 * 1000 * 1000, // Increase the maximum response data length
httpsAgent: new https.Agent({ keepAlive: true }),
headers: { "Content-Type": "application/xml" },
});
}
return instance;
};
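One likely cause, judging from the code above: downloadFiles opens the write stream with a "utf-8" encoding even though PDF data is binary, and the async forEach callback never waits for the pipe to finish, so nothing guarantees the bytes are flushed before the program moves on. A minimal sketch of a version that waits for each file, assuming the same appRoot, folderName and axios instance as in app.js (pipeline from "stream/promises" requires Node 15+):
const { pipeline } = require("stream/promises");

const saveAllPdfs = async (dlinkList) => {
  for (const { pathName, url } of dlinkList) {
    const dest = `${appRoot}/${folderName}/${pathName.split("/").pop()}`;
    // No encoding argument: PDFs are binary data.
    const file = fs.createWriteStream(dest);
    const { data } = await axios({ url, method: "GET", responseType: "stream" });
    // pipeline resolves only after the response has been fully written to disk.
    await pipeline(data, file);
    console.log(`Saved ${dest}`);
  }
};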

Related

Save to file results in async function

I have a simple async function where I "scrape" sites from URLs.
Everything works fine, but now I want to save the results into a txt file.
I tried a simple array that I push every result (and every error) into;
now my problem is where to do the write to file.
I tried putting it into a separate function and awaiting it inside my async function, but the function that writes to the file always fires first.
Here is the full code:
const https = require("https");
const fs = require("fs");
const readline = require("readline");
const path = require("path");
let urls = [];
let results = [];
(async function readUrls() {
const fileStream = fs.createReadStream("urls.txt");
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
});
for await (let line of rl) {
urls.push(line);
}
for await (let url of urls) {
https
.get(url, (res) => {
const { statusCode } = res;
const contentType = res.headers["content-type"];
let error;
if (statusCode !== 200) {
error = new Error("Request Failed.\n" + `Status Code: ${statusCode}`);
}
if (error) {
const firstPath = url.split("/")[7];
//there is array
results.push(firstPath);
//--------------
console.error("data : " + firstPath + " - " + " nothing found");
res.resume();
return;
}
res.setEncoding("utf8");
let rawData = "";
res.on("data", (chunk) => {
rawData += chunk;
});
(async () => {
await res.on("end", () => {
try {
const parsedData = JSON.parse(rawData);
const parsedResult = parsedData["data"]["id"] + " - " + parsedData["data"]["price"];
//there is array
results.push(parsedResult);
//--------------
console.log("data : " + parsedData["data"]["id"] + " - " + parsedData["data"]["price"]);
} catch (e) {
console.error(e.message);
}
});
})();
})
.on("error", (e) => {
console.error(`Got error: ${e.message}`);
});
}
})();
Here is my simple function to write to the file:
fs.writeFile('result.txt', results, +(new Date()), function (err) {
if (err) {
console.log("Error occurred", err);
}
console.log("File write successfull");
});
I tried something like this:
async function secondFunction(){
await firstFunction();
// wait for firstFunction...
};
What do I want to achieve? I want to scrape every URL from my text file and get the ID and price
(the endpoint returns plain JSON, not HTML, and it works in the browser).
At the end I want to save everything into a text file.
I made a version of your code that uses node-fetch to call the URLs. I prefer it because it is similar to the fetch API you can use in the browser.
To use it you should install it:
npm install node-fetch
const fetch = require("node-fetch"); // I prefer to use node-fetch for my calls
const fs = require("fs");
const readline = require("readline");
const path = require("path");
let urls = [];
let results = [];
(async function readUrls() {
const fileStream = fs.createReadStream("urls.txt");
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
});
for await (let line of rl) {
urls.push(line);
}
// Make the calls one after the other
for (let url of urls) {
try {
// We can call the urls with node-fetch and await the response
const res = await fetch(url);
const { status } = res;
let error;
if (status !== 200)
error = new Error("Request Failed.\n" + `Status Code: ${status}`);
if (error) {
const firstPath = url.split('/')[7];
results.push(firstPath);
console.error("data : " + firstPath + " - " + " nothing found");
// As we are inside a loop here, we use continue instead of return
continue;
}
try {
// Here we try to take the response as json
const parsedData = await res.json();
const parsedResult = parsedData["data"]["id"] + " - " + parsedData["data"]["price"];
//there is array
results.push(parsedResult);
//--------------
console.log(`Data: ${parsedResult}`);
} catch (e) {
// In case we can't get the response as json we log the error
console.error(e.message);
}
} catch (httpError) {
//This is for when the call to fetch fails for some reason
console.error(httpError.message);
}
}
// Here we join the results to a string so that we can save it properly to the file
const resultAsText = results.join("\n");
// Then after all the urls are processed we can write them to a file
fs.writeFile('result.txt', resultAsText, 'utf8', function (err) {
if (err) {
console.log("Error occurred", err);
} else {
console.log("File write successfull");
}
});
})();
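If you prefer to keep the whole flow await-based, the final write can use fs.promises instead of the callback form (a small variation at the end of the same async IIFE, assuming Node 10+ where fs.promises is available):
try {
  // Await the write directly instead of passing a callback.
  await fs.promises.writeFile("result.txt", resultAsText, "utf8");
  console.log("File write successful");
} catch (err) {
  console.log("Error occurred", err);
}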

How to run a function which calls axios every 30 seconds

I'm creating a web scraper using Node, cheerio, and calling the website using axios (async/await). I want the function to run every 30 seconds. I tried using setTimeout and setInterval but did not get the expected result; instead I got a heap out of memory error. I want to run the mvcAppointmentSearch function in the while loop every 30 seconds. The code follows; I'm also attaching the CodePen link for better readability.
Code pen link
const express = require('express');
const request = require('request-promise');
const cheerio = require('cheerio');
const axios = require('axios');
const cssSelect = require('css-select');
const open = require('open');
// const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/17/';
const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/14/';
const mvcLocation = ['Edison', 'Rahway', 'SouthPlainfield'];
// const mvcLocationNumber = ['240', '252', '239'];
const mvcLocationNumber = ['163'];
const requiredMonths = ['September', 'October'];
const callUrl = async (url, locationNumberIndex) => {
try {
const response = await axios.get(url);
//console.log('call url', response.data);
getData(response.data, locationNumberIndex);
} catch (err) {
console.log(err);
}
};
const mvcAppointmentSearch = () => {
for (let i = 0; i < mvcLocationNumber.length; i++) {
const currentUrl = mvcUrl + mvcLocationNumber[i];
console.log(mvcLocationNumber[i]);
callUrl(currentUrl, i);
}
};
const getData = (html, locationNumberIndex) => {
let data = [];
let $ = cheerio.load(html);
console.log('datais ', $);
$.prototype.exists = function (selector) {
return this.find(selector).length > 0;
};
const checkerLength = $('div').exists('.alert-danger');
console.log(checkerLength);
if (checkerLength) {
console.log(
`No appointment available in ${mvcLocation[locationNumberIndex]}`
);
} else {
const dateString = $('.control-label').text();
const availableMonth = dateString.trim().split(' ')[7];
const exactDateAvailability = dateString.slice(24, -1);
console.log(availableMonth);
if (requiredMonths.includes(availableMonth)) {
console.log('Hurray there is an appointment available');
const message = `Appointment available for the location ${mvcLocation[locationNumberIndex]} on ${exactDateAvailability}`;
open(`${mvcUrl + mvcLocationNumber[locationNumberIndex]}`);
console.log(message);
} else {
console.log('required Month is not available still searching');
}
}
};
while (true) {
try {
// mvcAppointmentSearch();
// want to run the following function for every 30 seconds.
mvcAppointmentSearch();
} catch (err) {
console.log(`Error has Occured ${err}`);
}
}
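The while (true) loop never yields to the event loop, which is most likely what exhausts the heap: it keeps firing requests without ever pausing. A minimal sketch of one common alternative, re-scheduling the search with setTimeout after each run (assuming mvcAppointmentSearch can simply be called again each time):
const THIRTY_SECONDS = 30 * 1000;

const searchLoop = () => {
  try {
    mvcAppointmentSearch();
  } catch (err) {
    console.log(`Error has occurred ${err}`);
  }
  // Schedule the next run; nothing blocks the event loop in between.
  setTimeout(searchLoop, THIRTY_SECONDS);
};

searchLoop();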

JS - StreamSaver download does not start

I download data from an API in chunks, decrypt it, and then pass it to a ReadableStream.
But after the last chunk, the file is not downloaded.
I am working with axios and StreamSaver.js.
Code:
Above in the code I declare:
this.filestream = streamSaver.createWriteStream('sample.jpg');
this.writer = await this.filestream.getWriter();
let readableStream;
readableStream = new ReadableStream({
start(ctrl) {
const nextChunk = async () => {
let fileDataResponse = await that.$api.post(
'endpoint', {
file_id: UUID,
chunk_index: index
}, {
headers: {
...
}
}
);
done = fileDataResponse.data.length <= fileDataResponse.data.current_index;
if (fileDataResponse.data.data) {
let data = await that.decryptData(fileDataResponse.data.data);
ctrl.enqueue(data);
}
if (!done) {
index += 1;
nextChunk();
} else {
ctrl.close();
}
};
nextChunk();
}
});
const reader = readableStream.getReader();
const close = () => {
that.writer.close();
};
const pump = () =>
reader.read().then((res) => {
if (!res.done) {
that.writer.write(res.value).then(pump);
} else {
close();
}
});
pump();
Where could my error be here?
Thanks a lot!
The issue was that res.value was not an Int8Array.
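If the decrypted chunk is not already a typed array, one hedged fix is to wrap it before enqueueing it, so the writer always receives binary data (a sketch, assuming decryptData returns something a typed-array view can be built from, such as an ArrayBuffer or an array of bytes):
if (fileDataResponse.data.data) {
  let data = await that.decryptData(fileDataResponse.data.data);
  // Wrap the decrypted chunk so the StreamSaver writer gets a typed array
  // rather than, for example, a plain string or a bare ArrayBuffer.
  ctrl.enqueue(new Uint8Array(data));
}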

How to return REST API response after utility execution is finished in expressJs

I have written a POST endpoint in Express with Node. When I make a call to the API, it runs a utility with setInterval(), and I want to send the API response after the utility executes clearInterval().
How can I wait and send the response after the utility execution is finished?
Please see the code below.
REST API code:
const router= express.Router();
const multer= require('multer');
const {readCSVFile}= require('../util/index');
var storage = multer.diskStorage({
destination: (req, file, cb) => {
cb(null, 'uploads');
},
filename: (req, file, cb) => {
cb(null, file.fieldname + '-' + Date.now()+'.xlsx');
}
});
var upload = multer({storage: storage});
router.post('/fileUpload', upload.single('filename'), async (req, res) => {
readCSVFile();
res.status(201).json({id:1});
});
router.get('/',(req,res)=>{
res.sendFile(__dirname+'/index.html');
});
module.exports=router;
Utilty Code
const config = require('config')
const excelToJson = require('convert-excel-to-json')
const HttpsProxyAgent = require('https-proxy-agent')
const AWS = require('aws-sdk')
const json2xls = require('json2xls')
const fs = require('fs')
const awsConfig = {
httpOptions: {
agent: new HttpsProxyAgent(
config.get('aws.proxy')
),
}
}
AWS.config.credentials = new AWS.SharedIniFileCredentials({
profile: config.get('aws.profile'),
})
AWS.config.update(awsConfig)
let uuidv4 = require('uuid/v4')
let csv = [];
const lexRunTime = new AWS.LexRuntime({
region: config.get('aws.region'),
})
let refreshId
const readCSVFile = () => {
const csvSheet = excelToJson({
sourceFile: './Test.xlsx',
})
csvSheet.Sheet1.forEach(element => {
csv.push((element.A.slice(0, element.A.length)))
})
runTask()
refreshId = setInterval(runTask, 1000)
}
let botParams = {
botAlias: config.get('bot.alias'),
botName: config.get('bot.name'),
sessionAttributes: {},
}
const missedUtterancesArray = []
const matchedUtterancesArray = []
let start = 0
let end = 50
let count = 50
const runTask = () => {
let itemsProcessed = 0
console.log('executing...')
const arrayChunks = csv.slice(start, end)
arrayChunks.forEach((element) => {
botParams.inputText = element
botParams.userId = `${uuidv4()}`
lexRunTime.postText(botParams, function (err, data) {
itemsProcessed++
if (err) console.log(err, err.stack)
else {
if (data.intentName === null) {
missedUtterancesArray.push({
Utterance: element,
})
}
else{
matchedUtterancesArray.push({
Utterance: element,
})
}
}
if (itemsProcessed === arrayChunks.length) {
start = csv.indexOf(csv[end])
end = start + count
}
if (start === -1) {
let xls = json2xls(missedUtterancesArray)
fs.writeFileSync('./MissedUtterances.xlsx', xls, 'binary')
let matchedXls = json2xls(matchedUtterancesArray)
fs.writeFileSync('./MatchedUtterances.xlsx', matchedXls, 'binary')
console.log('File saved successfully!! ')
console.log('Total Matched utterances count: ',csv.length-missedUtterancesArray.length)
console.log('Total Missed utterances count: ',missedUtterancesArray.length)
console.log('Total Utterances count: ',csv.length)
clearInterval(refreshId)
}
})
})
}
I would have needed a bit more information to answer this, but pardon my attempt if it does not work.
The setInterval call in readCSVFile is the reason: being asynchronous, it does not stop code progression.
lexRunTime.postText also looks asynchronous. I think you'd be better off using promises, so that you respond to the client only once the work is done.
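A minimal sketch of that promise-based idea, assuming readCSVFile can be given a completion callback that is invoked right where clearInterval(refreshId) already happens in runTask:
// util/index.js (sketch): accept a callback and call it when the work is done.
const readCSVFile = (onDone) => {
  // ...existing code unchanged...
  // inside runTask, immediately after clearInterval(refreshId):
  //   onDone()
}

// routes (sketch): wrap the callback in a promise and await it before replying.
router.post('/fileUpload', upload.single('filename'), async (req, res) => {
  try {
    await new Promise((resolve) => readCSVFile(resolve));
    res.status(201).json({ id: 1 });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});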

How can I access the response headers of a request that is piped to a feedparser

I am trying to parse an RSS feed using the request and feedparser-promised libraries. I am able to parse the feed using the code below.
import Bottleneck from 'bottleneck';
const feedparser = require('feedparser-promised');
const limiter = new Bottleneck({
maxConcurrent: 1,
minTime: 333,
});
const httpOptions = {
uri: val.sourcefeedurl,
resolveWithFullResponse: true,
method: 'GET',
pool: false,
headers: {
'If-None-Match': val.etag,
'If-Modified-Since': val.LastModified,
Connection: 'keep-alive',
ciphers: 'DES-CBC3-SHA',
},
};
const response = await limiter.schedule(() => feedparser.parse(httpOptions));
But since I use the feedparser-promised library, I am not able to cache the ETag and Last-Modified values from the response headers.
I tried modifying feedparser-promised like this:
'use strict';
const request = require('request');
const feedParser = require('./feedParser');
const parse = (requestOptions, feedparserOptions) => {
const metaData = {};
return new Promise((resolve, reject) => {
request.get(requestOptions).on('error', reject).on('response', async resp => {
if (resp.statusCode === 304) {
reject('Source not modified');
} else if (resp.statusCode === 200) {
metaData.etagin = await resp.headers.etag;
metaData.LastModifiedin = await resp.headers['last-modified'];
metaData.LastModifiedLocal = await resp.headers['last-modified'];
// console.log(metaData);
}
}).pipe(feedParser(feedparserOptions).on('error', reject).on('response', resolve));
});
};
module.exports = {
parse
};
Below is the feedParser file
'use strict';
const FeedParserStream = require('feedparser');
module.exports = (feedparserOptions, metaData) => {
// console.log(metaData, 'herre');
const parsedItems = [];
const feedparser = new FeedParserStream(feedparserOptions);
// console.log(feedparser);
feedparser.on('readable', () => {
// console.log(resp);
let item;
while (item = feedparser.read()) {
parsedItems.push(item);
}
return parsedItems;
}).on('end', function next() {
this.emit('response', parsedItems);
});
return feedparser;
};
So my question is: how do I return the response headers along with the parsedItems (as in the code) while resolving the promise?
Help is very much appreciated.
Pass the metaData on 'end', like this:
'use strict';
const FeedParserStream = require('feedparser');
module.exports = (feedparserOptions, metaData) => {
// console.log(metaData, 'herre');
const parsedItems = [];
const feedparser = new FeedParserStream(feedparserOptions);
// console.log(feedparser);
feedparser.on('readable', () => {
// console.log(resp);
let item;
while (item = feedparser.read()) {
parsedItems.push(item);
}
return parsedItems;
}).on('end', function next() {
this.emit('response', { parsedItems, metaData });
});
return feedparser;
};
and your feed-parser promised as
'use strict';
const request = require('request');
const feedParser = require('./feedParser');
const parse = (requestOptions, feedparserOptions) => {
const metaData = {};
return new Promise((resolve, reject) => {
request.get(requestOptions).on('error', reject).on('response', async resp => {
if (resp.statusCode === 304) {
reject('Source not modified');
} else if (resp.statusCode === 200) {
metaData.etagin = await resp.headers.etag;
metaData.LastModifiedin = await resp.headers['last-modified'];
metaData.LastModifiedLocal = await resp.headers['last-modified'];
// console.log(metaData);
}
}).pipe(feedParser(feedparserOptions, metaData).on('error', reject).on('response', resolve));
});
};
module.exports = {
parse
};
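With those two changes the promise resolves with both the items and the headers, so the calling code from the first snippet could look roughly like this (a sketch, inside an async function, reusing the limiter and httpOptions from the question):
const { parsedItems, metaData } = await limiter.schedule(() => parse(httpOptions));
// metaData.etagin and metaData.LastModifiedin can now be stored and sent back
// as If-None-Match / If-Modified-Since on the next poll of this feed.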
