const { Builder, By, Key } = require('selenium-webdriver');
const fs = require('fs');
const categories = fs.readFileSync('./categories.txt').toString().split("\n");
const list = fs.readFileSync('./list.txt').toString().split("\n");
async function check(check, index) {
let driver = await new Builder()
.forBrowser('chrome')
.build();
await driver.manage().setTimeouts({
implicit: 3000,
pageLoad: 3000,
script: 3000
});
try {
console.log(`now list: ${list[index]}`)
await driver.get(`http://localhost/${check}`);
return new Promise(async (resolve, reject) => {
resolve(true)
})
} catch (err) {
console.log(err);
} finally {
await driver.quit();
}
}
(async () => {
var index = 0;
var i = 0;
while (true) {
if (index > list.length) {
index = 0;
}
var idx = i * 2;
var first = categories[idx];
var second = categories[idx + 1];
if (first !== undefined && second !== undefined) {
await Promise.all([check(first, index), check(second, index+1)])
} else {
if (first !== undefined) {
await check([first, index]);
} else {
console.log('end');
break
}
}
i += 2;
index += 2;
}
})();
I have 20,000 data, and I want to call a specific function every two in order.
But there seems to be something very wrong.
Want:
When 'selenium' runs on the 20th line,
It seems to run on the same 'port' under competition conditions.
I found this error. Error: ECONNREFUSED connect ECONNREFUSED 127.0.0.1:49809
I'm totally Newbie here. I need your help!
Related
I've created a script in node to scrape the links of different posts from a webpage. The script seems to be working in the right way. Now, I wish to capture the links of different posts from next pages also.
As I'm new to write code in node, I just don't find any idea how I can apply the logic of grabbing links from next pages within my current implementation.
const request = require('request');
const cheerio = require('cheerio');
const link = 'https://stackoverflow.com/questions/tagged/web-scraping';
const items = [];
let getLinks = () => {
return new Promise((resolve, reject) => {
request(link, function(error, response, html) {
let $ = cheerio.load(html);
if (error) return reject(error);
try {
$('.summary > h3 > a.question-hyperlink').each(function() {
items.push(base_link + $(this).attr("href"));
});
resolve(items);
} catch (e) {
reject(e);
}
});
});
};
getLinks().then(resultList => {
var i;
for (i = 0; i < resultList.length; i++) {
console.log(resultList[i]);
}
})
Something like this?
const request = require('request');
const cheerio = require('cheerio');
const base_url = 'https://stackoverflow.com';
let requestURL = 'https://stackoverflow.com/questions/tagged/web-scraping';
let pageLimit = 5;
(async function main() {
while (pageLimit-- && requestURL) {
console.log(`----- current: ${requestURL}, remains: ${pageLimit}`);
const result = await getLinks(requestURL);
for (const link of result.links) {
console.log(link);
}
requestURL = result.nextPageURL;
}
})().catch(console.error);
function getLinks(link) {
return new Promise((resolve, reject) => {
request(link, function(error, response, html) {
if (error) return reject(error);
let $ = cheerio.load(html);
const links = [];
try {
$('.summary > h3 > a.question-hyperlink').each(function() {
links.push(base_url + $(this).attr('href'));
});
const nextPageURL = base_url + $('a[rel="next"]').attr('href');
resolve({ links, nextPageURL });
} catch (e) {
reject(e);
}
});
});
};
I have a directory of the tax file of employees. Each file has a filename as employee code. I am reading each file and extract some components and save to an array of employee objects.
const readline = require('readline');
let empArr = [];
function readFiles(dirname) {
fs.readdir(dirname, async function (err,filenames) {
if(err) {
return err;
}
for await (file of filenames) {
const filePath = path.join(__dirname,directoryPath,file);
const readStream = fs.createReadStream(filePath);
const fileContent = readline.createInterface({
input: readStream
});
let employeeObj = {
empId : '',
TotalEarning:'',
ProfessionalTax:0,
GrossIncome:0,
isDone:false
};
fileContent.on('line', function(line) {
if(!employeeObj.empId && line.includes("Employee:")) {
const empId = line.replace('Employee: ','').split(" ")[0];
employeeObj.empId = empId;
}
else if(line.includes('Total Earnings')) {
const amount = line.replace(/[^0-9.]/g,'');
employeeObj.TotalEarning = amount;
}
else if(line.includes('Profession Tax')) {
const amount = line.split(" ").pop() || 0;
employeeObj.ProfessionalTax = amount;
}
else if(line.includes('Gross Income')) {
const amount = line.replace(/[^0-9.]/g,'');
employeeObj.GrossIncome = amount ||0;
}
else if(line.includes('finance department immediately')) {
employeeObj.isDone =true;
empArr.push(employeeObj);
}
});
fileContent.on('close', function() {
fileContent.close();
});
}
})
}
readFiles(directoryPath);
I am not able to get empArr. After getting the array, I need to save to excel. That part I will try after getting the array of employee objects.
I got it working after reading several articles on closure and promises. The below code works for me and sends me array of employees that are processed.
const directoryPath = './tax/';
function readFiles(dirname) {
fs.readdir(dirname, async function (err,filenames) {
if(err) {
return err;
}
let promiseArr = filenames.map( file=> {
return new Promise((resolve)=>{
processFile(file, resolve)
})
});
Promise.all(promiseArr).then((ret)=>console.log(ret));
})
}
function processFile(file, callback) {
const filePath = path.join(__dirname,directoryPath,file);
const readStream = fs.createReadStream(filePath);
const fileContent = readline.createInterface({
input: readStream
});
let employeeObj = {
empId : '',
TotalEarning:'',
ProfessionalTax:0,
GrossIncome:0,
isDone:false
};
fileContent.on('line', function(line) {
if(!employeeObj.empId && line.includes("Employee:")) {
const empId = line.replace('Employee: ','').split(" ")[0];
employeeObj.empId = empId;
}
else if(line.includes('Total Earnings')) {
const amount = line.replace(/[^0-9.]/g,'');
employeeObj.TotalEarning = amount;
}
else if(line.includes('Profession Tax')) {
const amount = line.split(" ").pop() || 0;
employeeObj.ProfessionalTax = amount;
}
else if(line.includes('Gross Income')) {
const amount = line.replace(/[^0-9.]/g,'');
employeeObj.GrossIncome = amount ||0;
}
else if(line.includes('finance department immediately')) {
employeeObj.isDone = true;
return callback(employeeObj);
}
});
fileContent.on('close', function() {
fileContent.close();
});
}
readFiles(directoryPath);
Surely, the code can be improved further.
I've been trying this for ages now and I'm not making any progress.
I found this on google https://gist.github.com/Elements-/cf063254730cd754599e
and it's running but when I put that in a function and try to use it with my code its not running.
Code:
fs.readdir(`${__dirname}/data`, (err, files) => {
if (err) return console.error(`[ERROR] ${err}`);
files.forEach(file => {
if (file.endsWith(".mp4")) {
// getVideoDuration(`${__dirname}/data/${file}`)
group = new Group(file.split(".")[0], file, null, getVideoDuration(`${__dirname}/data/${file}`), 0);
groups.push(group);
}
});
console.log(groups);
});
function getVideoDuration(video) {
var buff = new Buffer.alloc(100);
fs.open(video, 'r', function (err, fd) {
fs.read(fd, buff, 0, 100, 0, function (err, bytesRead, buffer) {
var start = buffer.indexOf(new Buffer.from('mvhd')) + 17;
var timeScale = buffer.readUInt32BE(start, 4);
var duration = buffer.readUInt32BE(start + 4, 4);
var movieLength = Math.floor(duration / timeScale);
console.log('time scale: ' + timeScale);
console.log('duration: ' + duration);
console.log('movie length: ' + movieLength + ' seconds');
return movieLength;
});
});
}
Output:
[
Group {
_name: 'vid',
_video: 'vid.mp4',
_master: null,
_maxTime: undefined,
_currentTime: 0
},
Group {
_name: 'vid2',
_video: 'vid2.mp4',
_master: null,
_maxTime: undefined,
_currentTime: 0
}
]
time scale: 153600
duration: 4636416
movie length: 30 seconds
time scale: 153600
duration: 4636416
movie length: 30 seconds
its logging the information correctly but is returning undefined
This seems like a lot of extra work for little benefit, so I'm going to refer to get-video-duration https://www.npmjs.com/package/get-video-duration which does a great job of getting durations of any video file in seconds minutes and hours
Copying the last comment of the gist you sent, I came up with this:
const fs = require("fs").promises;
class Group {
constructor(name, video, master, maxTime, currentTime) {
this._name = name;
this._video = video;
this._master = master;
this._maxTime = maxTime;
this._currentTime = currentTime;
}
setMaster(master) {
if (this._master != null) {
this._master.emit('master');
}
this._master = master;
this._master.emit('master');
}
};
const asyncForEach = async (array, callback) => {
for (let index = 0; index < array.length; index++) {
await callback(array[index], index, array);
}
};
async function loadGroups() {
const files = await fs.readdir(`${__dirname}/data`);
const groups = []
await asyncForEach(files, async file => {
if (file.endsWith(".mp4")) {
const duration = await getVideoDuration(`${__dirname}/data/${file}`);
const group = new Group(file.split(".")[0], file, null, duration, 0);
groups.push(group);
}
});
console.log(groups);
}
async function getVideoDuration(video) {
const buff = Buffer.alloc(100);
const header = Buffer.from("mvhd");
const file = await fs.open(video, "r");
const {
buffer
} = await file.read(buff, 0, 100, 0);
await file.close();
const start = buffer.indexOf(header) + 17;
const timeScale = buffer.readUInt32BE(start);
const duration = buffer.readUInt32BE(start + 4);
const audioLength = Math.floor((duration / timeScale) * 1000) / 1000;
return audioLength;
}
loadGroups();
As to why your original code wasn't working, my guess is that returning inside the callback to fs.open or fs.read doesn't return for getVideoDuration. I couldn't easily figure out a way from the fs docs to figure out how to return the value of the callback, so I just switched over to promises and async/await, which will essentially run the code synchronously. This way you can save the output of fs.open and fs.read and use them to return a value in the scope of getVideoDuration.
I've figured out a work-around for this problem.
async function test() {
const duration = await getDuration(`${__dirname}/data/vid.mp4`);
console.log(duration);
}
test();
function getDuration(file) {
return new Promise((resolve, reject) => {
exec(`ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 ${file}`, (err, stdout, stderr) => {
if (err) return console.error(err);
resolve(stdout ? stdout : stderr);
});
});
}
I only tested it on linux so I dont know if it'll work on windows
I'm attempting to make an API call using promises. The API is paginated and as such, depending on the headers in that first API call make more to get the rest of the results if need be.
Here's what I have so far:
const get = (url, pageNo) => {
var options = {
url: url,
headers: {
'Authorization': `Token token=${apiToken}`
},
json: true,
page: pageNo
};
return new Promise((resolve, reject) => {
request.get(options, (err, resp) => {
err ? reject(err) : resolve(resp);
})
});
};
Using get() to loop and get all responses:
const getAll = (plannerId, timestamp, range) => {
const plannerBookingsUrl = new URL(
`/api/planners/${plannerId}/bookings?since=${timestamp}&range=${range}`,
baseUrl
);
let response = get(plannerBookingsUrl, 1);
let bookings = [];
bookings.push(response);
response.then(results => {
let moreRequests = true;
let currentPage = 1;
const totalPages = parseInt(results.headers['x-total-pages']);
while (moreRequests) {
if (currentPage < totalPages) {
nextBatch = get(plannerBookingsUrl, currentPage + 1);
bookings.push(nextBatch);
currentPage++;
} else {
moreRequests = false;
}
}
});
return Promise.all(bookings);
};
Main() where I'm using getAll(...):
const main = () => {
const response = getAll(
'11716',
'2020-02-27',
'7'
);
response.then(results => {
console.log(results);
.catch(error => console.log(error))
};
main();
This returns the initial promise but not the remaining promises.
What I'm really have a problem with is reading the first API, making the remainder and returning them all together to be using in my main function.
Any help would be much appreciated!
Thanks.
You could put all your fetching logic inside the while loop. The way you get your bookings is the same, except for the first time where you need to get a little more information on the amount of pages.
Accomplish this by making your function async and check the first time of the loop if the totalPages value is already known. If it's not, await the response and get the info from the headers, and otherwise just push the response to the bookings array.
const getAll = async (plannerId, timestamp, range) => {
const plannerBookingsUrl = new URL(
`/api/planners/${plannerId}/bookings?since=${timestamp}&range=${range}`,
baseUrl
);
let bookings = [];
let currentPage = 1;
let totalPages = null;
while (totalPages === null || currentPage < totalPages) {
let response = get(plannerBookingsUrl, currentPage);
if (totalPages === null) {
let results = await response;
totalPages = parseInt(results.headers['x-total-pages']);
}
bookings.push(response);
currentPage++;
}
return Promise.all(bookings);
};
The problem is that you are returning Promise.all(bookings) outside response.then callback, so at this point bookings contains only the first call get(plannerBookingsUrl, 1).
Here is a possible solution using async:
const getAll = async (plannerId, timestamp, range) => {
const plannerBookingsUrl = new URL(
`/api/planners/${plannerId}/bookings?since=${timestamp}&range=${range}`,
baseUrl
);
let response = get(plannerBookingsUrl, 1);
let bookings = [];
bookings.push(response);
const results = await response; // wait for results here
let moreRequests = true;
let currentPage = 1;
const totalPages = parseInt(results.headers['x-total-pages']);
while (moreRequests) {
if (currentPage < totalPages) {
nextBatch = get(plannerBookingsUrl, currentPage + 1);
bookings.push(nextBatch);
currentPage++;
} else {
moreRequests = false;
}
}
return Promise.all(bookings); // bookings now contains every next batch
};
adapt on main() function:
const main = async () => {
const results = await getAll(
'11716',
'2020-02-27',
'7'
);
...
};
main();
I am trying to read some data from a file and store it in a database.
This is part of a larger transaction and I need the returned ids for further steps.
async parseHeaders(mysqlCon, ghID, csv) {
var self = this;
var hIDs = [];
var skip = true;
var idx = 0;
console.log("Parsing headers");
return new Promise(async function(resolve, reject) {
try {
var lineReader = require('readline').createInterface({
input: require('fs').createReadStream(csv)
});
await lineReader.on('close', async function () {
console.log("done: ", JSON.stringify(hIDs));
resolve(hIDs);
});
await lineReader.on('line', async function (line) {
line = line.replace(/\"/g, '');
if (line.startsWith("Variable")) { //Variable,Statistics,Category,Control
console.log("found variables");
skip = false; //Ignore all data and skip to the parameter description.
return; //Skip also the header line.
}
if (!skip) {
var data = line.split(",");
if (data.length < 2) { //Variable section done return results.
console.log("Found sub?",line);
return lineReader.close();
}
var v = data[0];
var bidx = data[0].indexOf(" [");
if (bidx > 0)
v = data[0].substring(0, bidx); //[] are disturbing mysql (E.g.; Air temperature [�C])
var c = data[2];
hIDs[idx++] = await self.getParamID(mysqlCon, ghID, v, c, data);//, function(hID,sidx) { //add data in case the parameter is not in DB, yet.
}
});
} catch(e) {
console.log(JSON.stringify(e));
reject("some error occured: " + e);
}
});
}
async getParamID(mysqlCon,ghID,variable,category,data) {
return new Promise(function(resolve, reject) {
var sql = "SELECT ID FROM Parameter WHERE GreenHouseID="+ghID+" AND Variable = '" + variable + "' AND Category='" + category + "'";
mysqlCon.query(sql, function (err, result, fields) {
if(result.length === 0 || err) { //apparently not in DB, yet ... add it (Acronym and Machine need to be set manually).
sql = "INSERT INTO Parameter (GreenHouseID,Variable,Category,Control) VALUES ("+ghID+",'"+variable+"','"+category+"','"+data[3]+"')";
mysqlCon.query(sql, function (err, result) {
if(err) {
console.log(result,err,this.sql);
reject(err);
} else {
console.log("Inserting ",variable," into DB: ",JSON.stringify(result));
resolve(result.insertId); //added, return generated ID.
}
});
} else {
resolve(result[0].ID); //found in DB .. return ID.
}
});
});
}
The functions above are in the base class and called by the following code:
let headerIDs = await self.parseHeaders(mysqlCon, ghID, filePath);
console.log("headers:",JSON.stringify(headerIDs));
The sequence of events is that everything in parseHeaders completes except for the call to self.getParamID and control returns to the calling function which prints an empty array for headerIDs.
The console.log statements in self.getParamID are then printed afterward.
What am I missing?
Thank you
As you want to execute an asynchronous action for every line we could define a handler to do right that:
const once = (target, evt) => new Promise(res => target.on(evt, res));
function mapLines(reader, action) {
const results = [];
let index = 0;
reader.on("line", line => results.push(action(line, index++)));
return once(reader, "close").then(() => Promise.all(results));
}
So now you can solve that easily:
let skip = false;
const hIDs = [];
await mapLines(lineReader, async function (line, idx) {
line = line.replace(/\"/g, '');
if (line.startsWith("Variable")) { //Variable,Statistics,Category,Control
console.log("found variables");
skip = false; //Ignore all data and skip to the parameter description.
return; //Skip also the header line.
}
if (!skip) {
var data = line.split(",");
if (data.length < 2) { //Variable section done return results.
console.log("Found sub?",line);
return lineReader.close();
}
var v = data[0];
var bidx = data[0].indexOf(" [");
if (bidx > 0)
v = data[0].substring(0, bidx); //[] are disturbing mysql (E.g.; Air temperature [�C])
var c = data[2];
hIDs[idx] = await self.getParamID(mysqlCon, ghID, v, c, data);
}
});