I have the following code.
This code opens a browser, and I need to retrieve specific items with JavaScript.
// HTTP framework, filesystem, and scraping dependencies.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Module-level accumulator for scraped results (shared across requests).
var arr2 = [];
const puppeteer = require("puppeteer");
var mysql = require('mysql');
var cors = require('cors');
// MySQL connection for the todo-app database.
// NOTE(review): root with an empty password — presumably a local dev
// database; confirm before deploying anywhere shared.
var conn = mysql.createConnection({
host : 'localhost',
user : 'root',
password : '',
database : 'todoapp'
});
// use it before all route definitions
app.use(cors({origin: 'http://localhost:4200'}));
// GET /retreiveAllCountries?website=<url>
// Opens the requested page in headless Chromium.
// Fixes vs. original: the handler never sent a response (clients hung),
// the browser was never closed (one leaked Chromium per request), and
// navigation errors were unhandled.
app.get('/retreiveAllCountries', function (req, response) {
  (async () => {
    var queryParam = req.query;
    var url = queryParam.website;
    var array = [];
    //Retreive URLS
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: "load" });
      // Tell the caller the page loaded; scraping results can be added here.
      response.json({ url: url, status: 'loaded' });
    } catch (err) {
      response.status(500).send('Failed to load ' + url + ': ' + err.message);
    } finally {
      // Always release the Chromium process, even on error.
      await browser.close();
    }
  })();
})
// Start the server on a numeric port and include the port number in the
// log line (the original printed "on port " with no number, and passed
// the port as a string).
var PORT = 3006;
app.listen(PORT, function () {
  console.log('Web Scrape happens on port ' + PORT);
});
exports = module.exports = app;
This is the JavaScript code that returns a list:
// Log the text content of every matching tournament-menu list.
const listItems1 = document.querySelectorAll('ul.menu.country-list.tournament-menu');
listItems1.forEach((item) => {
  console.log(item.textContent);
});
I need to know how to execute this JavaScript code inside Puppeteer.
To run the code in the Puppeteer browser, you need to use the evaluate function.
// Run the selector inside the page context; evaluate() serialises the
// returned array of strings back to the Node process.
const result = await page.evaluate(() => {
  const nodes = document.querySelectorAll(
    "ul.menu.country-list.tournament-menu"
  );
  return Array.from(nodes, (node) => node.textContent);
});
Related
I am working on a Node script where I open and close the browser using a child process. It opens the browser, but it is not able to close it when I press Ctrl + C.
Here's my code:
const { kill } = require('process');
// Platform-specific shell command that opens a URL in the default browser.
var start = (process.platform == 'darwin'? 'open': process.platform == 'win32'? 'start': 'xdg-open');
//open browser
var url = 'https://www.google.com';
var cmd = start + ' ' + url;
var child = require('child_process').exec(cmd, function(error, stdout, stderr) {
  if (error) {
    console.log('exec error: ' + error);
  }
});
process.on('SIGINT', function() {
  // NOTE(review): child.pid is the pid of the shell that *launched* the
  // browser, not the browser itself — the browser may outlive this kill.
  // (A broken `fs.writeFile(./scraper.js)` call was removed here: the
  // path was unquoted — a syntax error — and `fs` was never required.)
  kill(child.pid, 'SIGINT');
  process.exit();
});
scraper.js
const cheerio = require('cheerio');
const request = require('request-promise');
const fs = require('fs');
// IMDb Top 250 chart.
const link = "https://www.imdb.com/chart/top/?ref_=nv_mv_250";

// Fetches the chart, parses one movie record per table row, and writes
// the list to data.json.
// Fixes vs. original: fs.writeFile without a callback throws in modern
// Node — use the promise API and await it; the async IIFE now has a
// rejection handler instead of producing an unhandled rejection.
(async () => {
  const html = await request(link);
  const $ = cheerio.load(html);
  const table = $('.lister-list');
  const rows = table.find('tr');
  const data = [];
  for (let i = 0; i < rows.length; i++) {
    const row = rows.eq(i);
    const cols = row.find('td');
    // Column layout: cols[1] = title/year cell, cols[2] = rating, cols[4] = votes.
    data.push({
      rank: i + 1,
      title: cols.eq(1).find('a').text().trim(),
      year: cols.eq(1).find('span').text().trim(),
      rating: cols.eq(2).text().trim(),
      votes: cols.eq(4).text(),
      image: cols.eq(1).find('img').attr('src')
    });
  }
  console.log(data);
  await fs.promises.writeFile('./data.json', JSON.stringify(data));
})().catch((err) => {
  console.error('Scrape failed:', err);
});
I am working on a Node.js app where I am using socket.io to send data to multiple clients, but the socket is only able to send data to one client: if I open my webpage in two tabs, it does not work in both tabs, yet with just one tab open it transmits the data fine. I don't know why. Can someone help? Here's my code:
try to use
// Hard-abort the process after five seconds.
setTimeout(() => process.abort(), 5000);
or try
// Send the default termination signal (SIGTERM) to this process after five seconds.
setTimeout(() => process.kill(process.pid), 5000);
I am trying to web scrape with Puppeteer. Currently the following code works, but I think optimizations can be made — one of which, I think, is to use only one Puppeteer instance. But I don't really know how to do that. Can anyone help me?
This is the working but slow original:
const puppeteer = require('puppeteer');
// Launches a fresh browser per URL, reads one element's text via XPath,
// and returns it.
// NOTE(review): browser.close() is not awaited, so the process may exit
// with Chromium still shutting down; 'xpath of element' is a question
// placeholder, not a real XPath.
async function scrapeProduct(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
browser.close();
return rawTxt;
}
// Drives scrapeProduct sequentially over some_array.
// NOTE(review): `the length of some array` and `some_array` are
// placeholders from the question — this snippet is not runnable as-is.
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url));
}
}
something();
This is my pathetic attempt at not using multiple instances of puppeteer: (Does not work)
const puppeteer = require('puppeteer');
// Same as scrapeProduct above, but the caller owns the browser/page and
// passes the page in, so a single Chromium instance is reused.
// NOTE(review): 'xpath of element' is a question placeholder.
async function scrapeProduct(url, page) {
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
return rawTxt;
}
// Drives scrapeProduct sequentially with one shared browser and page.
// NOTE(review): `the length of some array` and `some_array` are
// placeholders from the question; browser.close() is not awaited.
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
const browser = await puppeteer.launch();
const page = await browser.newPage();
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url, page));
}
browser.close();
}
something();
Hey guys, this is a follow-on from my other question. I have created a Node.js scraper that doesn't seem to want to go through the pages — it stays on the first. My source code is below.
// HTTP clients, CSV writer, and HTML parser for the scraper below.
const rp = require('request-promise');
const request = require('request');
const otcsv = require('objects-to-csv');
const cheerio = require('cheerio');
//URL To scrape
// NOTE(review): 'xxx'/'xxxx' are redacted placeholders from the
// question — substitute the real site and search path before running.
const baseURL = 'xxx';
const searchURL = 'xxxx';
//scrape info
const getCompanies = async () => {
// Pagination test
for (let index = 1; index <= 20; index = index + 1) {
const html = await rp.get(baseURL + searchURL + index);
const $ = await cheerio.load(html);
console.log("Loading Pages....");
console.log("At page number " + index);
// end pagination test
//const htmls = await rp(baseURL + searchURL);
const businessMap = cheerio('a.business-name', html).map(async (i, e) => {
const link = baseURL + e.attribs.href;
const innerHtml = await rp(link);
const emailAddress = cheerio('a.email-business', innerHtml).prop('href');
const name = e.children[0].data || cheerio('h1', innerHtml).text();
const phone = cheerio('p.phone', innerHtml).text();
return {
// link,
name,
emailAddress: emailAddress ? emailAddress.replace('mailto:', '') : '',
phone,
}
}).get();
return Promise.all(businessMap);
}
};
//save to CSV
// Kick off the scrape, then persist to CSV. The "Finished Scraping" log
// now runs inside the chain — the original logged it at module load,
// before scraping had even started — and failures are reported instead
// of becoming unhandled rejections.
getCompanies()
  .then(result => {
    console.log("Finished Scraping.... Now Saving!")
    const transformed = new otcsv(result);
    return transformed.toDisk('./output.csv');
  })
  .then(() => console.log('Scrape Complete :D '))
  .catch(err => console.error('Scrape failed:', err));
As you can see I have tried a few different ways to make this happen so any help will be gratefully appreciated.
I am trying to create REST API services that read data from Excel. But I cannot use the includes or indexOf methods inside the Array filter that holds the data from Excel — it throws TypeError: Cannot read property 'includes' of null. I have tried using a promise, but it did not work. Please help me.
// Express app served over a plain Node HTTP server (created below).
var express = require('express');
var http = require('http');
var app = express();
// Plain record constructor for one delivery row from the spreadsheet:
// carrier, reference, PRO number, ETA, delivery date, and contact name.
function DeliveryData(Carrier, REF, PRO, ETA, DEL, NAME) {
  Object.assign(this, { Carrier, REF, PRO, ETA, DEL, NAME });
}
var Excel = require('exceljs');
var workbook = new Excel.Workbook();
// In-memory cache of all rows from DeliveryData.xlsx; filled
// asynchronously at startup and queried by the /dlv/:id route.
var DeliveryRowData = [];
workbook.xlsx.readFile('DeliveryData.xlsx').then(function () {
  var worksheet = workbook.getWorksheet('DeliveryData');
  worksheet.eachRow({
    includeEmpty: false
  }, function (row, rowNumber) {
    // eachRow already hands us the row — no need to re-fetch it with
    // worksheet.getRow(rowNumber) for every single cell.
    DeliveryRowData.push(new DeliveryData(
      row.getCell(1).value,
      row.getCell(2).value,
      row.getCell(3).value,
      row.getCell(4).value,
      row.getCell(5).value,
      row.getCell(6).value
    ));
  });
}).catch(function (err) {
  // Surface startup read failures instead of an unhandled rejection.
  console.error('Failed to read DeliveryData.xlsx:', err);
});
// GET /dlv/:id — :id is a JSON-encoded array of search terms; responds
// with, per term, the delivery rows whose REF contains that term.
app.get('/dlv/:id', (req, res) => {
  var aaa = [];
  console.log(req.params.id);
  var results;
  try {
    results = JSON.parse(req.params.id);
  } catch (err) {
    // Malformed input is a client error, not a server crash.
    return res.status(400).json({ error: 'id must be a JSON array' });
  }
  results.forEach(element => {
    // Empty spreadsheet cells come through as null, which is what caused
    // "Cannot read property 'includes' of null" — skip null REFs and
    // coerce non-string cell values before searching.
    var output = DeliveryRowData.filter(
      item => item.REF != null && String(item.REF).includes(element)
    );
    aaa.push(output);
    console.log(output);
    console.log(element);
  });
  res.json(aaa);
});
// Plain HTTP server wrapping the Express app; PORT env var wins,
// otherwise port 989.
var server = http.createServer(app);
var port = process.env.PORT || 989;
server.listen(port);
I am a beginner in Node.js. I am building a simple server that writes JSON data to a CSV file. The question is:
For authorization, the “appKey” parameter needs to be set in the request header:
appKey: 9a3ab6d8-9ffe-49a5-8194-bc7d61123f4a
I could not understand what I am supposed to do.
This is what I have so far:
var fs = require('fs');
var express = require('express');
var app = express();
// Input JSON and output CSV paths, both relative to the working directory.
var inFilename = 'power_plants.json',
outFilename = 'powerplants.csv';
// GET / — converts power_plants.json to CSV on every request.
// Fix: writeToCsv's synchronous read/parse can throw (missing file, bad
// JSON); catch it so the request gets an error instead of crashing the
// process. Note the actual file write is asynchronous, so success here
// means "conversion started", not "file on disk".
app.get('/', function (req, res) {
  try {
    writeToCsv();
    res.send('Successfully Created!');
  } catch (err) {
    res.status(500).send('Conversion failed: ' + err.message);
  }
})
// Start listening on 8081 and log the bound address once ready.
var server = app.listen(8081, function () {
var host = server.address().address
var port = server.address().port
console.log("Example app listening at http://%s:%s", host, port)
})
// Reads inFilename (JSON with a `rows` array), extracts PowerPlant and
// meter from each row, and writes them as CSV lines to outFilename.
// Fixes vs. original: `index` was an implicit global (ReferenceError in
// strict mode, shared state otherwise); the `csv` ARRAY was passed
// straight to fs.writeFile, which modern Node rejects (and coercion
// would interleave stray commas) — join into one string first.
function writeToCsv(){
  var inJSON = fs.readFileSync(inFilename);
  inJSON = JSON.parse(inJSON);
  var outCSV = inJSON.rows;
  var csv = [];
  for(var k in outCSV) {
    var items = [[outCSV[k].PowerPlant , outCSV[k].meter]];
    for (var index = 0; index < items.length; ++index) {
      // Each entry is already newline-terminated.
      csv.push(items[index].join(', ') + '\n');
    }
  }
  fs.writeFile(outFilename, csv.join(''), function (err) {
    if (err) {
      return console.log(err);
    }
    console.log('FILE SUCCESSFULLY WRITTEN!\n');
  });
}
To extract the value of the appKey header, read it from req.headers — note that Node.js lowercases all incoming header names:
var appKey = req.headers.appKey;