Puppeteer scraper taking too long - javascript

I'm trying to scrape little pieces of data from a webpage, but it is taking soo long to scrape... any reason why this is happening ? Initially, it stops scraping, then i had to set the default timeout to 0. This same code was working perfectly fine earlier..
Now it's taking forever to scrape the data i need
code below
const puppeteer = require("puppeteer");
const express = require("express");
//const ejs = require("ejs");
const port = 5000;
const app = express();
app.set('view engine', 'ejs');
app.use(express.static(__dirname + '/public', {
types: {
'js': "text/javascript"
}
}));
var usdBuy;
var usdSell;
var gbpBuy;
var gbpSell;
var eurBuy;
var eurSell;
app.get('/', function(req, res) {
async function start() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await page.goto("URL");
const prices = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".overlay-text")).map(x => x.textContent)
})
usdBuy = prices[0]
usdSell = prices[1]
gbpBuy = prices[2]
gbpSell = prices[3]
eurBuy = prices[4]
eurSell = prices[5]
console.log(usdBuy);
console.log(usdSell);
console.log(gbpBuy);
console.log(gbpSell);
console.log(eurBuy);
console.log(eurSell);
await page.close();
await browser.close();
}
start();
res.render("home", {
usdBuy: usdBuy,
usdSell: usdSell,
gbpBuy: gbpBuy,
gbpSell: gbpSell,
eurBuy: eurBuy,
eurSell: eurSell
});
})
app.listen(process.env.PORT || port, function () {
console.log(`Listening on port ${port}`)
})

Related

how to send multiple script at once

I am using puppeteer and express js to make an api. I have done my code stuff and now i am testing the api on postman but i have a problem which is i could not send multiple scripts at once i mean i can send but just returning the last one.So what is the problem.
const express = require("express");
const app = express();
app.use(express.json())
const puppeteer = require("puppeteer");
const port = 3000;
let browser = null;
var target_url = ""
app.post('/get/script_result', async (req, res) => {
if (browser == null) browser = await puppeteer.launch({
headless: false, defaultViewport: null, args: ['--disable-infobars', '--disable-notifications', '--disable-default-apps',
'--no-sandbox', '--mute-audio', '--ignore-certificate-errors', '--disable-features=LookalikeUrlNavigationSuggestionsUI'], ignoreHTTPSErrors: true
});
try {
var page = await browser.newPage()
await page.goto(req.body.target_url);
var script = ""
var script_delay = ""
await page.waitForTimeout(`${req.body.script_delay}`)
const handle = await page.evaluate(`${req.body.script}`)
res.send(handle)
res.end()
//await page.close();
return res.status(200)
}
catch (e) {
console.log(e);
res.send(e)
res.status(500)
}
})
app.listen(port, () => {
console.log(`app is running on port: ${port}`);
});

MongoDb query return empty object

I'm trying to perform a simple .find() query on my mongodbAtlas, but the result of this query is an empty object.
this is my server file:
require("dotenv").config({ path: "./config.env" });
const { MongoClient, ServerApiVersion } = require("mongodb");
const express = require("express");
const { ServiceBroker } = require("moleculer");
const AUTH_SERVICE = require("./controller/services/auth/auth.service");
global.broker = new ServiceBroker({
nodeID: "auth",
});
const app = express();
app.use(express.json());
app.use("/auth", require("./routes/auth"));
const { PORT, URI } = process.env || 5000;
global.broker.createService(AUTH_SERVICE);
const start = async () => {
const dba = await MongoClient.connect(URI, {
useNewUrlParser: true,
useUnifiedTopology: true,
});
global.db = dba.db("Auth");
try {
await dba.connect();
console.log("DATABASE CONNESSO CON SUCCESSO📡");
} catch (e) {
console.error(e);
}
await global.broker.start();
app.listen(PORT, () => console.log(`PORT IT'S UP AND RUNNING 🚀 ON ${PORT}`));
};
start();
this is my routes file:
const express = require("express");
const router = express.Router();
router.get("/register", async (req, res) => {
const data = global.db.collection("Users").find({}).toArray();
res.send(data);
});
module.exports = router;
this is how my document is populated:
{"_id":{"$oid":"6297bbc83a95b81d74882f65"},"username":"Test","email":"test#gmail.com","password":"1234"}
I think you are missing the "await" keyword after const data..... as API data fetching calls are asynchronous and required promise/ async-await to handle. Being async in nature, it moves forward to the next instruction and returns an empty array.
const express = require("express");
const router = express.Router();
router.get("/register", async (req, res) => {
const data = await global.db.collection("Users").find({}).toArray();
res.send(data);
});
module.exports = router;

cannot run async function inside express, how to fix?

Async function not run inside express view, console log, before and after function, separately code working, but with express it doesn't.
const path = require('path')
const fs = require('fs')
const puppeteer = require('puppeteer');
const express = require('express');
const bodyParser = require('body-parser');
const app = express()
const port = 9596;
app.use(bodyParser.json());
app.post('/', async function(req, res){
siteName = req.body.siteName;
links = req.body.links;
screenshotPageSource = req.body.screenshotPageSource;
await createScreenshots(siteName, links, screenshotPageSource);
res.end('Hello World!');
});
app.listen(port, () => {
console.log(`Example app listening at http://localhost:${port}`)
})
async function createScreenshots(siteName, links, screenshotPageSource){
const browser = await puppeteer.launch({defaultViewport: { width: 1366, height: 768 }});
const page = await browser.newPage();
linksLength = links.Length;
for (var i = 0; i < linksLength; i++) {
link = links[i]
if(screenshotPageSource){
currentLink = `view-source:${link}`;
} else {
currentLink = link;
}
pageNumber = i + 1;
console.log(pageNumber);
await page.goto(currentLink, {waitUntil: 'load', timeout: 60000}).then(() => {
console.log(`Страница ${pageNumber}/${linksLength}, удалось сделать скриншот.`)
}).catch((res) => {
console.log(`Страница ${pageNumber}/${linksLength}, не удалось сделать скриншот.`)
});
await page.screenshot({ path: `${siteName}__${pageNumber}.png` });
}
await browser.close();
return links;
};
It looks like your post is mostly code; please add some more details.It looks like your post is mostly code; please add some more details.It looks like your post is mostly code; please add some more details.
The problem I guess is in app.post.
Below is have added a code for your reference which might help you to edit your code.
app.post('/', async(req, res, next) => {
const { error } = validateBody(req.body);
if (error) {
return res.status(400).send(error.details[0].message);
}
const newData= new createScreenshots({
siteName = req.body.siteName;
links = req.body.links;
screenshotPageSource = req.body.screenshotPageSource; });
await newData.save();
console.log('saving the document');
res.send(newData);
})
The above code is just for your reference as to how you can write app.post()

How to access next.js rendered HTML via custom server

I want a server side generated page in next.js to be served as a file. So I wanted to grab the rendered content inside a custom server.js file:
const express = require('express');
const next = require('next');
const port = parseInt(process.env.PORT, 10) || 3000;
const dev = process.env.NODE_ENV !== 'production';
const app = next({dev});
const handle = app.getRequestHandler();
app.prepare().then(() => {
const server = express();
server.get('/', async (req, res) => {
const nextResponse = await app.renderToHTML(req, res, '/', req.query);
console.log('nextResponse', nextResponse);
console.log('res.body', res.body);
});
server.get('*', (req, res) => {
return handle(req, res);
});
server.listen(port, (err) => {
if (err) throw err;
console.log(`> Ready on http://localhost:${port}`);
});
});
Oddly enough every console.log returns null or undefined.
I thought that renderToHTML would just return the rendered HTML string. Is there any way to do this?
This one is a bit tricky but achievable.
The idea is to override res.end function in order to catch rendered HTML there. The tricky part is that Next.js gzips and streams response using the compression library that's overriding res.end somewhere in the middle of the handle function.
The compression library is initialized using the handleCompression function of the Next.js's Server object (which is accessible using the app.getServer()), so that function needs to get overridden too.
So it should be looking something like this:
const { parse } = require('url');
const next = require('next');
const express = require('express');
const dev = process.env.NODE_ENV !== 'production';
const app = next({ dev });
const port = process.env.PORT || 3000;
const handle = app.getRequestHandler();
app.prepare()
.then(() => {
const server = express();
server.get('*', async (req, res) => {
const parsedUrl = parse(req.url, true);
const nextServer = await app.getServer();
const _handleCompression = nodeServer.handleCompression.bind(nodeServer);
nextServer.handleCompression = (req, res) => {
_handleCompression(req, res);
const _resEnd = res.end.bind(res)
res.end = function (payload) {
console.log('Rendered HTML: ', payload);
return _resEnd(payload);
}
}
return handle(req, res, parsedUrl);
});
server.listen(port, err => {
if (err) throw err;
console.log('> Ready on http://localhost:' + port);
});
});
After you get rendered HTML you don't have to use the saved _resEnd function. Feel free to manipulate, serve as a file, or whatever you want with it.

Error sending json in puppeteer and express

When I try to pass a parameter through the URL in express to send a json with puppeteer, another browser opens after a while and does not send the json, returns something empty.
const express = require('express');
const path = require('path'); // NEW
const puppeteer = require('puppeteer');
const fs = require('fs');
const config = require('../src/config.js');
const app = express();
const port = process.env.PORT || 3006;
const DIST_DIR = path.join(__dirname, '../dist'); // NEW
const HTML_FILE = path.join(DIST_DIR, 'index.html'); // NEW
const mockResponse = {
foo: 'bar',
bar: 'foo'
};
app.use(express.static(DIST_DIR)); // NEW
app.get('/api', (req, res) => {
res.send(mockResponse);
});
app.get('/getapi', async (req, res) => {
function randomIntFromInterval(min, max) { // min and max included
return Math.floor(Math.random() * (max - min + 1) + min);
}
const next_class = 'snByac';
const browser = await puppeteer.launch({
headless: false,
dumpio: true
});
const page = await browser.newPage();
await page.goto('https://adwords.google.com/ko/KeywordPlanner/Home?', {waitUntil: 'networkidle2'});
await page.waitFor(randomIntFromInterval(20000,25000));
await page.type('input[name=identifier]', config.mail);
await page.click('span.' + next_class);
await page.waitFor(randomIntFromInterval(20000,23000));
await page.type('input[name=password]', config.password)
await page.click('span.' + next_class)
await page.waitFor(randomIntFromInterval(39000,49000));
await page.click('[icon="arrow_forward"]')
await page.waitFor(randomIntFromInterval(3800,6000));
await page.type('[aria-autocomplete="list"]', req.query.palabra)
await page.waitFor(randomIntFromInterval(1400,2400));
await page.keyboard.press('Enter');
await page.waitFor(randomIntFromInterval(10000,14000));
await page.keyboard.press('Enter');
await page.waitFor(randomIntFromInterval(10000,14000));
res.send(mockResponse);
await browser.close();
});
app.get('/', (req, res) => {
res.sendFile(HTML_FILE); // EDIT
});
app.listen(port, function () {
console.log('App listening on port: ' + port);
});
I enclose the complete code, I don't get any console errors.
localhost has not sent any data.
ERR_EMPTY_RESPONSE
This, i get
Update: I solved it by placing within the route, the following: req.setTimeout (500000);

Categories

Resources