Node.js: pass a parameter to a function - javascript

I have this code:
const { Logger } = require ("telegram/extensions");
const { TelegramClient } = require ("telegram");
const { StringSession } = require ("telegram/sessions");
const { NewMessage } = require ("telegram/events");
const { NewMessageEvent } = require ("telegram/events/NewMessage");
const { Message } = require ("telegram/tl/custom/message");
const input = require('input'); // npm i input
const puppeteer = require('puppeteer-extra');
async function eventHandler(event, browser) {
//get event.message, ....
const page = await browser.newPage();
}
const client = new TelegramClient(
new StringSession(stringSession),
apiId,
apiHash,
{ connectionRetries: 5 }
);
(async () => {
console.log('Loading interactive example...')
const browser = await puppeteer.launch({args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--shm-size=2gb', '--start-maximized', '--disable-features=IsolateOrigins,site-per-process', '--disable-web-security'], headless: true});
await client.start({
phoneNumber: "+33...",
password: async () => await input.text('password?'),
phoneCode: async () => await input.text('Code ?'),
onError: (err) => console.log(err),
});
console.log('Telegram bot connected');
console.log(client.session.save());
client.addEventHandler(eventHandler, new NewMessage({}), browser);
})();
I want to pass the browser variable to the eventHandler function.
I tried it like that, but it does not work: browser comes through as undefined in eventHandler.
How do I pass the browser variable to my eventHandler?

Not sure what the signature of client.addEventHandler is, but assuming the callback takes a single event param, you could try replacing your last line with something like:
client.addEventHandler(
(event) => eventHandler(event, browser),
new NewMessage({}),
);
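As far as I can tell, the client's addEventHandler only accepts the callback and the event builder, so a third argument is ignored; the closure above is the usual fix. An equivalent pre-bound variant, as a minimal sketch (handlerWithBrowser is an illustrative name, not part of the telegram library):
// bind() pre-fills browser, so the event delivered by the client arrives as the second argument
const handlerWithBrowser = (browser, event) => eventHandler(event, browser);
client.addEventHandler(handlerWithBrowser.bind(null, browser), new NewMessage({}));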

Related

How to handle multiple functions in puppeteer-cluster?

I have a two-step program:
Get a list of hrefs from a page
Loop infinitely over each page of this list, get an element and display it in the console
I tried to use functions with puppeteer-cluster, but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');
var regexTemps = /(\d+)\s(\w+)$/;
const urlsToCheck = [];
TZ = 'Europe/Paris';
process.env.TZ = 'Europe/Paris';
(async () => {
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--no-sandbox'],
},
maxConcurrency: 10,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true,
timeout:30000,
retryLimit:10,
})
cluster.on('taskerror', (err, data, willRetry) => {
if (willRetry) {
console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
} else {
console.error(`Failed to crawl ${data}: ${err.message}`);
}
});
const getElementOnPage = async ({ page, data: url }) => {
console.log('=> Go to URL : ',url);
await page.goto(url);
while (true) {
console.log('=> Reload URL : ',page.url())
await page.reload();
await page.waitForTimeout(1000);
let allNews = await page.$$("article.news"); // [] if nothing
let firstNews = allNews[0];
await page.waitForTimeout(1000);
let info = await firstNews.$eval('.info span', s => s.textContent.trim());
console.log(new Date(), 'info : ',info);
}
};
const getListOfPagesToExplore = async ({ page, data: url }) => {
console.log(new Date(), 'Get the list of deal pages to explore');
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitForTimeout(500);
const hrefsToVisit = await page.$x('//a');
let idxTab=0;
for( let hrefToVisit of hrefsToVisit ) {
var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
console.log(new Date(), 'adding link to list : ', link);
idxTab++;
urlsToCheck.push(link);
}
};
cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
await cluster.idle();
await cluster.close();
console.log(urlsToCheck);
//Display correctly several link in an array
for( let url of urlsToCheck ) {
console.log('Push in queue : ',url);
cluster.queue(url, getElementOnPage);
}
await cluster.idle();
await cluster.close();
})();
When I launch it, it retrieves the links to scrape.
It displays "Push in queue : ..." for each URL.
But then the method getElementOnPage is launched only for the first URL, and runs infinitely as asked. Why are the other URLs not launched?
Before that, I didn't use separate functions; I used one unique task with:
await cluster.task(async ({ page, data: url }) => {
But how do I combine functions with this?
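A minimal sketch of one way to combine the two (an illustration, not a confirmed fix): keep a single cluster.task() that dispatches on a field in data, and close the cluster only once, at the very end. Note that the snippet above calls cluster.close() before the second round of cluster.queue() calls, which may be why the remaining URLs never run. The type field and the dispatch logic are assumptions for the sketch:
// single default task: dispatch on data.type instead of passing a task function to each queue() call
await cluster.task(async ({ page, data }) => {
  if (data.type === 'list') {
    await getListOfPagesToExplore({ page, data: data.url });
  } else {
    await getElementOnPage({ page, data: data.url });
  }
});

cluster.queue({ url: 'https://www.apagewithsomelinks.com', type: 'list' });
await cluster.idle(); // wait for the list step, but do not close yet

for (const url of urlsToCheck) {
  cluster.queue({ url, type: 'detail' });
}
await cluster.idle();
await cluster.close(); // close once, after everything has been queued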

Playwright: testing a website with Keycloak, can't find a way to log in via pageObjectModel

Hi (I am pretty new to Playwright),
I need/want to find a way in Playwright to log in via a pageObjectModel in an application which uses Keycloak, but I don't know how.
I found a way without a pageObject: log in in the first test, save the auth in process.env.STORAGE, and then use test.use({storageState: auth_storage_path}) in the rest of the tests inside the file.spec.js.
Note: Keycloak works (sorry for this basic info)
user not logged in -> visiting baseUrl redirects you to the Keycloak auth page
user already logged in -> visiting baseUrl goes directly to baseUrl (so no login button on the homepage, etc.)
//tests_with_auth_store.spec.js
const {test, expect} = require('@playwright/test');
const auth_storage_path = 'storage_auth.json';
const baseUrl = 'https://myBaseUrl_xyz.com';
test('mylogin', async ({page, context}) => {
const usernameId = '[id="username"]';
const passwordId = '[id="password"]';
const idLogin = '[id="login-button"]';
const usernameValue = '*****';
const passwordValue = '*****';
//login:
await page.goto(baseUrl);
await page.fill(usernameId, usernameValue);
await page.fill(passwordId, passwordValue);
await Promise.all([
page.waitForNavigation(/*{ baseUrl: baseUrl }*/),
page.click(idLogin)
]);
process.env.STORAGE = null;
const storage = await context.storageState({ path: auth_storage_path });
process.env.STORAGE = JSON.stringify(storage);
JSON.parse(process.env.STORAGE);
});
test.describe("testDescription login via pageObjectModel", () => {
test.use({storageState: auth_storage_path});
test('i- firstTest whatever ', async ({page}) => {
await page.goto(baseUrl);
......
....
});
test('ii- secondTest whatever ', async ({page}) => {
await page.goto(baseUrl);
......
....
});
});
This works OK, and all tests under test.use({storageState: auth_storage_path});
can jump to baseUrl directly. The problem is that I cannot find a way to encapsulate test('login') into a Playwright pageObject (in Cypress we did it in a simple function in commands.js and saved the auth into cookies).
My demo for login-page.js:
// login-page.js
const { expect, test } = require('@playwright/test');
const baseUrl = 'https://myBaseUrl_xyz.com';
const auth_storage_path = 'storage_auth.json';
exports.LoginPage = class LoginPage {
/**
* @param {import('@playwright/test').Page} page
*/
constructor(page) {
this.page = page;
}
async login() {
process.env.STORAGE = null;
const baseUrl = 'https://myBaseUrl_xyz';
await this.page.goto(baseUrl);
await this.page.fill('[id="username"]', '*****');
await this.page.fill('[id="password"]', '*****');
await Promise.all([
this.page.waitForNavigation(/*{ baseUrl: baseUrl }*/),
this.page.click('[id="fc-login-button"]')
]);
const storage = await this.page.context().storageState({path: auth_storage_path});
process.env.STORAGE = JSON.stringify(storage);
JSON.parse(process.env.STORAGE);
}
// async gotoBaseUrl() {
// test.use({storageState: auth_storage_path});
// return this.page.goto(baseUrl);
// }
}
Calling login-page from tests_with_auth_store_viaPage.spec.js:
// tests_with_auth_store_viaPage.spec.js
const {test, expect} = require('@playwright/test');
const { LoginPage } = require('../login/login-page');
const auth_storage_path = 'storage_auth.json';
const baseUrl = 'https://myBaseUrl_xyz.com';
test('login', async ({page}) => {
const loginPage = new LoginPage(page);
await loginPage.login();
});
test.describe("testDEscription tests with save login into json", () => {
test.use({storageState: auth_storage_path});
test('i- firstTest whatever', async ({page}) => {
await page.goto(baseUrl);
......
....
});
But here, in test('i- firstTest whatever'), page.goto(baseUrl) does NOT jump to baseUrl but to the Keycloak auth page :(.
Even though test('login') does log in and creates storage_auth.json. So I am doing something wrong; maybe I need something like loginPage.gotoBaseUrl(), but that doesn't work either.
For now, I am writing all the rest of the tests always including the first test('mylogin'), but I am sure there is a way via a pageObject.
Regards
I already found the way myself. If someone needs any help, do not hesitate to ask.
demo_login.spec.js
const {test, expect} = require('@playwright/test');
const auth_storage_path = 'storage_auth.json';
const {Login} = require('../../yourpathforfile/login_page.js');
const {Logout} = require("../../yourpathforfile/logout_page.js");
test('login_a', async ({page}) => {
const login = new Login(page);
await login.visit_baseurl();
await login.method_login();
await login.check_login_success();
await login.saveAuth(auth_storage_path);
await login.print_url_tenant();
await login.print_browser();
});
test.describe("Login_a tests --> ", () => {
test.use({storageState: auth_storage_path});
test('demo_a test', async () => {
expect( 1 +1).toBe(2); });
test('logout_a', async ({page}) => {
const logout = new Logout(page);
await logout.visit_baseurl();
await logout.method_check_pageIsWorkplace();
await logout.method_logout_workplace();
await logout.method_check_logout();
});
});
pageObject -> login_page.js
const { expect, firefox } = require('@playwright/test');
const uaParser= require("ua-parser-js");
exports.Login = class Login {
/**
* @param {import('@playwright/test').Page} page
*/
constructor(page) {
this.page = page; }
async visit_baseurl () {
await this.page.goto("/");
//await this.page.goto(process.env.env_baseurl );
}
async method_login() {
const usernameId = selectorUserId;
const passwordId = selectorPassId;
const idLogin = '[id="fc-login-button"]';
const usernameValue = 'demo_user_123';
const passwordValue = 'demo_pass_123';
const mydelay = 200;
await this.page.click(usernameId);
await this.page.type(usernameId, usernameValue, {delay:mydelay});
await this.page.click(passwordId);
await this.page.type(passwordId, passwordValue, {delay:mydelay});
await this.page.click(idLogin,{delay:mydelay})
}
etc ....
}
The same for logout_page.
If you need any more info just let me know.
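For reference, a hypothetical sketch of the saveAuth() method that demo_login.spec.js calls and that the "etc ...." above elides; the assumption is that it simply persists the current storage state to the given path so the tests under test.use() can reuse it:
// hypothetical body of saveAuth() inside the Login class (assumption, not the author's code)
async saveAuth(auth_storage_path) {
  await this.page.context().storageState({ path: auth_storage_path });
}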

Puppeteer web scraping is not loading profile for scraping

I am trying to get Puppeteer web scraping to scrape clutch.co using the scripts below, but it is not working properly. I'm developing a simple web scraper for clutch.co. I want to extract contacts and company names for my lead generation project. In fact, achieving this goal would improve my understanding of JavaScript.
browser.js
const puppeteer = require("puppeteer");
async function startBrowser() {
let browser;
try {
console.log("Opening the browser......");
browser = await puppeteer.launch({
headless: false,
args: ["--disable-setuid-sandbox"],
ignoreHTTPSErrors: true,
});
} catch (err) {
console.log("Could not create a browser instance => : ", err);
}
return browser;
}
module.exports = {
startBrowser,
};
pageController.js
const pageScraper = require("./pageScraper");
async function scrapeAll(browserInstance) {
let browser;
try {
browser = await browserInstance;
await pageScraper.scraper(browser);
} catch (err) {
console.log("Could not resolve the browser instance => ", err);
}
}
module.exports = (browserInstance) => scrapeAll(browserInstance);
pageScraper.js
const scraperObject = {
url: "https://clutch.co/sitemap",
async scraper(browser) {
let page = await browser.newPage();
console.log(`Navigating to ${this.url}...`);
await page.setDefaultNavigationTimeout(0);
await page.goto(this.url);
// Wait for the required DOM to be rendered
await page.waitForSelector(".container");
// Get the link to all Categories
let urls = await page.$$eval(".sitemap-menu > li", (links) => {
// Extract the links from the data
links = links.map((el) => el.querySelector("div > a").href);
return links;
});
// Loop through each of those links, open a new page instance and get the relevant data from them
let pagePromise = (link) =>
new Promise(async (resolve, reject) => {
let dataObj = {};
let newPage = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await newPage.goto(link);
dataObj["companyName"] = await newPage.$eval(
"h3 > a",
(text) => text.textContent
);
dataObj["tagLine"] = await newPage.$eval(
".tagline",
(text) => text.textContent
);
resolve(dataObj);
await newPage.close();
});
for (link in urls) {
let currentPageData = await pagePromise(urls[link]);
// scrapedData.push(currentPageData);
console.log(currentPageData);
}
},
};
module.exports = scraperObject;
index.js
const browserObject = require("./browser");
const scraperController = require("./pageController");
//Start the browser and create a browser instance
let browserInstance = browserObject.startBrowser();
// Pass the browser instance to the scraper controller
scraperController(browserInstance);
I can't get the script to load and scrape the profile pages. It keeps giving me errors.
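One hedged observation (not a confirmed diagnosis): inside pagePromise the navigation timeout is set on the outer page instead of the newly opened newPage, so the profile pages keep the default 30-second timeout and a slow clutch.co page can throw. A minimal adjustment could look like this:
let newPage = await browser.newPage();
// assumption: the timeout was meant for the new tab, not the sitemap page
newPage.setDefaultNavigationTimeout(0);
await newPage.goto(link, { waitUntil: "domcontentloaded" });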

Can't access functions made through prototype with WebdriverJS

So, I'm using Selenium WebDriver with JavaScript; the browser is Firefox.
The base_page looks like this:
function Page() {
this.driver = new Builder().forBrowser('firefox').build();
const driver = this.driver;
this.visit = async(url) => {
try {
return await driver.get(url);
} finally {
console.log("visit: OK");
}
};
...
The home_page looks like this:
const Page = require('./base_page');
Page.prototype.requestBtn = async() => {
try {
await this.write('input', 'user@fakemail.com');
return {
opacity: await this.find('.btn-lg').getCssValue('opacity'),
state: await this.find('.btn-lg').isEnabled()
};
} catch(err) {
console.log(err);
}
};
...
In mocha.test.js I import the home_page. I can use the functions created in base_page, but I can't access the functions in home_page.
For example, page.visit() and page.quit() are working, but page.requestBtn() is not.
const assert = require('assert');
const {Builder, By, Key, until} = require('selenium-webdriver');
const Page = require('../lib/home_page');
let page;
describe('Test', () => {
beforeEach(async() => {
page = new Page();
await page.visit('https://library-app.firebaseapp.com');
});
afterEach(async() => {
await page.quit();
});
it("Example", async() => {
await page.requestBtn();
});
...
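A hedged sketch of one likely fix (not the original author's code): a prototype method written as an arrow function does not receive the instance as this, so it cannot see the this.write / this.find helpers from base_page; a regular function expression does. It also assumes base_page exports Page and home_page re-exports it so the test file gets the extended constructor:
// home_page.js (sketch)
const Page = require('./base_page');

Page.prototype.requestBtn = async function () { // regular function: `this` is the Page instance
  try {
    await this.write('input', 'user@fakemail.com');
    return {
      opacity: await this.find('.btn-lg').getCssValue('opacity'),
      state: await this.find('.btn-lg').isEnabled()
    };
  } catch (err) {
    console.log(err);
  }
};

module.exports = Page; // assumption: base_page exports the Page constructor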

How can I download images on a page using puppeteer?

I'm new to web scraping and want to download all images on a webpage using puppeteer:
const puppeteer = require('puppeteer');
let scrape = async () => {
// Actual Scraping goes Here...
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.goto('https://memeculture69.tumblr.com/');
// Right click and save images
};
scrape().then((value) => {
console.log(value); // Success!
});
I have looked at the API docs but could not figure out how to achieve this. Would appreciate your help.
If you want to skip the manual DOM traversal, you can write the images to disk directly from the page response.
Example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on('response', async response => {
const url = response.url();
if (response.request().resourceType() === 'image') {
response.buffer().then(file => {
const fileName = url.split('/').pop();
const filePath = path.resolve(__dirname, fileName);
const writeStream = fs.createWriteStream(filePath);
writeStream.write(file);
});
}
});
await page.goto('https://memeculture69.tumblr.com/');
await browser.close();
})();
See the documentation for page.on and for the HTTPResponse object that you get from page.on('response', ...).
Here is another example. It goes to a generic Google search and downloads the Google logo image at the top left.
const puppeteer = require('puppeteer');
const fs = require('fs');
async function run() {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.setViewport({ width: 1200, height: 1200 });
await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');
const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
let imageHref = await page.evaluate((sel) => {
return document.querySelector(sel).getAttribute('src').replace('/', '');
}, IMAGE_SELECTOR);
console.log("https://www.google.com/" + imageHref);
var viewSource = await page.goto("https://www.google.com/" + imageHref);
fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer(), function (err) {
if (err) {
return console.log(err);
}
console.log("The file was saved!");
});
browser.close();
}
run();
If you have a list of images you want to download, you could change the selector programmatically as needed and go down the list of images, downloading them one at a time.
You can use the following to scrape an array of all the src attributes of all images on the page:
const images = await page.evaluate(() => Array.from(document.images, e => e.src));
Then you can use the Node File System Module and HTTP or HTTPS Module to download each image.
Complete Example:
'use strict';
const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');
/* ============================================================
Promise-Based Download Function
============================================================ */
const download = (url, destination) => new Promise((resolve, reject) => {
const file = fs.createWriteStream(destination);
https.get(url, response => {
response.pipe(file);
file.on('finish', () => {
file.close(resolve(true));
});
}).on('error', error => {
fs.unlink(destination);
reject(error.message);
});
});
/* ============================================================
Download All Images
============================================================ */
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
let result;
await page.goto('https://www.example.com/');
const images = await page.evaluate(() => Array.from(document.images, e => e.src));
for (let i = 0; i < images.length; i++) {
result = await download(images[i], `image-${i}.png`);
if (result === true) {
console.log('Success:', images[i], 'has been downloaded successfully.');
} else {
console.log('Error:', images[i], 'was not downloaded.');
console.error(result);
}
}
await browser.close();
})();
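One caveat worth noting about the download helper above: it uses the https module only, so an image whose src is served over plain http would be rejected; in that case you would also need the http module (or a small check on the URL protocol) before calling get().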
The logic is simple, I think. You just need to make a function which takes the url of an image and saves it to your directory. Puppeteer just scrapes the image url and passes it to the downloader function. Here is an example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');
// This is main download function which takes the url of your image
function download(uri, filename) {
return new Promise((resolve, reject) => {
request.head(uri, function (err, res, body) {
request(uri).pipe(fs.createWriteStream(filename)).on('close', resolve);
});
});
}
let main = async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto('https://memeculture69.tumblr.com/');
await page.waitFor(1000);
const imageUrl = await page.evaluate(
// here we get the image url (the src attribute) from the selector.
() => document.querySelector('img.image').src
);
// Now just simply pass the image url
// to the downloader function to download the image.
await download(imageUrl, 'image.png');
};
main();
This code saves all images found on the page into the images folder:
page.on('response', async (response) => {
const matches = /.*\.(jpg|png|svg|gif)$/.exec(response.url());
if (matches && (matches.length === 2)) {
const extension = matches[1];
const buffer = await response.buffer();
fs.writeFileSync(`images/${matches[0]}.${extension}`, buffer, 'base64');
}
});
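Note on assumptions this listener makes about the surrounding script: fs must already be required and an images/ directory must exist next to the script, otherwise fs.writeFileSync throws; also, matches[0] is the full matched URL, so the resulting filename will contain the whole address rather than just the image name.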
For image download by its selector I did the following:
Obtained the uri for the image using the selector
Passed the uri to the download function
const puppeteer = require('puppeteer');
const fs = require('fs');
var request = require('request');
//download function
var download = function (uri, filename, callback) {
request.head(uri, function (err, res, body) {
console.log('content-type:', res.headers['content-type']);
console.log('content-length:', res.headers['content-length']);
request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
});
};
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'], //for no sandbox
});
const page = await browser.newPage();
await page.goto('http://example.com');// your url here
let imageLink = await page.evaluate(() => {
const image = document.querySelector('#imageId');
return image.src;
})
await download(imageLink, 'myImage.png', function () {
console.log('done');
});
...
})();
Resource: Downloading images with node.js
It is possible to get all the images without visiting each URL independently. You need to listen to all the requests and responses:
await page.setRequestInterception(true)
page.on('request', function (request) {
request.continue()
})
page.on('response', async function (response) {
// Filter those responses that are interesting
const data = await response.buffer()
// data contains the img information
})
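A minimal sketch of one way to fill in that "interesting responses" filter, assuming the goal is to write every image to disk under its URL basename (the filename handling here is illustrative):
const fs = require('fs');
const path = require('path');

page.on('response', async (response) => {
  if (response.request().resourceType() !== 'image') return; // keep images only
  const buffer = await response.buffer();
  const fileName = path.basename(new URL(response.url()).pathname) || 'image';
  fs.writeFileSync(path.resolve(__dirname, fileName), buffer);
});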
You can also filter based on the request type.
const blocked_resources = [
'stylesheet',
/*'image',*/
'media',
'font'
];
const _handleRequest = request => {
const type = request.resourceType();
if (blocked_resources.some(r => type === r)) {
request.abort();
return;
}
request.continue();
return;
}
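A short usage sketch for the handler above (assuming it is wired up in the same script, after the page has been created):
await page.setRequestInterception(true); // interception must be enabled before abort()/continue() are allowed
page.on('request', _handleRequest);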
const puppeteer = require("puppeteer")
const fs = require("fs/promises")
// add the url of website below which you want to scrape
const yourURL = "https://example.com" // page.goto needs a full URL including the protocol
async function scrapeIt() {
// it will launch browser
const browser = await puppeteer.launch()
// This line of code opens new page in browser
const page = await browser.newPage()
// page will open the webpage of your provided url
await page.goto(yourURL)
const photos = await page.$$eval("img", (imgs) => {
return imgs.map((x) => x.src)
})
for (const photo of photos) {
const imagepage = await page.goto(photo)
await fs.writeFile(photo.split("/").pop(), await imagepage.buffer())
}
await browser.close()
}
scrapeIt()
Download Google images in 100% quality based on your search query using Puppeteer in Node.js.
It is a straightforward approach:
Open Google Images.
Search for images using the keyword.
Click the images one by one to open the preview panel on the right.
Store all the links.
Download the images.
Note: If you download the images without previewing, you will lose quality.
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
var puppeteer = require('puppeteer');
const readline = require("readline-sync");
const path = require('path');
const axios = require('axios').default;
// fileUrl: the absolute url of the image or video you want to download
// downloadFolder: the path of the downloaded file on your machine
const downloadFile = async (fileUrl,localFilePath) => {
try {
const response = await axios({
method: 'GET',
url: fileUrl,
responseType: 'stream',
});
const w = response.data.pipe(fs.createWriteStream(localFilePath));
w.on('finish', () => {
console.log('Successfully downloaded file!');
});
} catch (err) {
throw new Error(err);
}
};
const Google_Image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
let data = 'Ramayana HD Images Good Quality wallpaper'
let search_url = Google_Image + 'q=' + data;
var imagelinkslist =[];
let main = async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
let result;
await page.goto(search_url);
// /html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[1]/a[1]/div[1]/img
// /html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[2]/a[1]/div[1]/img
let previewimagexpath = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
// previewimagexpath = '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
for(let i=1;i<20;i++)
{
let imagexpath = '/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div['+i+']/a[1]/div[1]/img'
const elements = await page.$x(imagexpath)
await elements[0].click();
await page.waitForTimeout(3000);
const image = await page.$x(previewimagexpath);
let d = await image[0].getProperty('src')
//console.log(d._remoteObject.value);
imagelinkslist.push(d._remoteObject.value)
}
await browser.close();
};
main().then(()=>{
console.log('Got Image links');
imagelinkslist.map((el,index)=>{
let url = el;
//console.log(url);
const path = `./images/image${index+1}.png`;
if(url.includes('https'))
downloadFile(url , path);
})
// console.log(imagelinkslist)
});
