Get video src from iframe using JavaScript - javascript

I am trying to get the video src from a movie website. On most websites the iframe content is hidden because it is hosted on another server, and the only way I know of to get it is to use the dev tools' inspect element on the video, after which the iframe shows up with the video src. I have tried using Node.js and Puppeteer, but couldn't manage to use the dev tools successfully via Puppeteer.
Here is an instance of what I have tried so far.
const puppeteer = require("puppeteer");
/**
 * Opens the movie page in a visible Chrome instance and tries, up to ten
 * times, to read the `src` of the embedded player's <video> element.
 * Each attempt clicks the first server entry, waits three seconds, and then
 * looks the video up both by XPath and by CSS selector. The results are only
 * logged; nothing is returned.
 */
async function scrapeMovie() {
const url = 'https://shahed4u.in/%D9%81%D9%8A%D9%84%D9%85-swords-drawn-2022-%D9%85%D8%AA%D8%B1%D8%AC%D9%85-%D8%A7%D9%88%D9%86-%D9%84%D8%A7%D9%8A%D9%86/watch/';
// Start the Puppeteer browser
const browser = await puppeteer.launch({
headless: false,
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
// NOTE(review): "efaultViewport" looks like a truncated "defaultViewport" — confirm against the Puppeteer launch options.
efaultViewport: null,
// devtools: true,
});
// Open a blank page
const page = await browser.newPage();
// Open the desired page
await page.goto(url,{ waitUntil: 'networkidle2' },);
await page.waitForTimeout(3000);
// Attempt counter: the loop below runs at most 10 times.
var i = 1;
while (i <= 10) {
await page.click('li[data-i="0"]')
console.log("here yes")
// await page.select('iframe[frameborder="0"]')'
// await page.click('iframe')
await page.waitForTimeout(3000);
// NOTE(review): the predicate [#id="vplayer"] looks like it was meant to be [@id="vplayer"] — confirm.
const [el] = await page.$x('//*[#id="vplayer"]/div[2]/div[3]/video');
const test = await page.evaluate(() => {
// querySelectorAll returns a NodeList, which has no `src` property,
// so the value returned below is undefined.
const videoTag = document.querySelectorAll("jw-video.jw-reset video")
return videoTag.src;
});
// Stop as soon as either lookup produced something other than undefined.
if(test !== undefined || el !== undefined) {
console.log("here is the results of test: -> ",test);
console.log("Here is the results of el: ->", el)
break;
}
i++
}
// The promise returned by close() is not awaited here.
browser.close();
};
The result I want is simply the video src and, from it, the .mp4 source.
However, the result I currently get is undefined, as expected.
Another example could be done on this movie site as well;
https://www2.solarmovie.to/solar.html
Note: there are ads on the sites mentioned above, but that is not an issue right now: for testing purposes I close them manually whenever the Puppeteer automation does not. Later on there will surely be a way around them.
Any help, note or suggestion would be much appreciated and I am sure there are a lot that are in such situations. Thanks

Related

puppeteer not scraping full information from website

I have a Puppeteer scraping script that scrapes YouTube for the image URL source of videos, but my current code only prints 4 strings with their URL source, and the rest are printed as empty strings. To check whether the error was only with the image source, I added code for scraping the video titles as well, and the title-scraping code prints all the titles without any empty strings. What is the cause of this, and how can I fix it to print all image URL sources? I thought of one potential reason why only 4 image sources are printed: YouTube has 4 thumbnails per row, and Puppeteer might somehow be reading only 1 row and then printing empty strings for the others. However, the code I wrote for scraping video titles prints all the video titles, which more or less disproves my hypothesis. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
/**
 * Loads `url` in a browser launched with default options and logs two lists:
 * the `src` of every thumbnail image and the text of every video title that
 * matches the selectors below.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Resolves once the results have been logged.
 */
async function scrape(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // timeout: 0 disables the navigation timeout.
  await page.goto(url, { timeout: 0 });

  // These callbacks are executed inside the page, once per selector.
  const readSrc = (nodes) => nodes.map((node) => node.src);
  const readText = (nodes) => nodes.map((node) => node.textContent);

  const src1 = await page.$$eval('ytd-thumbnail > a > yt-img-shadow > #img', readSrc);
  const src2 = await page.$$eval('h3 > a > #video-title', readText);

  browser.close();
  console.log({ src1, src2 });
}
scrape("http://www.youtube.com")
It is an Infinite Scrolling behavior on Youtube that ensures the client browser only fetches the items once the user scrolled them into view. You can open DevTools elements tab and investigate that last (nth) ytd-rich-item-renderer:nth-child(n). You will see the yt-img-shadow inside:
<yt-img-shadow
ftl-eligible=""
class="style-scope ytd-thumbnail no-transition empty"
style="background-color: transparent;">
<!--css-build:shady-->
<img id="img" class="style-scope yt-img-shadow" alt="" width="9999">
</yt-img-shadow>
Then you scroll down until the element will be in view and the inner <img> will be changed:
<yt-img-shadow
ftl-eligible=""
class="style-scope ytd-thumbnail no-transition"
style="background-color: transparent;"
loaded="">
<!--css-build:shady-->
<img id="img" class="style-scope yt-img-shadow" alt="" width="9999" src="https://i.ytimg.com/vi/_{id}/hqdefault.jpg?sqp={parameter}">
</yt-img-shadow>
There are many answers on Stackoverflow how to deal with infinite scrolling with puppeteer.
Most probably you will need to use vanilla JS (e.g scrollTo) inside a page.evaluate to scroll as much as you want.
You can get video thumbnails from YouTube like in the code below (also check it on the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const mainPageUrl = "https://www.youtube.com";
/**
 * Scrolls every element matching `scrollElements` into view, one at a time,
 * so that lazily rendered content gets loaded. After each pass it waits for
 * the page to append more matches and repeats until no new matches appear or
 * more than `maxElements` elements have been visited.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @param {string} scrollElements - CSS selector of the elements to scroll to.
 * @param {object} [options]
 * @param {number} [options.maxElements=100] - Stop once more than this many
 *   elements have been visited; pass `Infinity` to keep going until the page
 *   stops adding elements.
 * @param {number} [options.stepDelay=200] - Milliseconds to wait before each scroll.
 * @param {number} [options.settleDelay=5000] - Milliseconds to wait after a
 *   pass before re-counting the matches.
 * @returns {Promise<void>}
 */
async function scrollPage(
  page,
  scrollElements,
  { maxElements = 100, stepDelay = 200, settleDelay = 5000 } = {}
) {
  // Plain timer instead of page.waitForTimeout, which newer Puppeteer releases no longer provide.
  const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const countMatches = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  let currentElement = 0;
  while (true) {
    const elementsLength = await countMatches();
    for (; currentElement < elementsLength; currentElement++) {
      await delay(stepDelay);
      await page.evaluate(
        (index, selector) => {
          // The list can shrink between counting and scrolling; skip missing entries.
          document.querySelectorAll(selector)[index]?.scrollIntoView();
        },
        currentElement,
        scrollElements
      );
    }
    // Give the page time to append the next batch before counting again.
    await delay(settleDelay);
    const newElementsLength = await countMatches();
    if (newElementsLength === elementsLength || currentElement > maxElements) break;
  }
}
/**
 * Launches a visible browser, opens `mainPageUrl`, scrolls through the
 * thumbnail links so their images load, and collects the image URLs.
 *
 * @returns {Promise<string[]>} The non-empty `src` attribute values of the
 *   images found under `a#thumbnail`.
 */
async function getThumbnails() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  // Close the browser even when a step below throws, so no process is left running.
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000);
    await page.goto(mainPageUrl);
    await page.waitForSelector("#contents");
    const scrollElements = "a#thumbnail";
    await scrollPage(page, scrollElements);
    // Extra pause before reading the images, as in the original 10 s wait.
    await new Promise((resolve) => setTimeout(resolve, 10000));
    return await page.$$eval("a#thumbnail #img", (els) =>
      els.map((el) => el.getAttribute("src")).filter(Boolean)
    );
  } finally {
    await browser.close();
  }
}

getThumbnails()
  .then(console.log)
  .catch((error) => {
    // Report the failure instead of leaving the rejection unhandled.
    console.error(error);
    process.exitCode = 1;
  });
Output
[
"https://i.ytimg.com/vi/02oeySm1CJA/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBmrYMHESpY_f1oTNx00iuR3tNeCQ",
"https://i.ytimg.com/vi/RMo2haIPYBM/hq720_live.jpg?sqp=CNifxJcG-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBw4ogzR0709SqbttRdEzfL-aTdgQ",
"https://i.ytimg.com/vi/qJFFp_ta1Zk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJ-44OFgBUuVUYWBVh3Yi3hQgwIg",
"https://i.ytimg.com/vi/OZoTjoN-Sn0/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCOeGTCnlT4U0wV1SNclkmFUEHLaA",
"https://i.ytimg.com/vi/L8cH2gI67uk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAuvZ3khIjpvAVTGjmR9FDxQrPIgQ",
"https://i.ytimg.com/vi/6rUyVKyJnGY/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCifsTG4MlA3mf8CcJDkfKdWaZkaA",
"https://i.ytimg.com/vi/xpaURivPZFk/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLA5oFDDsVzbV3tUqyfogfuf3LPahQ",
"https://i.ytimg.com/vi/MsR76PyVdUs/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAEBYGNvif-7LWx2mqW4G9o-OUhEQ",
"https://i.ytimg.com/vi/liasQRRVt5w/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAUcMpyKY0GhmNAHHtP_cDkAp18DQ",
"https://i.ytimg.com/vi/Dr5IqlTLMDM/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBOSUi6mgjdD5a-Jx8Ns24SlexB1g",
"https://i.ytimg.com/vi/E8kit8xJKdI/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLDDStn95G7ei5DTusGXE4RimzdLUw",
"https://i.ytimg.com/vi/SqEaahOmLHU/hq720_2.jpg?sqp=-oaymwEdCM0CENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBDcWLCklNxEAuT1ZvSTKrIplGOag",
...and other results
]
You can read more about scraping YouTube from my blog posts:
Web scraping YouTube search video results with Nodejs
Web scraping YouTube secondary search results with Nodejs
Web scraping YouTube video page with Nodejs

Can't scrape from a page I navigate to by using Puppeteer

I'm fairly new to Puppeteer and I'm trying to practice by keeping track of a selected item on Amazon. However, I'm facing a problem when I try to retrieve some results from the page.
The way I intended this automation to work is by following these steps:
New tab.
Go to the home page of Amazon.
Enter the given product name in the search element.
Press the enter key.
Return the product title and price.
Check this example below:
const puppeteer = require('puppeteer');
// Opens Amazon in a visible browser, searches for a product by typing into
// the search box and pressing Enter, then tries to pick one element from the
// results page and log it.
(async () => {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
// Interception must be enabled before requests can be aborted or continued.
await page.setRequestInterception(true);
page.on('request', (req) => { // Abort font requests to speed things up; every other request goes through.
if (req.resourceType() == 'font' /* || req.resourceType() == 'image' || req.resourceType() == 'stylesheet'*/) {
req.abort();
}
else {
req.continue(); {
}
}
});
const baseDomain = 'https://www.amazon.com';
await page.goto(`${baseDomain}/`, { waitUntil: "networkidle0" });
await page.click("#twotabsearchtextbox" ,{delay: 50})
await page.type("#twotabsearchtextbox", "Bose QuietComfort 35 II",{delay: 50});
await page.keyboard.press("Enter");
await page.waitForNavigation({
waitUntil: 'networkidle2',
});
// The [43] index is applied to the promise returned by page.$$ before it is
// awaited, so the awaited value is undefined.
let productTitle = await page.$$(".a-size-medium, .a-color-base, .a-text-normal")[43]; // variable meant to hold the title of the product
console.log(productTitle );
debugger;
})();
When I execute this code, console.log prints undefined for the variable productTitle. I have had a lot of trouble scraping information from a page I navigate to. I used to use page.evaluate(), and it only worked when I was scraping from the page that I had told the browser to go to.
The first problem is on this line:
let productTitle = await page.$$(".a-size-medium, .a-color-base, .a-text-normal")[43];
// is equivalent to:
let productTitle = await (somePromise[43]);
// As you guessed it, a Promise does not have a property `43`,
// so I think you meant to do this instead:
let productTitle = (await page.$$(".a-size-medium, .a-color-base, .a-text-normal"))[43];
Once this is fixed, you don't get the title text, but a handle to the DOM element. So you can do:
let titleElem = (await page.$$(".a-size-medium, .a-color-base, .a-text-normal"))[43];
let productTitle = await titleElem.evaluate(node => node.innerText);
console.log(productTitle); // "Microphone"
However, I'm not sure that simply selecting the 43rd element will always get you the one you want, but if it isn't, that would be a topic for another question.

switch tabs in playwright test

I'm trying to switch between tabs using playwright tests
but it's not taking control of windows element.
Do we have any method similar to selenium driver.switchto().window() in playwright?
const { chromium } = require('playwright');
// Logs in to an application, clicks a dashboard link that opens another
// application in a new tab, and then tries to click an "OK" button there.
(async () => {
const browser = await chromium.launch({ headless: false, args: ['--start-maximized'] });
const context = await browser.newContext({ viewport: null });
// Logs the title of every page opened in this context; the new page is not stored anywhere.
context.on("page", async newPage => {
console.log("***newPage***", await newPage.title())
})
const page = await context.newPage()
// Created once; a promise settles only once, so the later awaits of it do not wait for new navigations.
const navigationPromise = page.waitForNavigation()
// dummy url
await page.goto('https://www.myapp.com/')
await navigationPromise
// User login
await page.waitForSelector('#username-in')
await page.fill('#username-in', 'username')
await page.fill('#password-in', 'password')
await page.click('//button[contains(text(),"Sign In")]')
await navigationPromise
// User lands in application home page and clicks on link in dashboard
// link will open another application in new tab
await page.click('(//span[text()="launch-app-from-dashboard"])[2]')
await navigationPromise
// The value returned by page.context() is discarded.
await page.context()
// Waiting for element to appear in new tab and click on ok button
// These calls are still made on `page`, the original tab.
await page.waitForTimeout(6000)
await page.waitForSelector('//bdi[text()="OK"]')
await page.click('//bdi[text()="OK"]')
})()
Assuming "launch-app-from-dashboard" opens a new page (tab), you can use the following pattern to run the subsequent lines of code on the new page. See the multi-page scenarios doc for more examples.
// Get page after a specific action (e.g. clicking a link)
const [newPage] = await Promise.all([
context.waitForEvent('page'),
page.click('a[target="_blank"]') // Opens a new tab
])
await newPage.waitForLoadState();
console.log(await newPage.title());
Since you run headful (headless: false), it might also be useful to switch the visible tab in the browser with page.bringToFront (docs).
browserContext?.pages() returns an array containing the tabs opened by your application. From there you can use a temporary page variable to make the switch; once you have completed your validations, you can switch back.
playwright.pageMain: Page = await playwright.Context.newPage();
playwright.pageTemp: Page;
// Save your current page to Temp
playwright.pageTemp = playwright.pageMain;
// Make the new tab launched your main page
playwright.pageMain = playwright.browserContext?.pages()[1];
expect(await playwright.pageMain.title()).toBe('Tab Title');
Assume you only created one page (via browser context), but for some reason, new pages/tabs open.
You can have a list of all the pages by : context.pages,
Now each element of that list represents a <class 'playwright.async_api._generated.Page'> object.
So, now you can assign each page to any variable and access it. (For eg. page2 = context.pages[1])
// Opens a link in a second tab with a middle click and checks that tab's title.
it('Open a new tab and check the title', async function () {
  // Start listening for the new tab before clicking so the event cannot be missed.
  await Promise.all([
    context.waitForEvent('page'),
    page.click(button, { button: 'middle' }), // to open another tab
  ]);
  const pages = context.pages(); // all tabs opened in this context
  await pages[1].waitForLoadState(); // wait for page loading
  expect(await pages[1].title()).equal('Title'); // to compare the title of the second page
});

How do you get all the links from a page with node puppeteer?

I'm trying to build a web crawler with node and came across the puppeteer package which looks perfect for what I want. My end result is to gather all the links from a page, all of its text content, and then a screenshot of the page itself.
I ran the following and it appears to gather a large number of links, however on actual inspection of the site there are links that it is not gathering.
const puppeteer = require('puppeteer');
// Exported crawler entry point: takes a full-page screenshot of one Pixabay
// page, then collects the page's words and the href of every <a> element.
// The inner async function is started but neither awaited nor returned, so
// the exported function returns undefined immediately.
module.exports = () => {
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://pixabay.com/en/columbine-columbines-aquilegia-3379045/');
await page.screenshot({ path: 'myscreenshot.png', fullPage: true });
// $eval runs the callback on the first element matching '*'; its innerText is split on single spaces.
let text = await page.$eval('*', el => el.innerText.split(' '));
// Strip every character that is neither a word character nor whitespace.
text = text.map(string => {
return string.replace(/[^\w\s]/gi, '');
});
// Anchors without an href attribute yield an empty string here.
let hrefs = await page.evaluate(() => {
const links = Array.from(document.querySelectorAll('a'))
return links.map(link => link.href);
});
// `text` and `hrefs` are computed but not used after this point.
console.log('done');
await browser.close();
})();
};
for example this link : /go/?t=image-details-shutterstock&id=699165328 is nowhere in the array of hrefs. What's worse is these are links that lead out of the site, the exact type of thing I want to do, otherwise I'm stuck only crawling the one site.
Is there a reason my script is only showing some of the links? is the querySelector too narrow or rejecting certain links?
Those links are generated by an onclick event; the target is stored in the data-go attribute, for example
<a data-go="image-details-shutterstock&id=458320033">
You only need to prepend /go/?t= to that value to get the link:
return links.map(link => link.href || link.getAttribute('data-go'));
There are also empty links used for the menu, like
<a><i class="icon icon_menu_user"></i></a>

Unable to choose by selectors using Puppeteer

I have a problem with getting elements by their selectors.
A page on which I struggle is: http://html5.haxball.com/.
What I have succeeded in doing is logging in, but that was kind of a hack, because I relied on the fact that the field I need to fill in is already selected.
After typing in nick and going into lobby I want to click the button 'Create room'. Its selector:
body > div > div > div > div > div.buttons > button:nth-child(3)
const puppeteer = require('puppeteer');
// Opens the HaxBall page, types a nickname into whatever field has focus,
// presses Enter, and then tries to click the "Create room" button.
(async () => {
const browser = await puppeteer.launch({
args: ['--no-sandbox'], headless: false, slowMo: 10
});
const page = await browser.newPage();
await page.goto('http://html5.haxball.com/index.html');
// NOTE(review): `name` is not declared anywhere in this snippet — presumably defined elsewhere; confirm.
await page.keyboard.type(name);
await page.keyboard.press('Enter');
// At this point I am logged in.
let buttonSelector = 'body > div > div > div > div > div.buttons > button:nth-child(3)';
await page.waitForSelector('body > div > div');
// The callback runs inside the page, and `buttonSelector` is not passed to it as an argument.
await page.evaluate(() => {
document.querySelector(buttonSelector).click();
});
// The promise returned by close() is not awaited.
browser.close();
})();
after running such code I get error:
UnhandledPromiseRejectionWarning: Error: Evaluation failed: TypeError: Cannot read property 'click' of null
My initial approach was with:
await page.click(buttonSelector);
instead of page.evaluate but it also fails.
What frustrates me the most is the fact that when I run this in the Chromium console:
document.querySelector(buttonSelector).click();
it works fine.
A few things to note:
The selector you are using to retrieve the button is more complex than it needs to be. Try something simpler like: 'button[data-hook="create"]'.
The game is within an iframe, so you're better off calling document.querySelector using the iframe's document object as opposed to the containing window's document
The function passed to evaluate is executed in a different context than where you are running your node script. For this reason, you have to explicitly pass variables from your node script to the window script otherwise buttonSelector will be undefined:
Making the changes above, your code will input your name and successfully click on "Create Room":
const puppeteer = require('puppeteer');
// Opens the HaxBall page, logs in with a nickname, and clicks "Create Room"
// inside the game's iframe.
(async () => {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox'], headless: false, slowMo: 10
  });
  // Close the browser even if a step below throws, and wait for it to finish closing.
  try {
    const page = await browser.newPage();
    await page.goto('http://html5.haxball.com/index.html');
    await page.keyboard.type('Chris');
    await page.keyboard.press('Enter');
    // At this point I am logged in.
    const buttonSelector = 'button[data-hook="create"]';
    await page.waitForSelector('body > div > div');
    // The callback runs inside the page, so the selector is passed in as an argument.
    await page.evaluate((buttonSelector) => {
      // The game lives in an iframe; query its document rather than the outer one.
      const frame = document.querySelector('iframe');
      const frameDocument = frame.contentDocument;
      frameDocument.querySelector(buttonSelector).click();
    }, buttonSelector);
  } finally {
    await browser.close();
  }
})();

Categories

Resources