JavaScript-generated content detection using BeautifulSoup and Selenium

I'm trying to get all the books regarding computer science from Pearson's website (starting from this url: https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html) but the list of books in each category is generated via javascript.
I've tried to use Selenium to open the page and then parse it with BeautifulSoup. After I open a category page I can't find the tag that contains all the info about a book.
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Safari()
driver.get('https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html')
wait = WebDriverWait(driver, 2)
content = driver.page_source
soup = BeautifulSoup(content)
#first I loop through categories
categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
for i in range(len(categories)):
    print('CATEGORY : {}/170'.format(i+1))
    categories[i].click()
    while next_page_link != None:
        WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.CLASS_NAME, "content-tile-book-box")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print(soup.findAll('li', attrs={'class':'content-tile-book-box visible'}))  # it results always empty
        for a in soup.findAll('li', attrs={'class':'content-tile-book-box visible'}):
            # I would like to have access to the books' links
            book_title_link = a.find_element_by_xpath('/div[@class="wrap-list-block"]//a')
        # loop through all the book pages of the current category
        next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
        next_page_link.click()
Hope you can help me, thank you!

Since you need to navigate back and forth between pages, I have provided a Selenium-only solution here and did not use BeautifulSoup. I have also used chromedriver.
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
driver = webdriver.Chrome(executable_path='C:\\Selenium\\chromedriver.exe')
url = 'https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html'
driver.get(url)
#first I loop through categories
categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
Total_Category = len(categories)
for i in range(Total_Category):
    WebDriverWait(driver, 10).until(ec.visibility_of_all_elements_located((By.XPATH, '//ul[@class="category-child-list-level-2"]//a')))
    categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
    print('CATEGORY : {}/170'.format(i+1))
    categories[i].click()
    print("Category: " + categories[i].text)
    try:
        # loop through all the book pages of the current category
        WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//a[@aria-label='Next']")))
        next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
        while next_page_link != None:
            WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.CLASS_NAME, "content-tile-book-box")))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            #print(soup.findAll('li', attrs={'class':'content-tile-book-box visible'}))  # it results always empty
            #for a in soup.findAll('li', attrs={'class':'content-tile-book-box visible'}):
            #    # I would like to have access to the books' links
            #    book_title_link = a.find_element_by_xpath('//div[@class="wrap-list-block"]//a')
            WebDriverWait(driver, 10).until(ec.visibility_of_any_elements_located((By.XPATH, "//div[@class='product-search-results-list section']//li")))
            links = driver.find_elements_by_xpath('//div[@class="wrap-list-block"]//a')
            print(len(links))
            book_links = [link.get_attribute('href') for link in links]
            #for link in links:
            print(book_links)
            try:
                next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
            except NoSuchElementException as exception:
                print("Reached end of all books in this category")
                driver.get(url)  # go back to the main listing
                break
            next_page_link.click()
    except TimeoutException as exception:
        print("Next button is not available")
        WebDriverWait(driver, 10).until(ec.visibility_of_any_elements_located((By.XPATH, "//div[@class='product-search-results-list section']//li")))
        links = driver.find_elements_by_xpath('//div[@class="wrap-list-block"]//a')
        print(len(links))
        book_links = [link.get_attribute('href') for link in links]
        #for link in links:
        print(book_links)
        driver.get(url)  # go back to the main listing
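Note: the find_element_by_* / find_elements_by_* helpers used above are deprecated and have been removed in newer Selenium 4 releases. A minimal sketch of the equivalent lookups with the By API, using the same XPaths as above and nothing else changed:
from selenium.webdriver.common.by import By
categories = driver.find_elements(By.XPATH, '//ul[@class="category-child-list-level-2"]//a')
next_page_link = driver.find_element(By.XPATH, '//a[@aria-label="Next"]')
links = driver.find_elements(By.XPATH, '//div[@class="wrap-list-block"]//a')
book_links = [link.get_attribute('href') for link in links]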

Related

Collecting links from a JS-Based Webpage using Selenium

I need to collect all the links from a webpage as seen below (25 links from each of 206 pages, around 5,200 links in total). The page also has a 'load more news' button (shown as three dots). I wrote my script, but it does not give any of the links I tried to collect. I updated some of the Selenium attributes. I really don't know why I could not get all the links.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver import Chrome
#Initialize the Chrome driver
driver = webdriver.Chrome()
driver.get("https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4")
page_count = driver.find_element(By.XPATH, "//span[@class='rgInfoPart']")
text = page_count.text
page_count = int(text.split()[-1])
links = []
for i in range(1, page_count + 1):
    # Click on the page number
    driver.find_element(By.XPATH, f"//a[text()='{i}']").click()
    time.sleep(5)
    # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Extract the links from the page
    page_links = soup.find_all('div', {'class': 'sub_lstitm'})
    for link in page_links:
        links.append("https://www.mfa.gov.tr" + link.find('a')['href'])
    time.sleep(5)
driver.quit()
print(links)
I tried to run my code, but it did not work as expected. I need a solution for this.
You can easily do everything in Selenium using the following method:
1. Wait for the links to be visible on the page.
2. Get the titles and URLs.
3. Get the current page number.
4. If there is a button for the next page, click it and repeat from step 1; otherwise we are on the last page and execution ends.
What follows is the complete code to scrape all 206 pages.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4")
titles, urls = [], []
while 1:
    print('current page:', driver.find_element(By.CSS_SELECTOR, 'td span').text, end='\r')
    links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.sub_lstitm a")))
    for link in links:
        titles.append( link.text )
        urls.append( link.get_attribute('href') )
    try:
        driver.find_element(By.XPATH, '//td//span/parent::td/following-sibling::td[1]').click()
    except:
        print('next page button not found')
        break

for i in range(len(titles)):
    print(titles[i], '\n', urls[i], '\n')
Using only Selenium you can easily collect all the links from the webpage by inducing WebDriverWait for visibility_of_all_elements_located(), and you can use either of the following locator strategies:
Using CSS_SELECTOR:
driver.get('https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4')
print([my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.sub_lstitm > a")))])
Using XPATH:
driver.get('https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4')
print([my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='sub_lstitm']/a")))])
Console Output:
['https://www.mfa.gov.tr/no_-17_-turkiye-ve-yemen-arasinda-gerceklestirilecek-konsolosluk-istisareleri-hk.en.mfa', 'https://www.mfa.gov.tr/no_-16_-sayin-bakanimizin-abd-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-15_-iran-islam-cumhuriyeti-disisleri-bakani-huseyin-emir-abdullahiyan-in-ulkemize-yapacagi-ziyaret-hk.en.mfa', 'https://www.mfa.gov.tr/no_-14_-nepal-de-meydana-gelen-ucak-kazasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-13_-bosna-hersek-bakanlar-konseyi-baskan-yrd-ve-disisleri-bakani-bisera-turkovic-in-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-12_-turkiye-iran-konsolosluk-istisareleri-hk.en.mfa', 'https://www.mfa.gov.tr/no_-11_-kuzey-kibris-turk-cumhuriyeti-kurucu-cumhurbaskani-sayin-rauf-raif-denktas-in-vefatinin-on-birinci-yildonumu-hk.en.mfa', 'https://www.mfa.gov.tr/no_-10_-italya-basbakan-yardimcisi-ve-disisleri-ve-uluslararasi-isbirligi-bakani-antonio-tajani-nin-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-9_-kirim-tatar-soydaslarimiz-hakkinda-mahkumiyet-karari-verilmesi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-8_-kuzeybati-suriye-ye-yonelik-bm-sinir-otesi-insani-yardim-mekanizmasinin-uzatilmasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-7_-brezilya-da-devlet-baskani-lula-da-silva-hukumeti-ni-ve-demokratik-kurumlari-hedef-alan-siddet-olaylari-hk.en.mfa', 'https://www.mfa.gov.tr/no_-6_-sudan-daki-gelismeler-hk.en.mfa', 'https://www.mfa.gov.tr/no_-5_-senegal-in-gniby-kentinde-meydana-gelen-kaza-hk.en.mfa', 'https://www.mfa.gov.tr/no_-4_-sayin-bakanimizin-afrika-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-3_-deas-teror-orgutu-ile-iltisakli-bir-sebekenin-malvarliklarinin-abd-makamlari-ile-eszamanli-olarak-dondurulmasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-2_-somali-de-meydana-gelen-teror-saldirisi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-1_-israil-ulusal-guvenlik-bakani-itamar-ben-gvir-in-mescid-i-aksa-ya-baskini--hk.en.mfa', 'https://www.mfa.gov.tr/no_-386_-sayin-bakanimizin-brezilya-yi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/sc_-32_-gkry-nin-dogu-akdeniz-de-devam-eden-hidrokarbon-faaliyetleri-hk-sc.en.mfa', 'https://www.mfa.gov.tr/no_-385_-afganistan-da-yuksekogretimde-kiz-ogrencilere-getirilen-egitim-yasagi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-384_-isvec-disisleri-bakani-tobias-billstrom-un-turkiye-yi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-383_-yemen-cumhuriyeti-disisleri-ve-yurtdisindaki-yemenliler-bakani-dr-ahmed-awad-binmubarak-in-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-382_-gambiya-disisleri-uluslararasi-isbirligi-ve-yurtdisindaki-gambiyalilar-bakani-nin-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-381_-bosna-hersek-e-ab-adaylik-statusu-verilmesi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-380_-turkiye-meksika-ust-duzey-iki-uluslu-komisyonu-siyasi-komitesinin-ikinci-toplantisinin-duzenlenmesi-hk.en.mfa']
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

Using Selenium to scrape webpage with javascript

I want to scrape a Google Scholar page with a 'show more' button. I understand from my previous question that it is not plain HTML but JavaScript-generated, and that there are several ways to scrape such pages. I tried Selenium with the following code.
from selenium import webdriver
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
chrome_path = r"....path....."
driver = webdriver.Chrome(chrome_path)
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
driver.find_element_by_xpath('/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button/span/span[2]').click()
soup = BeautifulSoup(driver.page_source,'html.parser')
papers = soup.find_all('tr',{'class':'gsc_a_tr'})
for paper in papers:
    title = paper.find('a',{'class':'gsc_a_at'}).text
    author = paper.find('div',{'class':'gs_gray'}).text
    journal = [a.text for a in paper.select("td:nth-child(1) > div:nth-child(3)")]
    print('Paper Title:', title, '\nAuthor:', author, '\nJournal:', journal)
The browser now clicks the 'show more' button and displays the entire page, but I am still getting the information only for the first 20 papers. I don't understand why. Please help!
Thanks!
I believe your problem is that the new elements haven't completely loaded in when your program checks the website. Try importing time and then sleeping for a few seconds, like this (I removed the headless features so you can see the program work):
from selenium import webdriver
import time
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
driver = webdriver.Chrome()
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
time.sleep(3)
driver.find_element_by_id("gsc_bpf_more").click()
time.sleep(4)
soup = BeautifulSoup(driver.page_source, 'html.parser')
papers = soup.find_all('tr', {'class': 'gsc_a_tr'})
for paper in papers:
    title = paper.find('a', {'class': 'gsc_a_at'}).text
    author = paper.find('div', {'class': 'gs_gray'}).text
    journal = [a.text for a in paper.select("td:nth-child(1) > div:nth-child(3)")]
    print('Paper Title:', title, '\nAuthor:', author, '\nJournal:', journal)
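A hedged alternative to the fixed sleeps above: wait explicitly until extra rows have actually been appended after clicking 'show more'. This is only a sketch; the gsc_bpf_more id and gsc_a_tr class are taken from the code above.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
wait = WebDriverWait(driver, 10)
# count the rows, click 'show more', then wait until more rows exist
rows_before = len(driver.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr"))
wait.until(EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))).click()
wait.until(lambda d: len(d.find_elements(By.CSS_SELECTOR, "tr.gsc_a_tr")) > rows_before)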
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
driver.get("https://scholar.google.com/citations?user=TBcgGIIAAAAJ&hl=en")
# Awkward method
# Loading all available articles and then iterating over them
for i in range(1, 3):
driver.find_element_by_css_selector('#gsc_bpf_more').click()
# waits until elements are loaded
time.sleep(3)
# Container where all data located
for result in driver.find_elements_by_css_selector('#gsc_a_b .gsc_a_t'):
title = result.find_element_by_css_selector('.gsc_a_at').text
authors = result.find_element_by_css_selector('.gsc_a_at+ .gs_gray').text
publication = result.find_element_by_css_selector('.gs_gray+ .gs_gray').text
print(title)
print(authors)
print(publication)
# just for separating purpose
print()
Part of the output:
Tax/subsidy policies in the presence of environmentally aware consumers
S Bansal, S Gangopadhyay
Journal of Environmental Economics and Management 45 (2), 333-355
Choice and design of regulatory instruments in the presence of green consumers
S Bansal
Resource and Energy economics 30 (3), 345-368

How to scrape content from a dynamic table using python?

I'm trying to extract the RSI indicator present on this page under the 'Oscillators' tab.
URL : https://in.tradingview.com/markets/stocks-india/market-movers-active/
I know that I'll have to use something like Selenium to access the tab first, but how do I access the 'Oscillators' div?
I'll need to use Selenium, and then I could use BeautifulSoup to find the right tags and data, right?
Edit -
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from time import sleep
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
# create object for chrome options
chrome_options = Options()
base_url = 'https://in.tradingview.com/markets/stocks-india/market-movers-active/'
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})
# invoke the webdriver
browser = webdriver.Chrome(executable_path=r'/Users/judhjitganguli/Downloads/chromedriver',
                           options=chrome_options)
browser.get('chrome://settings/')
browser.execute_script('chrome.settingsPrivate.setDefaultZoom(0.5);')
browser.get(base_url)
delay = 5  # seconds
while True:
    try:
        # find tab/button
        osiButton = browser.find_element_by_css_selector('.tv-screener-toolbar__favorites div div div:nth-child(8)')
        print('button text: ' + osiButton.text)
        osiButton.click()
        WebDriverWait(browser, 9).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'th:nth-child(2) .js-head-title'), "OSCILLATORS RATING"))
        # table updated, get the data
        for row in browser.find_elements_by_css_selector(".tv-data-table__tbody tr"):
            print(row.text)
        #for cell in browser.find_elements_by_css_selector('td'):
        #    print(cell.text)
    except Exception as ex:
        print(ex)
# close the automated browser
browser.close()
In the output, I get the required data but it is an infinite loop. How do I get it into a pandas df?
After 'Oscillators' is clicked, wait and monitor the element th:nth-child(2) .js-head-title for its text to change from "Last" to "Oscillators Rating" using WebDriverWait:
# if running headless make sure to add this argument
# or the oscillators tab will not be visible or can't be clicked
#chrome_options.add_argument("window-size=1980,960")
try:
    # find tab/button
    osiButton = driver.find_element_by_css_selector('.tv-screener-toolbar__favorites div div div:nth-child(8)')
    print('button text: ' + osiButton.text)
    osiButton.click()
    WebDriverWait(driver, 9).until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'th:nth-child(2) .js-head-title'), "OSCILLATORS RATING"))
    # table updated, get the data
    for row in driver.find_elements_by_css_selector('.tv-data-table__tbody tr'):
        print(row.text)
    #for cell in driver.find_elements_by_css_selector('td'):
    #    print(cell.text)
except Exception as ex:
    print(ex)
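To address the follow-up about getting the rows into a pandas DataFrame: a minimal sketch, building on the question's browser session and the same row selector, assuming each row exposes its cells as td elements. The column names are illustrative, not taken from the site, and the while True loop should be broken out of once the rows have been collected so it does not repeat forever.
import pandas as pd
rows = []
for row in browser.find_elements_by_css_selector('.tv-data-table__tbody tr'):
    cells = [cell.text for cell in row.find_elements_by_css_selector('td')]
    if cells:
        rows.append(cells)
# optionally pass columns=[...] read from the table header
df = pd.DataFrame(rows)
print(df.head())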

How to scrape links that do not have an href and are not available in the page source

I'm trying to use the Selenium web driver to extract data, and I see that one of the links I want to click does not have an href. The HTML tags I see in inspect element are also not available in the page source. I badly want the link to be clicked so I can proceed to the next page.
The anchor tag that I see during inspect is as below, and it seems to involve AngularJS:
<a id="docs" ng-click="changeFragment('deal.docs')">
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('URL here');
time.sleep(5) # Let the user actually see something!
username = driver.find_element_by_name('USERID')
username.send_keys('12345')
password = driver.find_element_by_name('PASSWORD')
password.send_keys('password')
#search_box.submit()
driver.find_element_by_id("submitInput").submit()
time.sleep(5) # Let the user actually see something!
lnum = driver.find_element_by_name('Number')
lnum.send_keys('0589403823')
checkbox = driver.find_element_by_name('includeInactiveCheckBox').click()
driver.find_element_by_id("searchButton").click()
time.sleep(5)
driver.execute_script("changeFragment('deal.docs')").click()
driver.quit()
I tried to use find element by XPath and execute_script, but neither worked.
The URL I'm trying to access can't be shared, as it can only be accessed through a specific network.
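A hedged illustration (not from the original thread) of one way to trigger such an ng-click link: since the anchor has id="docs", it can be located by id and clicked once it is clickable, letting Angular run the handler; if Selenium refuses the click, the click can be dispatched on the element from JavaScript instead. The wait time and the post-login page state are assumptions.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
docs_link = wait.until(EC.element_to_be_clickable((By.ID, "docs")))
docs_link.click()  # ng-click fires on a normal click; no href is needed
# fallback: dispatch the click from JavaScript on the located element
# driver.execute_script("arguments[0].click();", driver.find_element(By.ID, "docs"))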

Scraping elements rendered using React JS with BeautifulSoup

I want to scrape anchor links with class="_1UoZlX" from the search results on this particular page - https://www.flipkart.com/search?as=on&as-pos=1_1_ic_sam&as-show=on&otracker=start&page=6&q=samsung+mobiles&sid=tyy%2F4io
When I created a soup from the page I realised that the search results are being rendered using React JS and hence I can't find them in the page source (or in the soup).
Here's my code
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
listUrls = ['https://www.flipkart.com/search?as=on&as-pos=1_1_ic_sam&as-show=on&otracker=start&page=6&q=samsung+mobiles&sid=tyy%2F4iof']
PHANTOMJS_PATH = './phantomjs'
browser = webdriver.PhantomJS(PHANTOMJS_PATH)
urls=[]
for url in listUrls:
    browser.get(url)
    wait = WebDriverWait(browser, 20)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "_1UoZlX")))
    soup = BeautifulSoup(browser.page_source,"html.parser")
    results = soup.findAll('a',{'class':"_1UoZlX"})
    for result in results:
        link = result["href"]
        print link
        urls.append(link)
print urls
This is the error I'm getting.
Traceback (most recent call last):
  File "fetch_urls.py", line 19, in <module>
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "_1UoZlX")))
  File "/usr/local/lib/python2.7/site-packages/selenium/webdriver/support/wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen
Someone mentioned in this answer that there is a way to use selenium to process the javascript on a page. Can someone elaborate on that? I did some googling but couldn't find an approach that works for this particular case.
There is no problem with your code, but with the website you are scraping: it does not stop loading for some reason, and that prevents the parsing of the page and the subsequent code you wrote from running.
I tried with Wikipedia to confirm the same:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
listUrls = ["https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"]
# browser = webdriver.PhantomJS('/usr/local/bin/phantomjs')
browser = webdriver.Chrome("./chromedriver")
urls=[]
for url in listUrls:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source,"html.parser")
    results = soup.findAll('a',{'class':"mw-redirect"})
    for result in results:
        link = result["href"]
        urls.append(link)
print urls
Outputs:
[u'/wiki/List_of_states_and_territories_of_India_by_area', u'/wiki/List_of_Indian_states_by_GDP_per_capita', u'/wiki/Constitutional_republic', u'/wiki/States_and_territories_of_India', u'/wiki/National_Capital_Territory_of_Delhi', u'/wiki/States_Reorganisation_Act', u'/wiki/High_Courts_of_India', u'/wiki/Delhi_NCT', u'/wiki/Bengaluru', u'/wiki/Madras', u'/wiki/Andhra_Pradesh_Capital_City', u'/wiki/States_and_territories_of_India', u'/wiki/Jammu_(city)']
P.S. I'm using a chrome driver in order to run the script against the real chrome browser for debugging purposes. Download the chrome driver from https://chromedriver.storage.googleapis.com/index.html?path=2.27/
Selenium will render the page, including the JavaScript, and your code is working properly: it is waiting for the element to be generated, but in your case Selenium never gets that CSS element. The URL which you gave is not rendering the result page; instead, it is generating the following error page.
http://imgur.com/a/YwFyE
This page does not have the CSS class your code is waiting for. Try the Firefox web driver to see what is happening.
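A minimal sketch of that debugging suggestion, assuming geckodriver is installed and on PATH. Note also that By.CSS_SELECTOR treats "_1UoZlX" as a tag name, so a class selector needs a leading dot ("a._1UoZlX"); the class name itself is taken from the question and may have changed on the live site.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox()  # assumes geckodriver is available
browser.get('https://www.flipkart.com/search?as=on&as-pos=1_1_ic_sam&as-show=on&otracker=start&page=6&q=samsung+mobiles&sid=tyy%2F4io')
wait = WebDriverWait(browser, 20)
# the leading dot makes this a class selector rather than a tag name
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a._1UoZlX")))
for a in browser.find_elements(By.CSS_SELECTOR, "a._1UoZlX"):
    print(a.get_attribute("href"))
browser.quit()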
