How to scrape content from a dynamic table using Python?

I'm trying to extract the RSI indicator shown on this page under the 'Oscillators' tab.
URL: https://in.tradingview.com/markets/stocks-india/market-movers-active/
I know that I'll have to use something like Selenium to access the tab first, but how do I access the 'Oscillators' div?
I'll need to use Selenium, and then I could use BeautifulSoup to find the right tags and data, right?
Edit -
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from time import sleep
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd

# create object for chrome options
chrome_options = Options()
base_url = 'https://in.tradingview.com/markets/stocks-india/market-movers-active/'
# To disable the message "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block notifications
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})
# invoke the webdriver
browser = webdriver.Chrome(executable_path=r'/Users/judhjitganguli/Downloads/chromedriver',
                           options=chrome_options)
browser.get('chrome://settings/')
browser.execute_script('chrome.settingsPrivate.setDefaultZoom(0.5);')
browser.get(base_url)
delay = 5  # seconds

while True:
    try:
        # find tab/button
        osiButton = browser.find_element_by_css_selector('.tv-screener-toolbar__favorites div div div:nth-child(8)')
        print('button text: ' + osiButton.text)
        osiButton.click()
        WebDriverWait(browser, 9).until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, 'th:nth-child(2) .js-head-title'), "OSCILLATORS RATING"))
        # table updated, get the data
        for row in browser.find_elements_by_css_selector(".tv-data-table__tbody tr"):
            print(row.text)
        # for cell in browser.find_elements_by_css_selector('td'):
        #     print(cell.text)
    except Exception as ex:
        print(ex)

# close the automated browser
browser.close()
In the output I get the required data, but the loop runs forever. How do I get it into a pandas DataFrame?

After 'Oscillators' is clicked, wait and monitor the element th:nth-child(2) .js-head-title for its text to change from "Last" to "Oscillators Rating" using WebDriverWait:
# if running headless, make sure to add this argument
# or the Oscillators tab will not be visible and can't be clicked
# chrome_options.add_argument("window-size=1980,960")
try:
    # find tab/button
    osiButton = driver.find_element_by_css_selector('.tv-screener-toolbar__favorites div div div:nth-child(8)')
    print('button text: ' + osiButton.text)
    osiButton.click()
    WebDriverWait(driver, 9).until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'th:nth-child(2) .js-head-title'), "OSCILLATORS RATING"))
    # table updated, get the data
    for row in driver.find_elements_by_css_selector('.tv-data-table__tbody tr'):
        print(row.text)
    # for cell in driver.find_elements_by_css_selector('td'):
    #     print(cell.text)
except Exception as ex:
    print(ex)
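For the pandas part of the question: once the wait above has succeeded, the cell texts can be collected row by row and loaded into a DataFrame. A minimal sketch, assuming each row exposes its cells as td elements (the DataFrame has no column names here, since TradingView's real headers aren't in the snippet):
import pandas as pd

# collect one list of cell texts per rendered table row
rows = []
for row in driver.find_elements_by_css_selector('.tv-data-table__tbody tr'):
    rows.append([td.text for td in row.find_elements_by_css_selector('td')])

df = pd.DataFrame(rows)  # optionally pass columns=[...] once the headers are known
print(df.head())
Alternatively, pandas.read_html(driver.page_source) can often parse a rendered table element directly, which avoids walking the cells by hand.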

Related

Collecting links from a JS-Based Webpage using Selenium

I need to collect all links from a webpage as seen below (25 links from each of 206 pages, around 5200 links in total), which also has a 'load more news' button (shown as three dots). I wrote the script below, but it does not return any of the links I'm trying to collect. I updated some Selenium attributes. I really don't know why I can't get the links.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By

# Initialize the Chrome driver
driver = webdriver.Chrome()
driver.get("https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4")
page_count = driver.find_element(By.XPATH, "//span[@class='rgInfoPart']")
text = page_count.text
page_count = int(text.split()[-1])
links = []
for i in range(1, page_count + 1):
    # Click on the page number
    driver.find_element(By.XPATH, f"//a[text()='{i}']").click()
    time.sleep(5)
    # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Extract the links from the page
    page_links = soup.find_all('div', {'class': 'sub_lstitm'})
    for link in page_links:
        links.append("https://www.mfa.gov.tr" + link.find('a')['href'])
    time.sleep(5)
driver.quit()
print(links)
I tried to run my code, but it didn't work. I need a solution for this.
You can easily do everything in Selenium using the following method:
1. Wait for the links to be visible on the page.
2. Get the titles and URLs.
3. Get the current page number.
4. If there is a button for the next page, click it and repeat from step 1; otherwise we are on the last page and execution ends.
What follows is the complete code to scrape all 206 pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4")
titles, urls = [], []
while True:
    print('current page:', driver.find_element(By.CSS_SELECTOR, 'td span').text, end='\r')
    # wait for the links of the current page to be visible
    links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.sub_lstitm a")))
    for link in links:
        titles.append(link.text)
        urls.append(link.get_attribute('href'))
    try:
        # click the "next page" button (the cell right after the current page number)
        driver.find_element(By.XPATH, '//td//span/parent::td/following-sibling::td[1]').click()
    except:
        print('next page button not found')
        break

for i in range(len(titles)):
    print(titles[i], '\n', urls[i], '\n')
Using only Selenium, you can easily collect all the links from the webpage by inducing WebDriverWait for visibility_of_all_elements_located(), using either of the following locator strategies:
Using CSS_SELECTOR:
driver.get('https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4')
print([my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.sub_lstitm > a")))])
Using XPATH:
driver.get('https://www.mfa.gov.tr/sub.en.mfa?ad9093da-8e71-4678-a1b6-05f297baadc4')
print([my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='sub_lstitm']/a")))])
Console Output:
['https://www.mfa.gov.tr/no_-17_-turkiye-ve-yemen-arasinda-gerceklestirilecek-konsolosluk-istisareleri-hk.en.mfa', 'https://www.mfa.gov.tr/no_-16_-sayin-bakanimizin-abd-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-15_-iran-islam-cumhuriyeti-disisleri-bakani-huseyin-emir-abdullahiyan-in-ulkemize-yapacagi-ziyaret-hk.en.mfa', 'https://www.mfa.gov.tr/no_-14_-nepal-de-meydana-gelen-ucak-kazasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-13_-bosna-hersek-bakanlar-konseyi-baskan-yrd-ve-disisleri-bakani-bisera-turkovic-in-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-12_-turkiye-iran-konsolosluk-istisareleri-hk.en.mfa', 'https://www.mfa.gov.tr/no_-11_-kuzey-kibris-turk-cumhuriyeti-kurucu-cumhurbaskani-sayin-rauf-raif-denktas-in-vefatinin-on-birinci-yildonumu-hk.en.mfa', 'https://www.mfa.gov.tr/no_-10_-italya-basbakan-yardimcisi-ve-disisleri-ve-uluslararasi-isbirligi-bakani-antonio-tajani-nin-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-9_-kirim-tatar-soydaslarimiz-hakkinda-mahkumiyet-karari-verilmesi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-8_-kuzeybati-suriye-ye-yonelik-bm-sinir-otesi-insani-yardim-mekanizmasinin-uzatilmasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-7_-brezilya-da-devlet-baskani-lula-da-silva-hukumeti-ni-ve-demokratik-kurumlari-hedef-alan-siddet-olaylari-hk.en.mfa', 'https://www.mfa.gov.tr/no_-6_-sudan-daki-gelismeler-hk.en.mfa', 'https://www.mfa.gov.tr/no_-5_-senegal-in-gniby-kentinde-meydana-gelen-kaza-hk.en.mfa', 'https://www.mfa.gov.tr/no_-4_-sayin-bakanimizin-afrika-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-3_-deas-teror-orgutu-ile-iltisakli-bir-sebekenin-malvarliklarinin-abd-makamlari-ile-eszamanli-olarak-dondurulmasi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-2_-somali-de-meydana-gelen-teror-saldirisi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-1_-israil-ulusal-guvenlik-bakani-itamar-ben-gvir-in-mescid-i-aksa-ya-baskini--hk.en.mfa', 'https://www.mfa.gov.tr/no_-386_-sayin-bakanimizin-brezilya-yi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/sc_-32_-gkry-nin-dogu-akdeniz-de-devam-eden-hidrokarbon-faaliyetleri-hk-sc.en.mfa', 'https://www.mfa.gov.tr/no_-385_-afganistan-da-yuksekogretimde-kiz-ogrencilere-getirilen-egitim-yasagi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-384_-isvec-disisleri-bakani-tobias-billstrom-un-turkiye-yi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-383_-yemen-cumhuriyeti-disisleri-ve-yurtdisindaki-yemenliler-bakani-dr-ahmed-awad-binmubarak-in-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-382_-gambiya-disisleri-uluslararasi-isbirligi-ve-yurtdisindaki-gambiyalilar-bakani-nin-ulkemizi-ziyareti-hk.en.mfa', 'https://www.mfa.gov.tr/no_-381_-bosna-hersek-e-ab-adaylik-statusu-verilmesi-hk.en.mfa', 'https://www.mfa.gov.tr/no_-380_-turkiye-meksika-ust-duzey-iki-uluslu-komisyonu-siyasi-komitesinin-ikinci-toplantisinin-duzenlenmesi-hk.en.mfa']
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

JS/AJAX content not loading when URL is accessed using Selenium (Python)

I'm trying to scrape this URL: https://www.wheel-size.com/size/acura/mdx/2001/
The values that I want to scrape are loaded dynamically, e.g. Center Bore.
If you open the link in a normal browser the content loads just fine, but if I use Selenium (chromedriver) the page just keeps loading and the values are never displayed.
Any idea how I can scrape it?
You can see the loading state for 1-2 seconds when you open the link in a normal browser.
To extract the desired texts, e.g. 64.1 mm, 5x114.3, etc., as the elements are Google Tag Manager enabled elements you need to induce WebDriverWait for visibility_of_element_located(), and you can use the following locator strategies:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
s = Service('C:\\BrowserDrivers\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get('https://www.wheel-size.com/size/acura/mdx/2001/')
print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[contains(., 'Center Bore')]//following::span[1]"))).text)
print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[contains(., 'PCD')]//following::span[1]"))).text)
Console Output:
64.1 mm
5x114.3
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
You can find a relevant discussion in How to retrieve the text of a WebElement using Selenium - Python

How to scrape links that do not have an href and are not available in the page source

I'm trying to use Selenium WebDriver to extract data, and I see that one of the links I want to click does not have an href. The HTML tags I see in inspect element are also not available in the page source. I badly want the link to be clicked so I can proceed to the next page.
The anchor tag that I see during inspect is as below, and it seems to use AngularJS:
<a id="docs" ng-click="changeFragment('deal.docs')">
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('URL here')
time.sleep(5)  # Let the user actually see something!
username = driver.find_element_by_name('USERID')
username.send_keys('12345')
password = driver.find_element_by_name('PASSWORD')
password.send_keys('password')
# search_box.submit()
driver.find_element_by_id("submitInput").submit()
time.sleep(5)  # Let the user actually see something!
lnum = driver.find_element_by_name('Number')
lnum.send_keys('0589403823')
checkbox = driver.find_element_by_name('includeInactiveCheckBox').click()
driver.find_element_by_id("searchButton").click()
time.sleep(5)
driver.execute_script("changeFragment('deal.docs')").click()
driver.quit()
I tried to use find element by XPath and execute_script, but neither worked.
The URL I'm trying to access can't be shared, as it can only be reached through a specific network.
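Since the anchor has an id, a minimal sketch of one way to attempt the click; the id docs comes from the markup above, everything else is an assumption. Note that execute_script returns the script's result (here None), so .click() must be called on the element, not on the return value of execute_script:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the ng-click anchor to be clickable, then click the element itself
docs_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "docs")))
docs_link.click()

# fallback: dispatch the click via JavaScript so Angular's ng-click handler fires
driver.execute_script("arguments[0].click();", docs_link)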

Save section of webpage as HTML

I'm trying to save a portion of a webpage as an HTML file.
I can do it manually like this:
when I press F12 (developer tools) in Chrome or Mozilla and use the selector to pick the part of the website I want, I see a div and I copy its XPath. Then I copy the HTML of that element, paste it into a notepad editor and save it as HTML.
I've used Selenium IDE before, but I can't find a way to save the content of that div's XPath.
Is there a way to do it with a combination of Selenium IDE and JavaScript or Python?
Maybe someone could suggest how to achieve this.
Thanks
This is just a Selenium example, not your particular answer.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import random

seed = 1
random.seed(seed)
driver = webdriver.Chrome()
driver.get("https://www.myntra.com/")
element = driver.find_element_by_xpath("//*[@id='desktop-header-cnt']/div[2]/div[3]/input")
# Put the word "pantaloons" in the search box and hit enter
element.send_keys("pantaloons")
element.send_keys(Keys.RETURN)
time.sleep(3)

for i in range(1000):
    time.sleep(1)

# scroll down by sending arrow-down keys
for i in range(120):
    actions = ActionChains(driver)
    actions.send_keys(Keys.ARROW_DOWN)
    actions.perform()
    time.sleep(0.10)

element = driver.find_element_by_xpath("//*[@id='desktopSearchResults']/div[2]/section/div[2]/ul/li[12]/a")
element.click()
time.sleep(1)

# # Get a list of elements (videos) that get returned by the search
# search_results = driver.find_elements_by_id("video-title")
#
# # Click randomly on one of the first five results
# search_results[random.randint(0, 10)].click()
#
# # Go to the end of the page (I don't know if this is necessary)
# time.sleep(4)
#
# # Get the recommended videos the same way as above. This is where the problem starts,
# # because recommended_videos essentially becomes the same thing as the previous page's
# # search_results, even though the browser is in a new page now.
# while True:
#     recommended_videos = driver.find_elements_by_xpath("//*[@id='dismissable']/div/a")
#     print(recommended_videos)
#     recommended_videos[random.randint(1, 4)].click()
#     time.sleep(4)
You can try to dump the page source and parse it, or dump only the element's source.
Page source into a pageSource variable (Java):
String pageSource = driver.getPageSource();
Element source into an elementSource variable (Java):
WebElement element = driver.findElement(By.id("id"));
String elementSource = element.getAttribute("innerHTML");
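Since the question asks about Python, here is the same idea as the Java snippets above as a minimal Python sketch, writing one element's HTML to a file; the URL and XPath are placeholders for the page and div you actually want:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/")  # placeholder URL

# locate the div via the XPath copied from the developer tools (placeholder)
element = driver.find_element(By.XPATH, "//div[@id='content']")

# outerHTML includes the element's own tag; innerHTML would return only its children
with open("section.html", "w", encoding="utf-8") as f:
    f.write(element.get_attribute("outerHTML"))

driver.quit()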

Selenium Webdriver not executing JavaScript

I'm trying to scrape data from an AliExpress product page (example).
I need this section (the transaction history).
My code:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
my_url = 'https://www.aliexpress.com/item/Cosmetic-Brush-Makeup-Blusher-Eye-Shadow-Kabuki-Brushes-Set-Tool-Kit-22pcs/32765190537.html?ws_ab_test=searchweb0_0'
chrome_options = Options()
chrome_options.add_argument("--enable-javascript")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(my_url)
innerHTML = driver.execute_script('return document.getElementsByTagName("html")[0].innerHTML')
page_html = driver.page_source
When I run
document.getElementsByTagName("html")[0].innerHTML
in the Chrome console, I get the entire HTML, including the section that I need.
But the innerHTML object gives me the same HTML as driver.page_source (without the section that I need).
As far as I know this section is not inside an iframe.
Some help please :-)
You probably want to look for this specific table. Using
innerHTML = document.querySelectorAll('table.transaction-feedback-table');
will probably find it.
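A minimal sketch of running that same query from Selenium instead of the browser console; the class transaction-feedback-table comes from the line above, the rest is an assumption:
# run the querySelector lookup inside the page and return the table's HTML
table_html = driver.execute_script(
    "var t = document.querySelector('table.transaction-feedback-table');"
    "return t ? t.outerHTML : null;")
print(table_html)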
The transactions are generated after the element with ID j-transaction-feedback becomes visible; you have to scroll to the element and wait for the Ajax request to finish.
from selenium.webdriver.support.ui import WebDriverWait
....
....
driver.get(my_url)
# scroll to the element
driver.find_element_by_css_selector('#j-transaction-feedback').location_once_scrolled_into_view
# wait until the Ajax request finishes and the element is rendered
transaction = WebDriverWait(driver, 15).until(
    lambda d: d.find_element_by_css_selector('.transaction-feedback-content')
)
total_transaction = driver.find_element_by_css_selector('#j-transaction-feedback .text')
page_source = driver.page_source
print('total_transaction: ' + total_transaction.text)
