Scraping a webpage - beginner - javascript
I am using Python on Linux. I believe I have the correct packages installed. I tried getting the content from this page, but the output is too convoluted for me to understand.
When I inspect the HTML in the browser, I can see the actual page information if I drill down far enough: for example, 'Afghanistan' and 'Chishti Sufis' appear nested in the appropriate tags. When I try to get the contents of the webpage with the code below (I've tried two methods), I instead get what looks like a script calling functions that refer to information stored elsewhere. Can someone please give me pointers on how to understand this structure so I can get the information I need? I want to extract the regions, titles and year ranges from this page (https://religiondatabase.org/browse/regions).
How can I tell whether this page allows me to extract its data, or whether it would be wrong to do so?
Thanks in advance for your help.
I tried the two approaches below. In the second approach I tried to extract the information in the script tag, but I can't understand it. I was expecting to see the actual page content nested under various HTML tags, but I don't see it.
import requests
from bs4 import BeautifulSoup   # imported for parsing, not used yet
import html5lib                 # alternative parser for BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
URL = 'https://religiondatabase.org/browse/regions'

r = requests.get(url=URL, headers=headers)
print(r.content)
**Output:**
b'<!doctype html>You need to enable JavaScript to run this app.!function(e){function r(r){for(var n,u,c=r[0],i=r1,f=r[2],p=0,s=[];p<c.length;p++)u=c[p],Object.prototype.hasOwnProperty.call(o,u)&&o[u]&&s.push(o[u][0]),o[u]=0;for(n in i)Object.prototype.hasOwnProperty.call(i,n)&&(e[n]=i[n]);for(l&&l(r);s.length;)s.shift()();return a.push.apply(a,f||[]),t()}function t(){for(var e,r=0;r<a.length;r++){for(var t=a[r],n=!0,c=1;c<t.length;c++){var i=t[c];0!==o[i]&&(n=!1)}n&&(a.splice(r--,1),e=u(u.s=t[0]))}return e}var n={},o={3:0},a=[];function u(r){if(n[r])return n[r].exports;var t=n[r]={i:r,l:!1,exports:{}};return e[r].call(t.exports,t,t.exports,u),t.l=!0,t.exports}u.e=function(e){var r=[],t=o[e];if(0!==t)if(t)r.push(t[2]);else{var n=new Promise((function(r,n){t=o[e]=[r,n]}));r.push(t[2]=n);var a,c=document.createElement("script");c.charset="utf-8",c.timeout=120,u.nc&&c.setAttribute("nonce",u.nc),c.src=function(e){return u.p+"static/js/"+({}[e]||e)+"."+{0:"e5e2acc6",1:"e2cf61a4",5:"145ce2fe",6:"c5a670f3",7:"33c0f0b5",8:"e18577cc",9:"2af95b97",10:"66591cf5",11:"ebaf6d39",12:"2c9c3ea5",13:"1f5b00d2"}[e]+".chunk.js"}(e);var i=new Error;a=function(r){c.onerror=c.onload=null,clearTimeout(f);var t=o[e];if(0!==t){if(t){var n=r&&("load"===r.type?"missing":r.type),a=r&&r.target&&r.target.src;i.message="Loading chunk "+e+" failed.\n("+n+": "+a+")",i.name="ChunkLoadError",i.type=n,i.request=a,t1}o[e]=void 0}};var f=setTimeout((function(){a({type:"timeout",target:c})}),12e4);c.onerror=c.onload=a,document.head.appendChild(c)}return Promise.all(r)},u.m=e,u.c=n,u.d=function(e,r,t){u.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},u.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},u.t=function(e,r){if(1&r&&(e=u(e)),8&r)return e;if(4&r&&"object"==typeof e&&e&&e.__esModule)return e;var t=Object.create(null);if(u.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:e}),2&r&&"string"!=typeof e)for(var n in e)u.d(t,n,function(r){return e[r]}.bind(null,n));return t},u.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return u.d(r,"a",r),r},u.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},u.p="/browse/",u.oe=function(e){throw console.error(e),e};var c=this["webpackJsonpbrowse-app"]=this["webpackJsonpbrowse-app"]||[],i=c.push.bind(c);c.push=r,c=c.slice();for(var f=0;f<c.length;f++)r(c[f]);var l=i;t()}([])'
and
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://religiondatabase.org/browse/regions')
print(r.content)

# grab the first <script> tag and print its contents
script = r.html.find('script', first=True)
print(script.text)
**Output:**
!function(e){function r(r){for(var n,u,c=r[0],i=r1,f=r[2],p=0,s=[];p<c.length;p++)u=c[p],Object.prototype.hasOwnProperty.call(o,u)&&o[u]&&s.push(o[u][0]),o[u]=0;for(n in i)Object.prototype.hasOwnProperty.call(i,n)&&(e[n]=i[n]);for(l&&l(r);s.length;)s.shift()();return a.push.apply(a,f||[]),t()}function t(){for(var e,r=0;r<a.length;r++){for(var t=a[r],n=!0,c=1;c<t.length;c++){var i=t[c];0!==o[i]&&(n=!1)}n&&(a.splice(r--,1),e=u(u.s=t[0]))}return e}var n={},o={3:0},a=[];function u(r){if(n[r])return n[r].exports;var t=n[r]={i:r,l:!1,exports:{}};return e[r].call(t.exports,t,t.exports,u),t.l=!0,t.exports}u.e=function(e){var r=[],t=o[e];if(0!==t)if(t)r.push(t[2]);else{var n=new Promise((function(r,n){t=o[e]=[r,n]}));r.push(t[2]=n);var a,c=document.createElement("script");c.charset="utf-8",c.timeout=120,u.nc&&c.setAttribute("nonce",u.nc),c.src=function(e){return u.p+"static/js/"+({}[e]||e)+"."+{0:"e5e2acc6",1:"e2cf61a4",5:"145ce2fe",6:"c5a670f3",7:"33c0f0b5",8:"e18577cc",9:"2af95b97",10:"66591cf5",11:"ebaf6d39",12:"2c9c3ea5",13:"1f5b00d2"}[e]+".chunk.js"}(e);var i=new Error;a=function(r){c.onerror=c.onload=null,clearTimeout(f);var t=o[e];if(0!==t){if(t){var n=r&&("load"===r.type?"missing":r.type),a=r&&r.target&&r.target.src;i.message="Loading chunk "+e+" failed.\n("+n+": "+a+")",i.name="ChunkLoadError",i.type=n,i.request=a,t1}o[e]=void 0}};var f=setTimeout((function(){a({type:"timeout",target:c})}),12e4);c.onerror=c.onload=a,document.head.appendChild(c)}return Promise.all(r)},u.m=e,u.c=n,u.d=function(e,r,t){u.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},u.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},u.t=function(e,r){if(1&r&&(e=u(e)),8&r)return e;if(4&r&&"object"==typeof e&&e&&e.__esModule)return e;var t=Object.create(null);if(u.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:e}),2&r&&"string"!=typeof e)for(var n in e)u.d(t,n,function(r){return e[r]}.bind(null,n));return t},u.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return u.d(r,"a",r),r},u.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},u.p="/browse/",u.oe=function(e){throw console.error(e),e};var c=this["webpackJsonpbrowse-app"]=this["webpackJsonpbrowse-app"]||[],i=c.push.bind(c);c.push=r,c=c.slice();for(var f=0;f<c.length;f++)r(c[f]);var l=i;t()}([])
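What both attempts return is the bootstrap script of a single-page app: the served HTML contains only the "You need to enable JavaScript to run this app" notice plus the webpack loader, and the regions, titles and year ranges are fetched by JavaScript after the page loads in a browser. Two things usually help here: check robots.txt to see whether fetching the path is permitted, and either look in the browser's Network tab for the JSON endpoint the app calls (and request that directly) or render the JavaScript before parsing. A minimal sketch of both steps, assuming requests_html's render() works for this site and guessing that the visible names sit in heading tags (verify the real selector in the browser):

from urllib.robotparser import RobotFileParser
from requests_html import HTMLSession

# 1) Ask robots.txt whether a generic crawler may fetch the path
rp = RobotFileParser()
rp.set_url('https://religiondatabase.org/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://religiondatabase.org/browse/regions'))

# 2) Let requests_html execute the page's JavaScript before parsing
session = HTMLSession()
r = session.get('https://religiondatabase.org/browse/regions')
r.html.render(timeout=30)      # downloads Chromium on first use, then runs the page's JS

# The selector below is a guess; inspect the rendered page to find the real
# tags/classes holding the region names, entry titles and year ranges.
for heading in r.html.find('h3'):
    print(heading.text)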
Related
Best way to get html text element on website and copy it to clipboard?
I constantly need to copy some text data from the same website. Copying it myself is tedious, and what I would love to have is this: when I open that website and a particular element (specified with XPath) has some text in it (i.e. it is not blank), a script gets the text from that element and copies it to my clipboard. It would be great if it worked in my Chrome browser. Is this something I could achieve with Python and Selenium, for example?
If you are just web scraping then you really don't need Selenium, unless you need to input something or can't get there with just the URL. Just get the HTML and extract what you need.

import pyperclip
import time
from bs4 import BeautifulSoup
import requests


def get_page_html(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
    page = requests.get(url, headers=headers)
    return page.content


def parse_html(url):
    page_contents = get_page_html(url)
    soup = BeautifulSoup(page_contents, 'html.parser')
    divs = soup.find("div", {"class": "fulfillment-fulfillment-summary"})
    return divs.encode_contents()


pyperclip.copy(parse_html(<URL>))
s = pyperclip.paste()  # if needed
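If the element only gets its text after client-side JavaScript runs (which is what the question hints at), the Selenium route with an explicit wait also works. A minimal sketch, assuming Selenium 4, a hypothetical URL and XPath, and a working Chrome/chromedriver installation:

import pyperclip
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
driver.get("https://example.com/page")                      # hypothetical URL


def non_blank(d):
    # re-find the element on each poll and only accept it once its text is non-blank
    e = d.find_element(By.XPATH, '//div[@id="target"]')     # hypothetical XPath
    return e if e.text.strip() else False


elem = WebDriverWait(driver, 15).until(non_blank)
pyperclip.copy(elem.text)                                   # put the text on the clipboard
driver.quit()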
Control JavaScript translation in Google search
The code performs a Google search using the __init__.py of googlesearch shown below:

def search(term, num_results=10, lang="en", lr="lang_en"):
    usr_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36'}

    def fetch_results(search_term, number_results, language_code):
        escaped_search_term = search_term.replace(' ', '+')
        google_url = 'https://www.google.com/search?q={}&num={}&hl={}&lr={}'.format(
            escaped_search_term, number_results + 1, language_code, lr)
        ...

Some of the returned links use JavaScript to translate the website:

<script type="text/javascript">
    var home = '/de/',
        root = '/',
        country = 'ch',
        language = 'de',
        w = {
            "download_image": "Bild download (Niedrige Qualität)",
    ...

Another example:

<script>
    dataLayer.push({
        'brand': 'Renault',
        'countryCode': 'BE',
        'googleAccount': 'UA-23041452-1',
        'adobeAccount': 'renaultbeprod',
        'languageCode': 'nl',
    ...

Is there a way to filter out the results translated through JavaScript and get search results in only one language?
My apologies, please disregard this question. I made the trivial mistake of putting a triple-quoted comment inside a function call in another part of the code, which effectively prevented the parameters from being passed. After removing the triple-quoted comment from the call below, I get results only in English:

for google_url in search(query,            # The query you want to run
                         lang='en',        # User interface language (host language)
                         num_results=10,   # Number of results per page
                         lr="lang_en"      # Language of the documents received
                         '''
                         lr - parameter is implemented in __init__.py of googlesearch
                         It should be handled only here.
                         Other useful search parameters not used yet are:
                         cr - restricts search results to documents originating in a particular country. (ex. cr=countryCA)
                         gl - boosts search results whose country of origin matches the parameter value. (ex. gl=uk)
                         '''
                         ):
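For reference, this is roughly what the corrected call looks like once the triple-quoted block is taken out of the argument list (Python concatenates adjacent string literals, so the block was silently glued onto the lr value). It assumes query holds the search term and the customized search() from the question is importable:

for google_url in search(query,            # the query you want to run
                         lang='en',        # user interface (host) language
                         num_results=10,   # number of results per page
                         lr='lang_en'):    # language of the documents received
    print(google_url)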
Scrapy: scraping JSON data from URL that is constructed with dates
I have read many posts on using Scrapy to scrape JSON data, but haven't found one with dates in the URL. I am using Scrapy 2.1.0 and am trying to scrape a site that populates its results based on date ranges in the URL. The rest of my code includes headers copied from the site I am trying to scrape, and I am using the following while loop to generate the URLs:

while start_date <= end_date:
    start_date += delta
    dates_url = (str(start_date) + "&end=" + str(start_date))
    ideas_url = base_url + dates_url
    request = scrapy.Request(
        ideas_url,
        callback=self.parse_ideas,
        headers=self.headers
    )
    print(ideas_url)
    yield request

Then I am trying to scrape with the following:

def parse_ideas(self, response):
    raw_data = response.body
    data = json.loads(raw_data)
    yield {
        'Idea': data['dates']['data']['idea']
    }

When I run the spider and export to a CSV, I keep getting this error:

File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Is this the best approach for scraping a site that uses dates in its URL to populate? And if so, what am I doing wrong with my JSON request that I am not getting any results? In case it matters, in settings.py I enabled and edited the following:

USER_AGENT = 'Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/4537.36'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False

And I added the following at the end of settings.py:

HTTPERROR_ALLOWED_CODES = [400]
DOWNLOAD_DELAY = 2
The problem is that you are trying to scrape a JavaScript app. In the HTML it says:

<noscript>You need to enable JavaScript to run this app.</noscript>

Another problem is that the app pulls its data from an API which requires authorization. So I guess your best bet is to use either Splash or Selenium to wait for the page to load and then work with the HTML they generate. Personally, I mostly use something very similar to scrapy-selenium; there is also a ready-made package for it.
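As a rough illustration of that suggestion, this is how scrapy-selenium is typically wired up so the response handed to the callback is the browser-rendered page. The settings values, the base URL, the date range and the CSS selector are all assumptions to adapt:

# settings.py (assumed values; adjust driver name and path to your setup)
# SELENIUM_DRIVER_NAME = 'chrome'
# SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/bin/chromedriver'
# SELENIUM_DRIVER_ARGUMENTS = ['--headless']
# DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}

from datetime import date, timedelta

import scrapy
from scrapy_selenium import SeleniumRequest


class IdeasSpider(scrapy.Spider):
    name = 'ideas'
    base_url = 'https://example.com/ideas?start='            # hypothetical URL pattern

    def start_requests(self):
        day = date(2020, 1, 1)                                # assumed date range
        end = date(2020, 1, 31)
        while day <= end:
            url = f'{self.base_url}{day}&end={day}'
            yield SeleniumRequest(url=url, callback=self.parse_ideas, wait_time=5)
            day += timedelta(days=1)

    def parse_ideas(self, response):
        # response holds the HTML produced by the real browser, so normal selectors work
        for idea in response.css('div.idea::text').getall():  # hypothetical selector
            yield {'idea': idea}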
Issue scraping JavaScript-generated content with Selenium and Python
I'm trying to scrape real estate data off of this website. As you can see, the relevant content is placed into <article> tags. I'm running Selenium with PhantomJS:

driver = webdriver.PhantomJS(executable_path=PJSpath)

Then I generate the URL in Python, because all search parameters are part of the link, so I can search for what I'm looking for without needing to fill out the form. Before calling driver.get(engine_link) I copy engine_link to the clipboard, and it opens fine in Chrome. Next I wait for all possible redirects to happen:

def wait_for_redirect(wdriver):
    elem = wdriver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 5:
            print("Waited for redirect for 5 seconds!")
            return
        time.sleep(1)
        try:
            elem = wdriver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

Now at last I want to iterate over all <article> tags on the current page:

for article in driver.find_elements_by_tag_name("article"):

But this loop never returns anything. The program doesn't find any article tags; I've tried it with XPath and CSS selectors too. Moreover, the articles are enclosed in a <section> tag that can't be found either. Is there a problem with this specific type of tag in Selenium, or am I missing something JS-related here? At the bottom of the page there are JavaScript templates whose naming suggests that they generate the search results. Any help appreciated!
Pretend not to be PhantomJS and add an explicit wait (worked for me):

from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# set a custom user-agent
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent

driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get("http://www.seloger.com/list.htm?cp=40250&org=advanced_search&idtt=2&pxmin=50000&pxmax=200000&surfacemin=20&surfacemax=100&idtypebien=2&idtypebien=1&idtypebien=11")

# wait for articles to be present
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))

# get articles
for article in driver.find_elements_by_tag_name("article"):
    print(article.text)
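PhantomJS has since been deprecated and removed from newer Selenium releases, so if the snippet above no longer runs, the same explicit-wait pattern works with headless Chrome. A sketch in Selenium 4 style, reusing the URL from the answer:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("--headless")        # run Chrome without a visible window
driver = webdriver.Chrome(options=options)

driver.get("http://www.seloger.com/list.htm?cp=40250&org=advanced_search&idtt=2&pxmin=50000&pxmax=200000&surfacemin=20&surfacemax=100&idtypebien=2&idtypebien=1&idtypebien=11")

# wait for the JS-generated articles to appear, then read them
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))

for article in driver.find_elements(By.TAG_NAME, "article"):
    print(article.text)

driver.quit()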
Crawling through pages with PostBack data javascript Python Scrapy
I'm crawling through some directories with ASP.NET programming via Scrapy. The pages to crawl through are encoded as such: javascript:__doPostBack('ctl00$MainContent$List','Page$X') where X is an int between 1 and 180. The MainContent argument is always the same. I have no idea how to crawl into these. I would love to add something to the SLE rules as simple as allow=('Page$') or attrs='__doPostBack', but my guess is that I have to be trickier in order to pull the info from the javascript "link." If it's easier to "unmask" each of the absolute links from the javascript code and save those to a csv, then use that csv to load requests into a new scraper, that's okay, too.
This kind of pagination is not as trivial as it may seem. It was an interesting challenge to solve. There are several important notes about the solution provided below:

- the idea is to follow the pagination page by page, passing around the current page in the Request.meta dictionary
- a regular BaseSpider is used, since there is some logic involved in the pagination
- it is important to provide headers pretending to be a real browser
- it is important to yield FormRequests with dont_filter=True, since we are basically making a POST request to the same URL but with different parameters

The code:

import re
from scrapy.http import FormRequest
from scrapy.spider import BaseSpider

HEADERS = {
    'X-MicrosoftAjax': 'Delta=true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36'
}
URL = 'http://exitrealty.com/agent_list.aspx?firstName=&lastName=&country=USA&state=NY'


class ExitRealtySpider(BaseSpider):
    name = "exit_realty"

    allowed_domains = ["exitrealty.com"]
    start_urls = [URL]

    def parse(self, response):
        # submit a form (first page)
        self.data = {}
        for form_input in response.css('form#aspnetForm input'):
            name = form_input.xpath('@name').extract()[0]
            try:
                value = form_input.xpath('@value').extract()[0]
            except IndexError:
                value = ""
            self.data[name] = value

        self.data['ctl00$MainContent$ScriptManager1'] = 'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$agentList'
        self.data['__EVENTTARGET'] = 'ctl00$MainContent$List'
        self.data['__EVENTARGUMENT'] = 'Page$1'

        return FormRequest(url=URL,
                           method='POST',
                           callback=self.parse_page,
                           formdata=self.data,
                           meta={'page': 1},
                           dont_filter=True,
                           headers=HEADERS)

    def parse_page(self, response):
        current_page = response.meta['page'] + 1

        # parse agents (TODO: yield items instead of printing)
        for agent in response.xpath('//a[@class="regtext"]/text()'):
            print agent.extract()
        print "------"

        # request the next page
        data = {
            '__EVENTARGUMENT': 'Page$%d' % current_page,
            '__EVENTVALIDATION': re.search(r"__EVENTVALIDATION\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__VIEWSTATE': re.search(r"__VIEWSTATE\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__ASYNCPOST': 'true',
            '__EVENTTARGET': 'ctl00$MainContent$agentList',
            'ctl00$MainContent$ScriptManager1': 'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$agentList',
            '': ''
        }

        return FormRequest(url=URL,
                           method='POST',
                           formdata=data,
                           callback=self.parse_page,
                           meta={'page': current_page},
                           dont_filter=True,
                           headers=HEADERS)
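On the TODO in parse_page: to get the agents out of the spider (for example into a CSV via Scrapy's feed exports, e.g. scrapy runspider spider.py -o agents.csv), the print statements can be replaced with item yields. Note that once parse_page yields, it becomes a generator and Scrapy ignores its return value, so the next-page FormRequest must then be yielded as well. A sketch of the replacement method, assuming the rest of the spider (re, URL, HEADERS, FormRequest) stays exactly as in the answer above:

    def parse_page(self, response):
        current_page = response.meta['page'] + 1

        # yield one item per agent so feed exports pick them up
        for agent in response.xpath('//a[@class="regtext"]/text()'):
            yield {'agent': agent.extract()}

        # same next-page form data as in the answer above
        data = {
            '__EVENTARGUMENT': 'Page$%d' % current_page,
            '__EVENTVALIDATION': re.search(r"__EVENTVALIDATION\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__VIEWSTATE': re.search(r"__VIEWSTATE\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__ASYNCPOST': 'true',
            '__EVENTTARGET': 'ctl00$MainContent$agentList',
            'ctl00$MainContent$ScriptManager1': 'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$agentList',
            '': ''
        }

        # yield (not return) the request so pagination keeps working
        yield FormRequest(url=URL, method='POST', formdata=data,
                          callback=self.parse_page, meta={'page': current_page},
                          dont_filter=True, headers=HEADERS)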