scrapy-splash (Python) only extracts the first page - javascript

I was wondering if anyone could help me with this.
My code:
from gc import callbacks
import re
import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy_splash import SplashRequest

class ProvinciaSpider(Spider):
    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        data = {'accion': 'consultaManifExpProvincia',
                'salidaPro': 'YES',
                'strMenu': '-',
                'strEmpTransTerrestre': '-',
                'CMc1_Anno': '2022',
                'CMc1_Numero': '354',
                'CG_cadu': '046',
                'viat': '1'}
        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                          formdata=data,
                          callback=self.parse_form_page)

    def parse_form_page(self, response):
        # I extract and yield
Up to this point I haven't used Splash and I'm already inside the page where I can extract without problems. The problem comes when I want to go to the next page and do the same thing.
XPath of the "next page" link: /html/body/form[1]/table[4]/tbody/tr/td/table[2]/tbody/tr/td[3]/a
which I reduced to this: /html/body/form[1]//td[3][@class="lnk7"]
Now, inside parse_form_page and after the extract-and-yield part, I put this script and yield a SplashRequest.
script = """function main(splash)
    assert(splash:go(splash.args.url))
    splash:wait(0.3)
    button = splash:select("/html/body/form[1]//td[3][class=lnk7] a")
    splash:set_viewport_full()
    splash:wait(0.1)
    button:mouse_click()
    splash:wait(1)
    return {url = splash:url(),
            html = splash:html()}
end"""

yield SplashRequest(url='http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                    formdata=data,
                    callback=self.parse_form_page,
                    endpoint='execute',
                    args={'lua_source': script})
I know this is wrong because it is not working; it is still just extracting the first page. I think the script part is OK, but I'm not sure how to write the yield part. I've seen code where they just put url=url, but they entered the page with Splash and it didn't have a login-like form page, so that's why I don't really know how to make this part.
I'm using splash for this part because the "next page" links are like this:
Siguiente
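One possible direction (a rough, untested sketch, not taken from the question): drive both the form POST and the "Siguiente" click from inside the Lua script, so the SplashRequest itself does not need formdata at all. The CSS selector, wait times and the parse_next_page callback below are assumptions and would need adjusting; formdata support in splash:go also requires a reasonably recent Splash version:

def parse_form_page(self, response):
    # ... extract and yield the items from the first page here ...

    script = """
    function main(splash, args)
        -- re-submit the search form as a POST from inside Splash
        assert(splash:go{args.url, formdata={
            accion="consultaManifExpProvincia", salidaPro="YES", strMenu="-",
            strEmpTransTerrestre="-", CMc1_Anno="2022", CMc1_Numero="354",
            CG_cadu="046", viat="1"}})
        splash:wait(1)
        -- click the "next page" link; the CSS selector is a guess based on the lnk7 class
        local button = splash:select("td.lnk7 a")
        if button then
            button:mouse_click()
            splash:wait(1)
        end
        return {url = splash:url(), html = splash:html()}
    end
    """

    yield SplashRequest(
        url='http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
        callback=self.parse_next_page,  # hypothetical callback for the rendered second page
        endpoint='execute',
        args={'lua_source': script},
    )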

Related

Python Web Scraping in Pagination in Single Page Application

I am currently researching how to scrape web content using Python when pagination is driven by JavaScript in a single page application (SPA).
For example,
https://angular-8-pagination-example.stackblitz.io/
I googled and found that Scrapy alone cannot scrape JavaScript / SPA driven content.
It needs to use Splash. I am new to both Scrapy and Splash.
Is this correct?
Also, how do I call the JavaScript pagination method? I inspected the element; it's just an anchor without an href, wired up to a JavaScript event.
Please advise.
Thank you,
Hatjhie
You need to use a SplashRequest to render the JS. You then need to get the pagination text. Generally I use re.search with the appropriate regex pattern to extract the relevant numbers. You can then assign them to a current-page variable and a total-pages variable.
Typically a website will move to the next page by incrementing ?page=x or ?p=x at the end of the URL. You can then increment this value to scrape all the relevant pages.
The overall pattern looks like this:
import scrapy
from scrapy_splash import SplashRequest
import re
from ..items import Item

proxy = 'http://your.proxy.com:PORT'
current_page_xpath = '//div[your x path selector]/text()'
last_page_xpath = '//div[your other x path selector]/text()'

class spider(scrapy.Spider):
    name = 'my_spider'
    allowed_domains = ['domain.com']
    start_urls = ['https://www.domaintoscrape.com/page=1']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy})

    def get_page_nbr(self, value):
        # you may need more complex regex to get page numbers.
        # most of the time they are in the form "page X of Y"
        # google is your friend
        match = re.search(r'\d+', value)
        return match[0] if match else None

    def parse(self, response):
        # get last and current page from the response:
        last_page = self.get_page_nbr(response.xpath(last_page_xpath).get())
        current_page = self.get_page_nbr(response.xpath(current_page_xpath).get())

        # do something with your response

        # if the current page is less than the last page, make another request
        # by incrementing the page number in the URL
        if current_page and last_page and int(current_page) < int(last_page):
            ajax_url = response.url.replace(f'page={int(current_page)}', f'page={int(current_page) + 1}')
            yield scrapy.Request(url=ajax_url, callback=self.parse, meta={'proxy': proxy})

        # optional
        if current_page == last_page:
            print(f'processed {last_page} items for {response.url}')
Finally, it's worth having a look on YouTube as there are a number of tutorials on scrapy_splash and pagination.

Python: Javascript rendered webpage not parsing

I want to parse the information at the following URL: the name of the trade, the strategy description, and the transactions in the "Trading History" and "Open Positions" sections. When I parse the page, I do not get this data.
I am new to parsing JavaScript-rendered webpages, so I would appreciate some explanation of why my code below isn't working.
import bs4 as bs
import urllib
import dryscrape
import sys
import time
url = 'https://www.zulutrade.com/trader/314062/trading'
sess = dryscrape.Session()
sess.visit(url)
time.sleep(10)
sauce = sess.body()
soup = bs.BeautifulSoup(sauce, 'lxml')
Thanks!
The link in your code doesn't let you get anything, because the original URL you should be working with is the one I'm pasting below. The one you tried to work with automatically gets redirected to the one mentioned here.
https://www.zulutrade.com/zulutrade-client/traders/api/providers/314062/tradeHistory?
Scraping json data out of the table from that page is as follows:
import requests

r = requests.get('https://www.zulutrade.com/zulutrade-client/traders/api/providers/314062/tradeHistory?')
j = r.json()
items = j['content']
for item in items:
    print(item['currency'], item['pips'], item['tradeType'], item['transactionCurrency'], item['id'])

scrapy-splash usage for rendering javascript

This is a follow-up to my previous question.
I installed Splash and scrapy-splash, and also followed the setup instructions for scrapy-splash.
I edited my code as follows:
import scrapy
from scrapy_splash import SplashRequest

class CityDataSpider(scrapy.Spider):
    name = "citydata"

    def start_requests(self):
        urls = [
            'http://www.city-data.com/advanced/search.php#body?fips=0&csize=a&sc=2&sd=0&states=ALL&near=&nam_crit1=6914&b6914=MIN&e6914=MAX&i6914=1&nam_crit2=6819&b6819=15500&e6819=MAX&i6819=1&ps=20&p=0',
            'http://www.city-data.com/advanced/search.php#body?fips=0&csize=a&sc=2&sd=0&states=ALL&near=&nam_crit1=6914&b6914=MIN&e6914=MAX&i6914=1&nam_crit2=6819&b6819=15500&e6819=MAX&i6819=1&ps=20&p=1',
        ]
        for url in urls:
            yield SplashRequest(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'citydata-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
But I still get the same output: only one HTML file is generated, and the result is only for http://www.city-data.com/advanced/search.php.
Is there anything wrong in the code, or do you have any other suggestions?
First off, I wanted to clear up some possible points of confusion from your last question, to which @paul trmbrth wrote:
The URL fragment (i.e. everything including and after #body) is not sent to the server and only http://www.city-data.com/advanced/search.php is fetched.
So for Scrapy, the requests to [...] and [...] are the same resource, so it's only fetched once. They differ only in their URL fragments.
URI standards dictate that the number sign (#) be used to indicate the start of the fragment, which is the last part of the URL. In most/all browsers, nothing beyond the "#" is transmitted. However, it's fairly common for AJAX sites to use JavaScript's window.location.hash to grab the URL fragment and use it to execute additional AJAX calls. I bring this up because city-data.com does exactly this, which may confuse you, since it does in fact bring back two different pages for each of those URLs in a browser.
Scrapy does by default drop the URL fragment, so it will report both URLs as being just "http://www.city-data.com/advanced/search.php", and filter the second one.
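As a quick illustration of that point (standard-library behaviour on Python 3, not part of the original answer): everything from the "#" onward is treated as the fragment, so the query string that was placed after it never reaches the server at all.

from urllib.parse import urlparse

parts = urlparse('http://www.city-data.com/advanced/search.php#body?fips=0&ps=20&p=1')
print(parts.path)      # /advanced/search.php
print(parts.query)     # ''  (empty - the whole query ended up inside the fragment)
print(parts.fragment)  # body?fips=0&ps=20&p=1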
With all of that out of the way, there will still be a problem after you remove "#body" from the URLs, caused by the combination of page = response.url.split("/")[-2] and filename = 'citydata-%s.html' % page. Neither of your URLs redirects, so the URL provided is what will populate the response.url string.
Isolating that, we get the following:
>>> urls = [
...     'http://www.city-data.com/advanced/search.php?fips=0&csize=a&sc=2&sd=0&states=ALL&near=&nam_crit1=6914&b6914=MIN&e6914=MAX&i6914=1&nam_crit2=6819&b6819=15500&e6819=MAX&i6819=1&ps=20&p=0',
...     'http://www.city-data.com/advanced/search.php?fips=0&csize=a&sc=2&sd=0&states=ALL&near=&nam_crit1=6914&b6914=MIN&e6914=MAX&i6914=1&nam_crit2=6819&b6819=15500&e6819=MAX&i6819=1&ps=20&p=1',
... ]
>>> for url in urls:
...     print(url.split("/")[-2])
...
advanced
advanced
So, for both URL's, you're extracting the same piece of information, which means when you use filename = 'citydata-%s.html' % page you're going to get the same filename, which I assume would be 'citydata-advanced.html'. The second time it's called, you're overwriting the first file.
Depending on what you're doing with the data, you could either change this to append to the file, or modify your filename variable to something unique such as:
from urlparse import urlparse, parse_qs  # Python 3: from urllib.parse import urlparse, parse_qs
import scrapy
from scrapy_splash import SplashRequest

class CityDataSpider(scrapy.Spider):

    [...]

    def parse(self, response):
        page = parse_qs(urlparse(response.url).query).get('p')
        filename = 'citydata-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

Scrapy + Selenium 302 redirection handling

So I'm building a web crawler that logs into my bank account and gathers data about my spending. I originally was going to use only Scrapy, but it didn't work since the First Merit page uses JavaScript to log in, so I piled Selenium on top.
My code logs in (first you need to input the username, and then the password, not both together as on most pages) through a series of yielded Requests with specific callback functions that handle the next step.
import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import selenium
import time

class LoginSpider(scrapy.Spider):
    name = 'www.firstmerit.com'
    # allowed_domains = ['https://www.firstmeritib.com']
    start_urls = ['https://www.firstmeritib.com/AccountHistory.aspx?a=1']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        # Obtaining the necessary components to input my own stuff
        username = WebDriverWait(self.driver, 10).until(lambda driver: self.driver.find_element_by_xpath('//*[@id="txtUsername"]'))
        login_button = WebDriverWait(self.driver, 10).until(lambda driver: self.driver.find_element_by_xpath('//*[@id="btnLogin"]'))
        # The actual interaction
        username.send_keys("username")
        login_button.click()
        # The process of logging in is broken up into two functions since the website requires me
        # to enter my username first, which redirects me to a password page where I can finally
        # enter my account (after inputting the password)
        yield Request(url=self.driver.current_url,
                      callback=self.password_handling,
                      meta={'dont_redirect': True,
                            'handle_httpstatus_list': [302],
                            'cookiejar': response}
                      )

    def password_handling(self, response):
        print("^^^^^^")
        print(response.url)
        password = WebDriverWait(self.driver, 10).until(lambda driver: self.driver.find_element_by_xpath('//*[@id="MainContent_txtPassword"]'))
        login_button2 = WebDriverWait(self.driver, 10).until(lambda driver: self.driver.find_element_by_xpath('//*[@id="MainContent_btnLogin"]'))
        password.send_keys("password")
        login_button2.click()
        print("*****")
        print(self.driver.current_url)
        print("*****")
        yield Request(url=self.driver.current_url,
                      callback=self.after_login,  # , dont_filter = True,
                      meta={'dont_redirect': True,
                            'handle_httpstatus_list': [302],
                            'cookiejar': response.meta['cookiejar']}
                      )

    def after_login(self, response):
        print("***")
        print(response.url)
        print("***")
        print(response.body)
        if "Account Activity" in response.body:
            self.logger.error("Login failed")
            return
        else:
            print("you got through!")
            print()
The issue is that once I finally get to my account page, where all my spending is displayed, I can't actually access the HTML data. I've properly handled the 302 redirections, but the "meta =" options seem to take me to the page through Selenium without letting me scrape it.
Instead of getting all the data from response.body in the after_login function, I get the following:
<html><head><title>Object moved</title></head><body>
<h2>Object moved to here.</h2>
</body></html>
How do I manage to actually get that information so I can scrape it?
Is this redirection put in place by the bank to protect the account from being crawled?
Thank you!
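Not from the original thread, but one workaround often suggested for this Scrapy + Selenium pattern: instead of yielding another Request (which is what comes back as the 302 "Object moved" stub), build a Scrapy response directly from the page Selenium has already rendered and parse that. A minimal sketch; the method name and XPath below are hypothetical:

from scrapy.http import HtmlResponse

def parse_rendered_account_page(self):
    # wrap the DOM that Selenium is currently displaying in a Scrapy response object
    rendered = HtmlResponse(
        url=self.driver.current_url,
        body=self.driver.page_source,
        encoding='utf-8',
    )
    # hypothetical selector - adjust to the real account-activity table
    for row in rendered.xpath('//table[@id="accountActivity"]//tr'):
        yield {'raw_row': row.get()}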

Scrapy - making selections from dropdown(e.g.date) on webpage

I'm new to Scrapy and Python and I am trying to scrape data off the following start URL.
After logging in, this is my start URL:
start_urls = ["http://www.flightstats.com/go/HistoricalFlightStatus/flightStatusByFlight.do?"]
(a) From there I need to interact with the webpage to select ---by-airport---
and then make the ---airport, date, time period--- selections.
How can I do that? I would like to loop over all time periods and past dates.
I have used Firebug to see the source; I cannot show it here as I do not have enough points to post images.
I read a post mentioning the use of Splinter.
(b) After the selections, it will lead me to a page where there are links to the eventual page with the information I want. How do I populate the links and make Scrapy look into every one of them to extract the information?
Using rules? Where should I insert the rules / LinkExtractor function?
I am willing to try it myself; I just hope I can be pointed to posts that can guide me. I am a student and I have spent more than a week on this. I have done the Scrapy tutorial, the Python tutorial, read the Scrapy documentation and searched previous posts on Stack Overflow, but I did not manage to find posts that cover this.
A million thanks.
My code so far to log in, and the items to scrape via XPath from the eventual target site:
import scrapy
from tutorial.items import FlightItem
from scrapy.http import Request, FormRequest

class flightSpider(scrapy.Spider):
    name = "flight"
    allowed_domains = ["flightstats.com"]
    login_page = 'https://www.flightstats.com/go/Login/login_input.do;jsessionid=0DD6083A334AADE3FD6923ACB8DDCAA2.web1:8009?'
    start_urls = [
        "http://www.flightstats.com/go/HistoricalFlightStatus/flightStatusByFlight.do?"]

    def init_request(self):
        # """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        # """Generate a login request."""
        return FormRequest.from_response(response,
                                         formdata={'loginForm_email': 'marvxxxxxx@hotmail.com', 'password': 'xxxxxxxx'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        # """Check the response returned by a login request to see if we are successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()  # ****THIS LINE FIXED THE LAST PROBLEM*****
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        for sel in response.xpath('/html/body/div[2]/div[2]/div'):
            item = FlightItem()
            item['flight_number'] = sel.xpath('/div[1]/div[1]/h2').extract()
            item['aircraft_make'] = sel.xpath('/div[4]/div[2]/div[2]/div[2]').extract()
            item['dep_date'] = sel.xpath('/div[2]/div[1]/div').extract()
            item['dep_airport'] = sel.xpath('/div[1]/div[2]/div[2]/div[1]').extract()
            item['arr_airport'] = sel.xpath('/div[1]/div[2]/div[2]/div[2]').extract()
            item['dep_gate_scheduled'] = sel.xpath('/div[2]/div[2]/div[1]/div[2]/div[2]').extract()
            item['dep_gate_actual'] = sel.xpath('/div[2]/div[2]/div[1]/div[3]/div[2]').extract()
            item['dep_runway_actual'] = sel.xpath('/div[2]/div[2]/div[2]/div[3]/div[2]').extract()
            item['dep_terminal'] = sel.xpath('/div[2]/div[2]/div[3]/div[2]/div[1]').extract()
            item['dep_gate'] = sel.xpath('/div[2]/div[2]/div[3]/div[2]/div[2]').extract()
            item['arr_gate_scheduled'] = sel.xpath('/div[3]/div[2]/div[1]/div[2]/div[2]').extract()
            item['arr_gate_actual'] = sel.xpath('/div[3]/div[2]/div[1]/div[3]/div[2]').extract()
            item['arr_terminal'] = sel.xpath('/div[3]/div[2]/div[3]/div[2]/div[1]').extract()
            item['arr_gate'] = sel.xpath('/div[3]/div[2]/div[3]/div[2]/div[2]').extract()
            yield item
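Not part of the question, but since it asks how the dropdown selections might be made: one common approach is to submit the search form with FormRequest.from_response and loop over dates in code, then follow the result links. Everything below is a hedged sketch to adapt into flightSpider; the form field names, CSS selector and parse_flight callback are hypothetical and must be taken from the real page with the browser's developer tools:

# imports (at the top of the spider module)
from datetime import date, timedelta
from scrapy.http import FormRequest

# methods to add inside flightSpider
def parse_search_form(self, response):
    # loop over, e.g., the last 7 days (the site's real date limits may differ)
    for delta in range(7):
        day = date.today() - timedelta(days=delta)
        yield FormRequest.from_response(
            response,
            formdata={
                'departureAirport': 'SIN',         # hypothetical field name
                'departureDate': day.isoformat(),  # hypothetical field name / date format
            },
            callback=self.parse_results,
        )

def parse_results(self, response):
    # follow every link on the results page that leads to a flight-detail page
    for href in response.css('a.flightDetails::attr(href)').getall():  # hypothetical selector
        yield response.follow(href, callback=self.parse_flight)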
