Scrapy + Selenium 302 redirection handling - javascript

So I'm building a web crawler that logs into my bank account and gathers data about my spending. I originally planned to use only Scrapy, but that didn't work because the First Merit page uses JavaScript to log in, so I layered Selenium on top.
My code logs in (first you have to enter the username, and only then the password, rather than both together as on most pages) by yielding a series of Requests with specific callback functions that handle the next step.
import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import selenium
import time

class LoginSpider(scrapy.Spider):
    name = 'www.firstmerit.com'
    # allowed_domains = ['https://www.firstmeritib.com']
    start_urls = ['https://www.firstmeritib.com/AccountHistory.aspx?a=1']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        # Obtaining the elements I need to fill in
        username = WebDriverWait(self.driver, 10).until(
            lambda driver: self.driver.find_element_by_xpath('//*[@id="txtUsername"]'))
        login_button = WebDriverWait(self.driver, 10).until(
            lambda driver: self.driver.find_element_by_xpath('//*[@id="btnLogin"]'))
        # The actual interaction
        username.send_keys("username")
        login_button.click()
        # Logging in is split across two functions since the website requires me
        # to enter my username first, which redirects me to a password page where
        # I can finally enter my account (after inputting the password).
        yield Request(url=self.driver.current_url,
                      callback=self.password_handling,
                      meta={'dont_redirect': True,
                            'handle_httpstatus_list': [302],
                            'cookiejar': response}
                      )

    def password_handling(self, response):
        print("^^^^^^")
        print(response.url)
        password = WebDriverWait(self.driver, 10).until(
            lambda driver: self.driver.find_element_by_xpath('//*[@id="MainContent_txtPassword"]'))
        login_button2 = WebDriverWait(self.driver, 10).until(
            lambda driver: self.driver.find_element_by_xpath('//*[@id="MainContent_btnLogin"]'))
        password.send_keys("password")
        login_button2.click()
        print("*****")
        print(self.driver.current_url)
        print("*****")
        yield Request(url=self.driver.current_url,
                      callback=self.after_login,  # dont_filter=True,
                      meta={'dont_redirect': True,
                            'handle_httpstatus_list': [302],
                            'cookiejar': response.meta['cookiejar']}
                      )

    def after_login(self, response):
        print("***")
        print(response.url)
        print("***")
        print(response.body)
        if "Account Activity" in response.body:
            self.logger.error("Login failed")
            return
        else:
            print("you got through!")
            print()
The issue is that once I finally get to my account page, where all my spending is displayed, I can't actually access the HTML data. I've handled the 302 redirections, but the "meta=" options seem to take me to the page through Selenium without letting me scrape it.
Instead of getting all the data from response.body in the after_login function, I get the following:
<html><head><title>Object moved</title></head><body>
<h2>Object moved to here.</h2>
</body></html>
How do I actually get at that information so I can scrape it?
Is this redirection put in place by the bank to protect the account from being crawled?
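For reference, one way to sidestep the redirect entirely is to skip the extra Scrapy Request and parse the page that Selenium already has open, since the login cookies live in the browser session rather than in Scrapy's cookiejar. Below is a minimal sketch, not the actual solution for the bank's page: it assumes the account page is fully rendered in self.driver after the second click, and the table selector is a placeholder. HtmlResponse is just Scrapy's standard response class wrapped around the browser's HTML.

from scrapy.http import HtmlResponse

# Instead of yielding another Request at the end of password_handling,
# wrap the browser's current HTML in a Scrapy response and select from it.
def parse_account_page(self):
    page = HtmlResponse(url=self.driver.current_url,
                        body=self.driver.page_source,
                        encoding='utf-8')
    # Placeholder selector -- the real markup of the account page isn't shown.
    for row in page.xpath('//table//tr'):
        yield {'row_text': row.xpath('string(.)').get()}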
Thank you!


scrapy-splash python, just extracts first page

I was wondering if anyone could help me with this.
My code:
import re
import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy_splash import SplashRequest

class ProvinciaSpider(Spider):
    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        data = {'accion': 'consultaManifExpProvincia',
                'salidaPro': 'YES',
                'strMenu': '-',
                'strEmpTransTerrestre': '-',
                'CMc1_Anno': '2022',
                'CMc1_Numero': '354',
                'CG_cadu': '046',
                'viat': '1'}
        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                          formdata=data, callback=self.parse_form_page)

    def parse_form_page(self, response):
        # I extract and yield the items here
Up to here I haven't used Splash, and I'm already inside the page where I can extract without problems. The problem comes when I want to go to the next page and do the same thing.
XPath of the "next page" link: /html/body/form[1]/table[4]/tbody/tr/td/table[2]/tbody/tr/td[3]/a
which I reduced to this: /html/body/form[1]//td[3][@class="lnk7"]
Now, inside parse_form_page and after the extract-and-yield part, I put this script and yield a SplashRequest.
script = """function main(splash)
    assert(splash:go(splash.args.url))
    splash:wait(0.3)
    button = splash:select("/html/body/form[1]//td[3][class=lnk7] a")
    splash:set_viewport_full()
    splash:wait(0.1)
    button:mouse_click()
    splash:wait(1)
    return {url = splash:url(),
            html = splash:html()}
end"""

yield SplashRequest(url='http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                    formdata=data,
                    callback=self.parse_form_page,
                    endpoint='execute',
                    args={'lua_source': script})
I know this is wrong because it is not working; it still just extracts the first page. I think the script part is OK, but I'm not sure how to write the yield part. I've seen code where they just put url=url, but those spiders enter the page with Splash directly and don't have anything like a login/form step, so I don't really know how to write this part.
I'm using Splash for this part because the "next page" links look like this:
Siguiente
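For what it's worth, here is a minimal sketch of how the execute endpoint is usually wired up when the target page sits behind a form POST (it would go inside parse_form_page, after the items are yielded): the form body is passed into the Lua script through args, the script replays the POST with splash:go, clicks the link, and returns the resulting HTML. The CSS selector for the "Siguiente" link and the urlencode step are assumptions, not taken from the real page.

from urllib.parse import urlencode

script = """
function main(splash, args)
    -- replay the search POST inside Splash, then click "Siguiente"
    assert(splash:go{args.url, http_method="POST", body=args.body,
                     headers={["Content-Type"]="application/x-www-form-urlencoded"}})
    splash:wait(1)
    local link = splash:select("td.lnk7 a")   -- assumed CSS selector
    if link then
        link:mouse_click()
        splash:wait(1)
    end
    return {html = splash:html(), url = splash:url()}
end
"""

yield SplashRequest(
    url='http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
    callback=self.parse_form_page,
    endpoint='execute',
    args={'lua_source': script, 'body': urlencode(data)},
)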

Django why getting this error No user profile found matching the query

I am trying to implement a "load more" button, so I am writing this view to load JSON data, but I don't understand why I am getting this error: No user profile found matching the query (Raised by: members.views.UserProfileUpdate).
Here is my view:
class PostJsonListView(View):
    def get(self, *args, **kwargs):
        print(kwargs)
        upper = kwargs.get('num_posts')
        lower = upper - 3
        posts = list(Blog.objects.values()[lower:upper])
        posts_size = len(Blog.objects.all())
        max_size = True if upper >= posts_size else False
        return JsonResponse({'data': posts, 'max': max_size}, safe=False)
This is my blog app's urls.py:
path('posts-json/<int:num_posts>',PostJsonListView.as_view(),name='json-view')
This is my members app's urls.py:
path('<slug:slug>/', UserProfileUpdate.as_view(), name='update-profile'),
If I enter any wrong URL, like http://127.0.0.1:8000/sassassdsadasdd/, it gives me the same error: No user profile found matching the query (Raised by: members.views.UserProfileUpdate).
In the UserProfileUpdate view you are using the slug to get the profile object like this:
Profile.objects.get(username=slug)
But you should use the get_object_or_404 shortcut function instead:
from django.shortcuts import get_object_or_404
get_object_or_404(Profile, username=slug)
reference
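A minimal sketch of how that might look inside the view, assuming UserProfileUpdate is a class-based UpdateView over Profile; the original view isn't shown, so the model and field names here are placeholders:

from django.shortcuts import get_object_or_404
from django.views.generic import UpdateView

class UserProfileUpdate(UpdateView):
    model = Profile                     # assumed model
    fields = ['bio', 'avatar']          # placeholder fields

    def get_object(self, queryset=None):
        # Return a clean 404 instead of raising Profile.DoesNotExist
        return get_object_or_404(Profile, username=self.kwargs['slug'])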

How to use Selenium to get real-time stock price on website?

I am working on a school project to get the real-time stock price from the JPMorgan website. I would like to get all the data shown in <div class="price_area">. I have tried BeautifulSoup and the Yahoo API but still cannot get what I want. So it is my first time trying Selenium, but I have no idea how to run the JavaScript through it. Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

def getStockPrice():
    driver = webdriver.Chrome()
    driver.get("http://www.jpmhkwarrants.com/en_hk/market-statistics/underlying/underlying-terms/code/1")
    try:
        stock_index = WebDriverWait(driver, 10).until(
            driver.find_element_by_class_name('price_area').find_element_by_class_name('price')
        )
    finally:
        driver.quit()
But it shows the error 'WebElement' is not callable. How can I get the real-time price, % change, and the open price? Thanks.
To use .find_element_by_* inside WebDriverWait you have to wrap it in a lambda function, like:
stock_index = WebDriverWait(driver, 10).until(
    lambda d: d.find_element_by_class_name('price_area').find_element_by_class_name('price')
)
and don't forget to call .text to get the content
def getStockPrice():
    driver = webdriver.Chrome()
    driver.get("http://www.jpmhkwarrants.com/en_hk/market-statistics/underlying/underlying-terms/code/0700")
    try:
        stock_index = WebDriverWait(driver, 10).until(
            lambda x: x.find_element_by_class_name('price_area')
        )
        price = stock_index.find_element_by_class_name('price')
        percentage = stock_index.find_element_by_css_selector('.percentage.rise')
        open_price = stock_index.find_element_by_css_selector('ul li span')
        print('Current price: ' + price.text)
        print('Percentage: ' + percentage.text)
        print('Open price: ' + open_price.text)
    except:
        print('wait timeout, element not found')
    finally:
        driver.quit()
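As a side note, the same wait can also be written with Selenium's built-in expected_conditions helpers instead of a lambda; a small sketch, assuming the same class names as above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get("http://www.jpmhkwarrants.com/en_hk/market-statistics/underlying/underlying-terms/code/0700")
# Wait until the price area is present in the DOM, then read the price inside it.
stock_index = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'price_area'))
)
print(stock_index.find_element_by_class_name('price').text)
driver.quit()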
You can use requests and BeautifulSoup to get the three items you mention by using the Ajax query-string call:
import requests
from bs4 import BeautifulSoup
url= 'http://www.jpmhkwarrants.com/en_hk/ajax/terms_quick_search?_=1543832902362'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
items = [item.text for item in soup.select('.price,.percentage.rise,li:nth-of-type(3) span')]
print(items)
Result:
The real time box has its own Ajax call of:
http://www.jpmhkwarrants.com/en_hk/ajax/market-terms-real-time-box/code/0700?_=1543832902364
You can use that to retrieve all items in that box.
import requests
from bs4 import BeautifulSoup
url= 'http://www.jpmhkwarrants.com/en_hk/ajax/market-terms-real-time-box/code/0700?_=1543832902364'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
items = [item.text.strip().split('\n') for item in soup.select('.price_area div')]
tidied = [item for sublist in items for item in sublist if item and item !='Change (%)']
print(tidied)
Result:
That data isn't real-time.
You usually have to pay for real-time data.
If your project involves any type of paper trading or analysis, know that everything you pull from a scrape will probably be delayed by 5 to 15 minutes.
I've heard Bloomberg has a free API, but I don't know whether its real-time data is free.
Check out the Interactive Brokers API. I'm pretty sure access to the data is free, and it lets you connect to a paper trading account to test strategies and algorithms.

Scrapy - making selections from dropdown(e.g.date) on webpage

I'm new to Scrapy and Python, and I am trying to scrape data off the following start URL.
After login, this is my start URL:
start_urls = ["http://www.flightstats.com/go/HistoricalFlightStatus/flightStatusByFlight.do?"]
(a) From there I need to interact with the webpage to select ---by-airport--- and then make the ---airport, date, time period--- selection. How can I do that? I would like to loop over all time periods and past dates (a rough sketch of one possible approach appears after my code below).
I have used Firebug to see the source; I cannot show it here as I do not have enough points to post images.
I read a post mentioning the use of Splinter.
(b) After the selections it will lead me to a page with links to the eventual pages holding the information I want. How do I collect those links and make Scrapy look into every one to extract the information? Using rules? Where should I insert the rules / LinkExtractor function?
I am willing to try things myself; I just hope I can be pointed to posts that can guide me. I am a student and I have spent more than a week on this. I have done the Scrapy tutorial and the Python tutorial, read the Scrapy documentation and searched previous posts on Stack Overflow, but I did not manage to find posts that cover this.
A million thanks.
My code so far to log in, plus the items to scrape via XPath from the eventual target site:
import scrapy
from tutorial.items import FlightItem
from scrapy.http import Request, FormRequest

class flightSpider(scrapy.Spider):
    name = "flight"
    allowed_domains = ["flightstats.com"]
    login_page = 'https://www.flightstats.com/go/Login/login_input.do;jsessionid=0DD6083A334AADE3FD6923ACB8DDCAA2.web1:8009?'
    start_urls = [
        "http://www.flightstats.com/go/HistoricalFlightStatus/flightStatusByFlight.do?"]

    def init_request(self):
        # """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        # """Generate a login request."""
        return FormRequest.from_response(response,
                                         formdata={'loginForm_email': 'marvxxxxxx@hotmail.com',
                                                   'password': 'xxxxxxxx'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        # """Check the response returned by a login request to see if we are successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()  # ****THIS LINE FIXED THE LAST PROBLEM*****
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        for sel in response.xpath('/html/body/div[2]/div[2]/div'):
            item = FlightItem()
            item['flight_number'] = sel.xpath('/div[1]/div[1]/h2').extract()
            item['aircraft_make'] = sel.xpath('/div[4]/div[2]/div[2]/div[2]').extract()
            item['dep_date'] = sel.xpath('/div[2]/div[1]/div').extract()
            item['dep_airport'] = sel.xpath('/div[1]/div[2]/div[2]/div[1]').extract()
            item['arr_airport'] = sel.xpath('/div[1]/div[2]/div[2]/div[2]').extract()
            item['dep_gate_scheduled'] = sel.xpath('/div[2]/div[2]/div[1]/div[2]/div[2]').extract()
            item['dep_gate_actual'] = sel.xpath('/div[2]/div[2]/div[1]/div[3]/div[2]').extract()
            item['dep_runway_actual'] = sel.xpath('/div[2]/div[2]/div[2]/div[3]/div[2]').extract()
            item['dep_terminal'] = sel.xpath('/div[2]/div[2]/div[3]/div[2]/div[1]').extract()
            item['dep_gate'] = sel.xpath('/div[2]/div[2]/div[3]/div[2]/div[2]').extract()
            item['arr_gate_scheduled'] = sel.xpath('/div[3]/div[2]/div[1]/div[2]/div[2]').extract()
            item['arr_gate_actual'] = sel.xpath('/div[3]/div[2]/div[1]/div[3]/div[2]').extract()
            item['arr_terminal'] = sel.xpath('/div[3]/div[2]/div[3]/div[2]/div[1]').extract()
            item['arr_gate'] = sel.xpath('/div[3]/div[2]/div[3]/div[2]/div[2]').extract()
            yield item
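Regarding (a), one common pattern is to skip the dropdown UI entirely and submit the search form yourself once per airport/date/time-period combination, since a dropdown selection ultimately just fills in a form field. A rough sketch under heavy assumptions: the field names, value lists, and link XPath below are placeholders you would read out of Firebug, not the real ones from flightstats.com.

from datetime import date, timedelta
from scrapy.http import Request, FormRequest

def parse_search_page(self, response):
    airports = ['JFK', 'LAX']                       # placeholder airport codes
    days = [date.today() - timedelta(days=d) for d in range(1, 8)]
    for airport in airports:
        for day in days:
            for period in ['0', '6', '12', '18']:   # placeholder time-period values
                yield FormRequest.from_response(
                    response,
                    formdata={'airport': airport,                # assumed field name
                              'date': day.strftime('%Y-%m-%d'),  # assumed field name/format
                              'timePeriod': period},             # assumed field name
                    callback=self.parse_results)

def parse_results(self, response):
    # (b): collect the links to the detail pages and follow each one
    for href in response.xpath('//a[contains(@href, "flightStatusByFlight")]/@href').extract():
        yield Request(response.urljoin(href), callback=self.parse)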

Django pagination get request going to dev server

I am implementing pagination via an Ajax call in Django. I have everything working on my dev server, but when I move to my outward-facing server to show the client, the GET requests for the next page still route back to my localhost.
This is what I get in the javascript console:
"GET http://127.0.0.1:8001/uploadedwords/?page=2&title=indextest"
This is what it should be:
"GET http://nameofmywebsite.com/uploadedwords/?page=2&title=indextest"
Is this a caching issue? Is this because I still have debug mode on? How does Django pagination decide what URL to send requests to?
Here is my views.py code. I am using the Django Rest Framework:
def uploadedword_list(request):
    from mainsite.serializers import PaginatedUploadedWordSerializer
    """
    List all code snippets, or create a new snippet.
    """
    if request.method == 'GET':
        submitted = SubmittedTextFile.objects.get(title=request.GET['title'])
        text = NlpParseText.objects.get(text=submitted)
        # words = UploadedWord.objects.filter(user=request.user, text=text)
        queryset = UploadedWord.objects.filter(user=request.user, text=text)
        paginator = Paginator(queryset, 500)
        page = request.GET['page']
        try:
            words = paginator.page(page)
        except PageNotAnInteger:
            # If page is not an integer, deliver first page.
            words = paginator.page(1)
        except EmptyPage:
            # If page is out of range (e.g. 9999),
            # deliver last page of results.
            words = paginator.page(paginator.num_pages)
        serializer_context = {'request': request}
        serializer = PaginatedUploadedWordSerializer(words,
                                                     context=serializer_context)
        # return Response(serializer.data)
        # serializer = UploadedWordSerializer(words, many=True)
        return JSONResponse(serializer.data)

    elif request.method == 'POST':
        data = JSONParser().parse(request)
        serializer = UploadedWordSerializer(data=data)
        if serializer.is_valid():
            serializer.save()
            return JSONResponse(serializer.data, status=201)
        return JSONResponse(serializer.errors, status=400)
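On the last question ("how does Django pagination decide what URL to send requests to"): serializers that emit absolute next/previous links generally build them from the incoming request rather than from settings, so the host comes from whatever Host header the request carried (or X-Forwarded-Host when USE_X_FORWARDED_HOST is enabled). A tiny illustration of that mechanism, not your serializer's actual code, which isn't shown:

# Django derives the absolute URL for the current request from the Host header,
# not from anything in settings.py, so a proxy or a hard-coded URL in the
# client-side JavaScript can still point at the dev host.
def next_page_url(request, page_number):
    base = request.build_absolute_uri(request.path)   # e.g. http://<Host header>/uploadedwords/
    return '%s?page=%d&title=%s' % (base, page_number, request.GET['title'])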
