Scrape a JavaScript-generated page using Scrapy

The following page gives access to product details by executing a JavaScript request:
http://www.ooshop.com/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3
Each product has the following element:
<a id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_lbVisu" class="prodimg" href="javascript:__doPostBack('ctl00$cphC$pn3T1$ctl01$rp$ctl00$ctl00$lbVisu','')"><img id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_iVisu" title="Visualiser la fiche détail" class="image" onerror="this.src='/Media/images/null.gif';" src="Media/ProdImages/Produit/Vignettes/3270190199359.gif" alt="Dés de jambon" style="height:70px;width:70px;border-width:0px;margin-top:15px"></a>
I tried to use FormRequest from the Scrapy library to crawl these pages, but it does not seem to work:
import scrapy
from scrapy.http import FormRequest
from JStest.items import JstestItem

class ooshoptest2(scrapy.Spider):
    name = "ooshoptest2"
    allowed_domains = ["ooshop.com"]
    start_urls = ["http://www.ooshop.com/courses-en-ligne/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3"]

    def parse(self, response):
        URL = response.url
        path = '//div[@class="blockInside"]//ul/li/a'
        for balise in response.xpath(path):
            jsrequest = response.urljoin(balise.xpath('@href').extract()[0])
            js = "'" + jsrequest[25:-5] + "'"
            data = {'__EVENTTARGET': js, '__EVENTARGUMENT': ''}
            yield FormRequest(url=URL,
                              method='POST',
                              callback=self.parse_level1,
                              formdata=data,
                              dont_filter=True)

    def parse_level1(self, response):
        path = '//div[@class="popContent"]'
        test = response.xpath(path)[0].extract()
        print test
        item = JstestItem()
        yield item
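For reference, the value my code tries to build for __EVENTTARGET is the first argument of __doPostBack in the href above; a minimal regex sketch of the extraction I am attempting (instead of the fixed-offset slicing):

import re

# sketch: isolate the __doPostBack target from the javascript: href
href = "javascript:__doPostBack('ctl00$cphC$pn3T1$ctl01$rp$ctl00$ctl00$lbVisu','')"
target = re.search(r"__doPostBack\('([^']+)'", href).group(1)
# target -> ctl00$cphC$pn3T1$ctl01$rp$ctl00$ctl00$lbVisu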
Does anyone know how to make this work?
Many thanks!

Related

How to access 'parent' from BeautifulSoup using select or find?

I want to get data from this page using BeautifulSoup. The values I need are in the cells next to the td with class 'st2' that my selector matches, so I have to go up to the parent node first, and that is proving really hard to do. This is the code:

import requests
from bs4 import BeautifulSoup

# raw = requests.get("https://www.daum.net")
# raw = requests.get("http://127.0.0.1:5000/emp")
response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
result = bs.select("tr .st2")

Here is an example of the rows:

<tr>
<td width='92' class='st2'><a href="javascript:goPrice('000020&MSid=&msPortfolioID=')" title='000020'>somethinbg</a></td>
<td width='60' align='right'>15,100</td>
<td width='40' align='right'><span class='t_12_blue'>▼300</span></td>
</tr>

Then, how can I get the data from the parent <tr> that contains the td with class 'st2'?
You can access the parent element in BeautifulSoup via the element's parent attribute. Since you asked for a list of elements, this has to be done in an iteration.
I'm assuming here that you want to extract each row in the table.
import requests
from bs4 import BeautifulSoup

response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
result = [[child.text for child in elem.parent.findChildren('td', recursive=False)]
          for elem in bs.select('tr .st2')]
The result is:
[
['동화약품', '15,100', '▼300'],
['유한양행', '61,100', '▼400'],
['유한양행우', '58,900', '▲300'],
...
]
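Equivalently, a small sketch under the same assumptions using bs4's find_parent to climb to the enclosing row explicitly:

# sketch: find_parent('tr') climbs from the matched td to its enclosing row
result = [[td.text for td in elem.find_parent('tr').find_all('td', recursive=False)]
          for elem in bs.select('tr .st2')]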

Not getting redirected back to the updated blog post in django

I have a single Blog model and I am trying to have only a DetailView, but when I add a new post the page only shows the pagination. The post is already saved, but it does not appear until I go to the blog home page; even when I browse directly to a blog post, e.g. http://127.0.0.1:8000/pages/blog/22/?=, I don't get what I want.
# views.py
class BlogDetailView(DetailView):
    model = Blog
    template_name = 'pages/blog.html'
    context_object_name = 'blog'
    ordering = ['-timestamp']

# urls.py
path('blog/', blog, name='blog'),
path('blog/<int:pk>/', BlogDetailView.as_view(), name='blog'),
I created a ListView and a DetailView separately, each with a different URL:
class BlogListView(ListView):
    model = Blog
    template_name = 'pages/blog.html'  # <app>/<model>_<viewtype>.html
    context_object_name = 'blog_post'
    ordering = ['-timestamp']
    paginate_by = 5

class BlogDetailView(DetailView):
    model = Blog
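The matching URL patterns then give each view its own route; a minimal sketch (the 'blog-list'/'blog-detail' route names are assumptions, not from the original code):

# pages/urls.py -- sketch; the route names here are assumed
from django.urls import path
from .views import BlogListView, BlogDetailView

urlpatterns = [
    path('blog/', BlogListView.as_view(), name='blog-list'),
    path('blog/<int:pk>/', BlogDetailView.as_view(), name='blog-detail'),
]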

How to get the webpage using pyqt5?

I want to get the webpage using pyqt5.
The url is https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html.
The webpage will generate two values with javascript.
Just input 5 in the text box and press the red button.
Two values in red will be returned.
Please refer to the image.
The code below is used to get the webpage.
However, I wait for a long time and there is no response.
What should I change in my code?
Thank you very much.
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
import pandas as pd

class Render(QWebEngineView):
    def __init__(self, url):
        self.html = None
        self.first_pass = True
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            self._second_finished()

    def _first_finished(self):
        self.page().runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.page().runJavaScript("void(0)")
        self.page().runJavaScript("CheckUserWhere();")

    def _second_finished(self):
        self.page().toHtml(self.callable)

    def callable(self, data):
        self.html = data
        self.app.quit()

url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
element = soup.find('div', {'id':"divResult"})
df = pd.read_html(str(element))
It seems that you have several misconceptions:

- When the JS is executed, the page is not reloaded, so the _second_finished function will never be called.
- If you do not want to show the window, then it is better to use QWebEnginePage.

Considering the above, the html that is obtained is:
<div class="p8-5" id="divResult" style="display:block;">
<div align="center" display="block" id="rsloading" style="display: block;">
<img src="//img2.soufunimg.com/qyb/loading.gif"/>
正在为您加载数据...
</div>
<table border="0" cellpadding="0" cellspacing="0" class="tablebox01" display="none" id="tbResult" style="display: none;" width="600">
<tbody><tr>
<td style="width:260px;"><span class="gray8">建设用地面积:</span>14748平方米</td>
<td style="width:340px;"><span class="gray8">所在城市:</span>山西省 长治市 </td>
</tr>
<tr>
<td><span class="gray8">规划建筑面积:</span>51617平方米</td>
<td><span class="gray8">土地评估楼面价:</span><b class="redc00 font14" id="_bpgj">867.61</b> 元/平方米</td>
</tr>
<tr>
<td><span class="gray8">容积率:</span>大于1并且小于或等于3.5</td>
<td><span class="gray8">土地评估总价:</span><b class="redc00 font14" id="_bSumPrice">4478.34</b> 万元</td>
</tr>
<tr>
<td><span class="gray8">规划用途:</span>住宅用地</td>
<td><span class="gray8">推出楼面价:</span>27.51元/平方米</td>
</tr>
</tbody></table>
</div>
So the simplest thing to do is to filter by the ids "_bpgj" and "_bSumPrice":
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from bs4 import BeautifulSoup

class Render(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self, url):
        self.html = ""
        self.first_pass = True
        self.app = QtWidgets.QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._load_finished)
        self.loadProgress.connect(print)
        self.load(QtCore.QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        if result:
            self.call_js()

    def call_js(self):
        self.runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.runJavaScript("void(0)")
        self.runJavaScript("CheckUserWhere();")
        self.toHtml(self.callable)

    def callable(self, data):
        self.html = data
        self.app.quit()

url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
_bpgj = soup.find('b', {'id':"_bpgj"}).string
_bSumPrice = soup.find('b', {'id':"_bSumPrice"}).string
print(_bpgj, _bSumPrice)
Output:
867.61 4478.34
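If you still want the whole result table as a DataFrame, as in the original code, a small sketch reusing the soup already built (assuming the tbResult table has been filled in by the JS at this point):

import pandas as pd

# sketch: parse the result table into a DataFrame
table = soup.find('table', {'id': 'tbResult'})
df = pd.read_html(str(table))[0]  # read_html returns a list of DataFrames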

Scrapy crawling not working on ASPX website

I'm scraping the Madrid Assembly's website, built in ASP.NET, and I have no idea how to simulate clicks on the links from which I need to get the corresponding politicians. I tried this:
import scrapy

class AsambleaMadrid(scrapy.Spider):
    name = "Asamblea_Madrid"
    start_urls = ['http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx']

    def parse(self, response):
        for id in response.css('div#moduloBusqueda div.sangria div.sangria ul li a::attr(id)'):
            target = id.extract()
            url = "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx"
            formdata = {'__EVENTTARGET': target,
'__VIEWSTATE': '/wEPDwUBMA9kFgJmD2QWAgIBD2QWBAIBD2QWAgIGD2QWAmYPZBYCAgMPZBYCAgMPFgIeE1ByZXZpb3VzQ29udHJvbE1vZGULKYgBTWljcm9zb2Z0LlNoYXJlUG9pbnQuV2ViQ29udHJvbHMuU1BDb250cm9sTW9kZSwgTWljcm9zb2Z0LlNoYXJlUG9pbnQsIFZlcnNpb249MTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49NzFlOWJjZTExMWU5NDI5YwFkAgMPZBYMAgMPZBYGBSZnXzM2ZWEwMzEwXzg5M2RfNGExOV85ZWQxXzg4YTEzM2QwNjQyMw9kFgJmD2QWAgIBDxYCHgtfIUl0ZW1Db3VudAIEFghmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BTRHcnVwbyBQYXJsYW1lbnRhcmlvIFBvcHVsYXIgZGUgbGEgQXNhbWJsZWEgZGUgTWFkcmlkHgRUZXh0BTRHcnVwbyBQYXJsYW1lbnRhcmlvIFBvcHVsYXIgZGUgbGEgQXNhbWJsZWEgZGUgTWFkcmlkZGQCAQ9kFgICAQ8PFgQfAgUeR3J1cG8gUGFybGFtZW50YXJpbyBTb2NpYWxpc3RhHwMFHkdydXBvIFBhcmxhbWVudGFyaW8gU29jaWFsaXN0YWRkAgIPZBYCAgEPDxYEHwIFL0dydXBvIFBhcmxhbWVudGFyaW8gUG9kZW1vcyBDb211bmlkYWQgZGUgTWFkcmlkHwMFL0dydXBvIFBhcmxhbWVudGFyaW8gUG9kZW1vcyBDb211bmlkYWQgZGUgTWFkcmlkZGQCAw9kFgICAQ8PFgQfAgUhR3J1cG8gUGFybGFtZW50YXJpbyBkZSBDaXVkYWRhbm9zHwMFIUdydXBvIFBhcmxhbWVudGFyaW8gZGUgQ2l1ZGFkYW5vc2RkBSZnX2MxNTFkMGIxXzY2YWZfNDhjY185MWM3X2JlOGUxMTZkN2Q1Mg9kFgRmDxYCHgdWaXNpYmxlaGQCAQ8WAh8EaGQFJmdfZTBmYWViMTVfOGI3Nl80MjgyX2ExYjFfNTI3ZDIwNjk1ODY2D2QWBGYPFgIfBGhkAgEPFgIfBGhkAhEPZBYCAgEPZBYEZg9kFgICAQ8WAh8EaBYCZg9kFgQCAg9kFgQCAQ8WAh8EaGQCAw8WCB4TQ2xpZW50T25DbGlja1NjcmlwdAW7AWphdmFTY3JpcHQ6Q29yZUludm9rZSgnVGFrZU9mZmxpbmVUb0NsaWVudFJlYWwnLDEsIDEsICdodHRwOlx1MDAyZlx1MDAyZnd3dy5hc2FtYmxlYW1hZHJpZC5lc1x1MDAyZkVTXHUwMDJmUXVlRXNMYUFzYW1ibGVhXHUwMDJmQ29tcG9zaWNpb25kZWxhQXNhbWJsZWFcdTAwMmZMb3NEaXB1dGFkb3MnLCAtMSwgLTEsICcnLCAnJykeGENsaWVudE9uQ2xpY2tOYXZpZ2F0ZVVybGQeKENsaWVudE9uQ2xpY2tTY3JpcHRDb250YWluaW5nUHJlZml4ZWRVcmxkHgxIaWRkZW5TY3JpcHQFIVRha2VPZmZsaW5lRGlzYWJsZWQoMSwgMSwgLTEsIC0xKWQCAw8PFgoeCUFjY2Vzc0tleQUBLx4PQXJyb3dJbWFnZVdpZHRoAgUeEEFycm93SW1hZ2VIZWlnaHQCAx4RQXJyb3dJbWFnZU9mZnNldFhmHhFBcnJvd0ltYWdlT2Zmc2V0WQLrA2RkAgEPZBYCAgUPZBYCAgEPEBYCHwRoZBQrAQBkAhcPZBYIZg8PFgQfAwUPRW5nbGlzaCBWZXJzaW9uHgtOYXZpZ2F0ZVVybAVfL0VOL1F1ZUVzTGFBc2FtYmxlYS9Db21wb3NpY2lvbmRlbGFBc2FtYmxlYS9Mb3NEaXB1dGFkb3MvUGFnZXMvUmVsYWNpb25BbGZhYmV0aWNhRGlwdXRhZG9zLmFzcHhkZAICDw8WBB8DBQZQcmVuc2EfDgUyL0VTL0JpZW52ZW5pZGFQcmVuc2EvUGFnaW5hcy9CaWVudmVuaWRhUHJlbnNhLmFzcHhkZAIEDw8WBB8DBRpJZGVudGlmaWNhY2nDs24gZGUgVXN1YXJpbx8OBTQvRVMvQXJlYVVzdWFyaW9zL1BhZ2luYXMvSWRlbnRpZmljYWNpb25Vc3Vhcmlvcy5hc3B4ZGQCBg8PFgQfAwUGQ29ycmVvHw4FKGh0dHA6Ly9vdXRsb29rLmNvbS9vd2EvYXNhbWJsZWFtYWRyaWQuZXNkZAIlD2QWAgIDD2QWAgIBDxYCHwALKwQBZAI1D2QWAgIHD2QWAgIBDw8WAh8EaGQWAgIDD2QWAmYPZBYCAgMPZBYCAgUPDxYEHgZIZWlnaHQbAAAAAAAAeUABAAAAHgRfIVNCAoABZBYCAgEPPCsACQEADxYEHg1QYXRoU2VwYXJhdG9yBAgeDU5ldmVyRXhwYW5kZWRnZGQCSQ9kFgICAg9kFgICAQ9kFgICAw8WAh8ACysEAWQYAgVBY3RsMDAkUGxhY2VIb2xkZXJMZWZ0TmF2QmFyJFVJVmVyc2lvbmVkQ29udGVudDMkVjRRdWlja0xhdW5jaE1lbnUPD2QFKUNvbXBvc2ljacOzbiBkZSBsYSBBc2FtYmxlYVxMb3MgRGlwdXRhZG9zZAVHY3RsMDAkUGxhY2VIb2xkZXJUb3BOYXZCYXIkUGxhY2VIb2xkZXJIb3Jpem9udGFsTmF2JFRvcE5hdmlnYXRpb25NZW51VjQPD2QFGkluaWNpb1xRdcOpIGVzIGxhIEFzYW1ibGVhZJ',
'__EVENTVALIDATION': '/wEWCALIhqvYAwKh2YVvAuDF1KUDAqCK1bUOAqCKybkPAqCKnbQCAqCKsZEJAvejv84Dtkx5dCFr3QGqQD2wsFQh8nP3iq8',
'__VIEWSTATEGENERATOR': 'BAB98CB3',
'__REQUESTDIGEST': '0x476239970DCFDABDBBDF638A1F9B026BD43022A10D1D757B05F1071FF3104459B4666F96A47B4845D625BCB2BE0D88C6E150945E8F5D82C189B56A0DA4BC859D'}
            yield scrapy.FormRequest(url=url, formdata=formdata, callback=self.takeEachParty)

    def takeEachParty(self, response):
        print response.css('ul.listadoVert02 ul li::text').extract()
Going into the source code of the website, I can see how the links look and how they send the JavaScript query. This is one of the links I need to access:
<a id="ctl00_m_g_36ea0310_893d_4a19_9ed1_88a133d06423_ctl00_Repeater1_ctl00_lnk_Grupo" href="javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl00$lnk_Grupo", "", true, "", "", false, true))">Grupo Parlamentario Popular de la Asamblea de Madrid</a>
I have been reading many articles about this, but the problem is probably my ignorance in this respect.
Thanks in advance.
EDITED:
SOLUTION: I finally did it! I translated the very helpful code from Padraic Cunningham into the Scrapy way. Since I asked the question about Scrapy, I want to post the result in case someone has the same problem I had.
So here it goes:
import scrapy
import js2xml

class AsambleaMadrid(scrapy.Spider):
    name = "AsambleaMadrid"
    start_urls = ['http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx']

    def parse(self, response):
        source = response
        hrefs = response.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href").extract()
        form_data = self.validate(source)
        for ref in hrefs:
            # js2xml allows us to parse the JS function and its params, and so to grab the __EVENTTARGET
            js_xml = js2xml.parse(ref)
            _id = js_xml.xpath(
                "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[0]
            form_data["__EVENTTARGET"] = _id.text
            url_diputado = 'http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx'
            # the proper way to send a POST in Scrapy is with a FormRequest
            yield scrapy.FormRequest(url=url_diputado, formdata=form_data, callback=self.extract_parties, method='POST')

    def validate(self, source):
        # these fields are the minimum required and cannot be hardcoded
        data = {"__VIEWSTATEGENERATOR": source.xpath("//*[@id='__VIEWSTATEGENERATOR']/@value")[0].extract(),
                "__EVENTVALIDATION": source.xpath("//*[@id='__EVENTVALIDATION']/@value")[0].extract(),
                "__VIEWSTATE": source.xpath("//*[@id='__VIEWSTATE']/@value")[0].extract(),
                "__REQUESTDIGEST": source.xpath("//*[@id='__REQUESTDIGEST']/@value")[0].extract()}
        return data

    def extract_parties(self, response):
        source = response
        name = source.xpath("//ul[@class='listadoVert02']/ul/li/a/text()").extract()
        print name
I hope this is clear. Thanks again, everybody!
If you look at the data posted to the form in Chrome or Firebug, you can see there are many fields passed in the POST request. A few of them are essential and must be parsed from the original page; parsing the ids from the div.sangria ul li a tags is not sufficient, as the actual data posted is slightly different. What gets posted is inside the JavaScript function WebForm_DoPostBackWithOptions, which is in the href, not the id attribute:
href='javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl03$lnk_Grupo", "", true, "", "", false, true))'>
Sometimes all the underscores are simply replaced with dollar signs, in which case a str.replace would get them into the correct form, but not really in this case. We could use a regex to parse it out, but I like the js2xml lib, which can parse a JavaScript function and its arguments into an XML tree.
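For example, a quick sketch of the tree js2xml builds for one of these hrefs, which is what the XPath in the code below queries:

import js2xml

href = '''javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl03$lnk_Grupo", "", true, "", "", false, true))'''
tree = js2xml.parse(href)
print(js2xml.pretty_print(tree))
# the call shows up as <identifier name="WebForm_PostBackOptions"/> followed by an
# <arguments> node whose first <string> child is the __EVENTTARGET we need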
The following code, using requests, shows how you can get the required fields from the initial request and then get to all the pages you want:
import requests
from lxml import html
import js2xml

post = "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx"

def validate(xml):
    # these fields are the minimum required and cannot be hardcoded
    data = {"__VIEWSTATEGENERATOR": xml.xpath("//*[@id='__VIEWSTATEGENERATOR']/@value")[0],
            "__EVENTVALIDATION": xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0],
            "__VIEWSTATE": xml.xpath("//*[@id='__VIEWSTATE']/@value")[0],
            "__REQUESTDIGEST": xml.xpath("//*[@id='__REQUESTDIGEST']/@value")[0]}
    return data

with requests.Session() as s:
    # make the initial request to get the links/hrefs and the form fields
    r = s.get(
        "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx")
    xml = html.fromstring(r.content)
    hrefs = xml.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href")
    form_data = validate(xml)
    for h in hrefs:
        js_xml = js2xml.parse(h)
        _id = js_xml.xpath(
            "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[0]
        form_data["__EVENTTARGET"] = _id.text
        r = s.post(post, data=form_data)
        xml = html.fromstring(r.content)
        print(xml.xpath("//ul[@class='listadoVert02']/ul/li/a/text()"))
If we run the code above, we see the different text output from all the anchor tags:
In [2]: with requests.Session() as s:
   ...:     r = s.get(
   ...:         "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx")
   ...:     xml = html.fromstring(r.content)
   ...:     hrefs = xml.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href")
   ...:     form_data = validate(xml)
   ...:     for h in hrefs:
   ...:         js_xml = js2xml.parse(h)
   ...:         _id = js_xml.xpath(
   ...:             "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[0]
   ...:         form_data["__EVENTTARGET"] = _id.text
   ...:         r = s.post(post, data=form_data)
   ...:         xml = html.fromstring(r.content)
   ...:         print(xml.xpath("//ul[@class='listadoVert02']/ul/li/a/text()"))
   ...:
[u'Abo\xedn Abo\xedn, Sonsoles Trinidad', u'Adrados Gautier, M\xaa Paloma', u'Aguado Del Olmo, M\xaa Josefa', u'\xc1lvarez Padilla, M\xaa Nadia', u'Arribas Del Barrio, Jos\xe9 M\xaa', u'Ballar\xedn Valc\xe1rcel, \xc1lvaro C\xe9sar', u'Berrio Fern\xe1ndez-Caballero, M\xaa In\xe9s', u'Berzal Andrade, Jos\xe9 Manuel', u'Cam\xedns Mart\xednez, Ana', u'Carballedo Berlanga, M\xaa Eugenia', 'Cifuentes Cuencas, Cristina', u'D\xedaz Ayuso, Isabel Natividad', u'Escudero D\xedaz-Tejeiro, Marta', u'Fermosel D\xedaz, Jes\xfas', u'Fern\xe1ndez-Quejo Del Pozo, Jos\xe9 Luis', u'Garc\xeda De Vinuesa Gardoqui, Ignacio', u'Garc\xeda Mart\xedn, Mar\xeda Bego\xf1a', u'Garrido Garc\xeda, \xc1ngel', u'G\xf3mez Ruiz, Jes\xfas', u'G\xf3mez-Angulo Rodr\xedguez, Juan Antonio', u'Gonz\xe1lez Gonz\xe1lez, Isabel Gema', u'Gonz\xe1lez Jim\xe9nez, Bartolom\xe9', u'Gonz\xe1lez Taboada, Jaime', u'Gonz\xe1lez-Mo\xf1ux V\xe1zquez, Elena', u'Gonzalo L\xf3pez, Rosal\xeda', 'Izquierdo Torres, Carlos', u'Li\xe9bana Montijano, Pilar', u'Mari\xf1o Ortega, Ana Isabel', u'Moraga Valiente, \xc1lvaro', u'Mu\xf1oz Abrines, Pedro', u'N\xfa\xf1ez Guijarro, Jos\xe9 Enrique', u'Olmo Fl\xf3rez, Luis Del', u'Ongil Cores, M\xaa Gador', 'Ortiz Espejo, Daniel', u'Ossorio Crespo, Enrique Mat\xedas', 'Peral Guerra, Luis', u'P\xe9rez Baos, Ana Isabel', u'P\xe9rez Garc\xeda, David', u'Pla\xf1iol De Lacalle, Regina M\xaa', u'Redondo Alcaide, M\xaa Isabel', u'Roll\xe1n Ojeda, Pedro', u'S\xe1nchez Fern\xe1ndez, Alejandro', 'Sanjuanbenito Bonal, Diego', u'Serrano Guio, Jos\xe9 Tom\xe1s', u'Serrano S\xe1nchez-Capuchino, Alfonso Carlos', 'Soler-Espiauba Gallo, Juan', 'Toledo Moreno, Lucila', 'Van-Halen Acedo, Juan']
[u'Andaluz Andaluz, M\xaa Isabel', u'Ardid Jim\xe9nez, M\xaa Isabel', u'Carazo G\xf3mez, M\xf3nica', u'Casares D\xedaz, M\xaa Luc\xeda Inmaculada', u'Cepeda Garc\xeda De Le\xf3n, Jos\xe9 Carmelo', 'Cruz Torrijos, Diego', u'Delgado G\xf3mez, Carla', u'Franco Pardo, Jos\xe9 Manuel', u'Freire Campo, Jos\xe9 Manuel', u'Gabilondo Pujol, \xc1ngel', 'Gallizo Llamas, Mercedes', u"Garc\xeda D'Atri, Ana", u'Garc\xeda-Rojo Garrido, Pedro Pablo', u'G\xf3mez Montoya, Rafael', u'G\xf3mez-Chamorro Torres, Jos\xe9 \xc1ngel', u'Gonz\xe1lez Gonz\xe1lez, M\xf3nica Silvana', u'Leal Fern\xe1ndez, M\xaa Isaura', u'Llop Cuenca, M\xaa Pilar', 'Lobato Gandarias, Juan', u'L\xf3pez Ruiz, M\xaa Carmen', u'Manguan Valderrama, Eva M\xaa', u'Maroto Illera, M\xaa Reyes', u'Mart\xednez Ten, Carmen', u'Mena Romero, M\xaa Carmen', u'Moreno Navarro, Juan Jos\xe9', u'Moya Nieto, Encarnaci\xf3n', 'Navarro Lanchas, Josefa', 'Nolla Estrada, Modesto', 'Pardo Ortiz, Josefa Dolores', u'Quintana Viar, Jos\xe9', u'Rico Garc\xeda-Hierro, Enrique', u'Rodr\xedguez Garc\xeda, Nicol\xe1s', u'S\xe1nchez Acera, Pilar', u'Sant\xedn Fern\xe1ndez, Pedro', 'Segovia Noriega, Juan', 'Vicente Viondi, Daniel', u'Vinagre Alc\xe1zar, Agust\xedn']
['Abasolo Pozas, Olga', 'Ardanuy Pizarro, Miguel', u'Beirak Ulanosky, Jazm\xedn', u'Camargo Fern\xe1ndez, Ra\xfal', 'Candela Pokorna, Marco', 'Delgado Orgaz, Emilio', u'D\xedaz Rom\xe1n, Laura', u'Espinar Merino, Ram\xf3n', u'Espinosa De La Llave, Mar\xeda', u'Fern\xe1ndez Rubi\xf1o, Eduardo', u'Garc\xeda G\xf3mez, M\xf3nica', 'Gimeno Reinoso, Beatriz', u'Guti\xe9rrez Benito, Eduardo', 'Huerta Bravo, Raquel', u'L\xf3pez Hern\xe1ndez, Isidro', u'L\xf3pez Rodrigo, Jos\xe9 Manuel', u'Mart\xednez Abarca, Hugo', u'Morano Gonz\xe1lez, Jacinto', u'Ongil L\xf3pez, Miguel', 'Padilla Estrada, Pablo', u'Ruiz-Huerta Garc\xeda De Viedma, Lorena', 'Salazar-Alonso Revuelta, Cecilia', u'San Jos\xe9 P\xe9rez, Carmen', u'S\xe1nchez P\xe9rez, Alejandro', u'Serra S\xe1nchez, Isabel', u'Serra S\xe1nchez, Clara', 'Sevillano De Las Heras, Elena']
[u'Aguado Crespo, Ignacio Jes\xfas', u'\xc1lvarez Cabo, Daniel', u'Gonz\xe1lez Pastor, Dolores', u'Iglesia Vicente, M\xaa Teresa De La', 'Lara Casanova, Francisco', u'Marb\xe1n De Frutos, Marta', u'Marcos Arias, Tom\xe1s', u'Meg\xedas Morales, Jes\xfas Ricardo', u'N\xfa\xf1ez S\xe1nchez, Roberto', 'Reyero Zubiri, Alberto', u'Rodr\xedguez Dur\xe1n, Ana', u'Rubio Ruiz, Juan Ram\xf3n', u'Ruiz Fern\xe1ndez, Esther', u'Sol\xeds P\xe9rez, Susana', 'Trinidad Martos, Juan', 'Veloso Lozano, Enrique', u'Zafra Hern\xe1ndez, C\xe9sar']
You can add the exact same logic to your spider; I just used requests to show you a working example. You should also be aware that not every ASP.NET site behaves the same: you may have to re-validate for every post, as in this related answer.
I think that Scrapy's FormRequest.from_response could help you a lot here, since it pre-fills the form's hidden fields (__VIEWSTATE, __EVENTVALIDATION, and so on) from the response (maybe this isn't the best regex for it, but you'll get the idea); try something like this:
import scrapy
import urllib
from scrapy.http.request.form import FormRequest

class AsambleaMadrid(scrapy.Spider):
    name = "Asamblea_Madrid"
    start_urls = ['http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx']

    def parse(self, response):
        ids_re = r'WebForm_PostBackOptions\(([^,]*)'
        for id in response.css('#moduloBusqueda li a').re(ids_re):
            target = urllib.unquote(id).strip('"')
            formdata = {'__EVENTTARGET': target}
            request = FormRequest.from_response(response=response,
                                                formdata=formdata,
                                                callback=self.takeEachParty,
                                                dont_click=True)
            yield request

    def takeEachParty(self, response):
        print response.css('.listadoVert02 li a::text').extract()
I agree with ELRuLL - Firebug is your best friend while scraping.
If you want to avoid simulating the JS, then you need to reproduce carefully all the params/headers that are being sent.
For example, from what I see, for __EVENTTARGET you're sending just the id:
ctl00_m_g_36ea0310_893d_4a19_9ed1_88a133d06423_ctl00_Repeater2_ctl01_lnk_Diputado
while via Firebug we see that what is actually sent is:
__EVENTTARGET=ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater2$ctl01$lnk_Diputado
It may be the reason, and maybe not; just repeat and test.
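A small sketch of pulling that dollar-separated target out of the href shown in the question, rather than reading the id attribute:

import re

# sketch: the posted __EVENTTARGET is the first argument of WebForm_PostBackOptions
href = ('javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions('
        '"ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl00$lnk_Grupo", '
        '"", true, "", "", false, true))')
target = re.search(r'WebForm_PostBackOptions\("([^"]+)"', href).group(1)
print(target)  # ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl00$lnk_Grupo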
Firebug link just in case.

Scrapy: scraping javascript and nested url

I need to scrape a webpage, and I normally use Scrapy. I need to follow some links that can only be opened through JavaScript, and they are nested inside some <ul> and <li> elements.
For example:
<ul class="level1">
<li class="closed"> <----this become "expanded" when opened
<a href="javascript:etc...
<ul class="level2">
<li class="closed">
<ul class="level3">
<li class="track">
<a href="this_is_the_url_that_I_want">
Now, do I need something other than Scrapy (I see that Selenium is suggested), or can I use an XmlLinkExtractor? Or can I somehow use code to extract the URL inside "level3"?
Thanks
EDIT: I'm trying to use Selenium, but I get:

  File "/usr/lib/pymodules/python2.7/scrapy/spiderloader.py", line 40, in load
    raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: '

I'm naming the spider, so I don't understand what I've done wrong.
import scrapy
from selenium import webdriver

class audioSpider(scrapy.Spider):
    name = "audio"
    allowed_domains = ["http://audio.sample"]
    start_urls = ["http://audio.sample/archive-project"]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        el1 = self.driver.find_element_by_xpath('//ul[@class="level1"]/li[@class]/href')
        el1.click()
        el2 = self.driver.find_element_by_xpath('//id[@class="subNavContainer loaded"/ul[@class="level2"]/li[@class]/href')
        el2.click()
        el3 = self.driver.find_element_by_xpath('//id[@class="subNavContainer loaded"/ul[@class="level3"]/li[@class="track"]/href')
        print el3
