Web Scraping the "El Peruano" Journal - Python/Scrapy

I'm trying to scrape some info from the "El Peruano" journal, but I can't get it working. At first sight, it looks like I have to:
Go to the El Peruano website.
Put a date in a form box.
Click the search button.
Follow every result link to get the "Title", "Resolution #" and "Body".
This is my code:
import scrapy


class SpiderPeruano(scrapy.Spider):
    name = "peruano"
    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]
    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }

    def parse_click(self, response):
        # i put here a condition but i think is not necessary
        # button = response.xpath("//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']").get()
        # if button:
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata={"cddesde": "08/03/2022", "cdhasta:": "08/03/2022"},
            dont_click=True,
            dont_filter=True,
            callback=self.parse
        )

    def parse(self, response):
        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)

    def parse_link(self, response):
        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()
        yield {
            "title": title,
            "num": num,
            "body": body
        }

# call:
# scrapy crawl peruano

# url = "https://diariooficial.elperuano.pe/normas"
# Form_BOX: "//form[@action]"
# Box_desde = "//form[@action]/input[@id='cddesde']"
# Box_hasta = "//form[@action]/input[@id='cdhasta']"
# Button = "//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']"
# links = "//div[@class='ediciones_texto']/h5/a/@href"
# titles = "//div[@class='story']/h1[@class='sumilla']/text()"
# resolutionNum = "//div[@class='story']/h2[@class='resoluci-n']/text()"
# body = "//div[@class='story']/p/text()"
So, I need some help figuring out what I'm doing wrong with my code, because it runs without errors but doesn't get the data.
Thanks a lot for your time and help!

I found two mistakes:
First:
Scrapy gets the URL from start_urls and sends the response to parse (as the default callback), but you expect it in parse_click (to send the form). If I rename the functions, then it sends the form.
Second:
A small typo: in formdata you use the string "cdhasta:" with a : at the end, and this caused problems.
import scrapy


class SpiderPeruano(scrapy.Spider):
    name = "peruano"
    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]
    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }

    def parse(self, response):
        print('[parse] url:', response.url)
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata={"cddesde": "01/03/2022", "cdhasta": "03/03/2022", "btnBuscar": ""},
            dont_click=True,
            dont_filter=True,
            #headers={'Referer': "https://diariooficial.elperuano.pe/Normas", 'X-Requested-With': 'XMLHttpRequest'},
            callback=self.parse_result
        )

    def parse_result(self, response):
        print('[parse_result] url:', response.url)
        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)

    def parse_link(self, response):
        print('[parse_link] url:', response.url)
        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()
        yield {
            "title": title,
            "num": num,
            "body": body
        }


# --- run without project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # save in a file: CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(SpiderPeruano)
c.start()
EDIT:
Meanwhile, I tested it with requests as well, but I didn't try to get the links from the response to scrape the details.
import requests

# --- GET ---

headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}

url = 'https://diariooficial.elperuano.pe/Normas'

response = requests.get(url, headers=headers)
print(response)

# --- POST ---

url = 'https://diariooficial.elperuano.pe/Normas/Filtro?dateparam=03/08/2022 00:00:00'

params = {
    'cddesde': '01/03/2022',
    'cdhasta': '03/03/2022',
    # 'X-Requested-With': 'XMLHttpRequest',
}

headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # 'Referer': "https://diariooficial.elperuano.pe/Normas",
    # 'X-Requested-With': 'XMLHttpRequest'
}

response = requests.post(url, data=params, headers=headers)
print(response)
print(response.text[:1000])
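Since this requests version stops before following the result links, here is a minimal sketch, an assumption reusing the spider's XPaths, of how the links and details could be pulled out of that POST response with parsel (the selector library Scrapy itself uses):

from urllib.parse import urljoin

from parsel import Selector  # pip install parsel (already installed with Scrapy)

# Assumption: 'response' and 'headers' come from the POST above and the results
# use the same markup the spider's XPaths target.
selector = Selector(text=response.text)
links = selector.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()

for link in links:
    detail_url = urljoin('https://diariooficial.elperuano.pe/', link)
    detail = Selector(text=requests.get(detail_url, headers=headers).text)
    print({
        "title": detail.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get(),
        "num": detail.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall(),
        "body": detail.xpath("//div[@class='story']/p/text()").getall(),
    })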

Related

Button not clicking with scrapy playwright

I am attempting to click on an SSO login button for a platform by testing its button functionality with scrapy-playwright. I have entered an incorrect email, so after clicking the button it should show a text error saying the email is incorrect. However, nothing seems to happen.
For example:
import scrapy
from scrapy_playwright.page import PageMethod
from path import Path


class telSpider(scrapy.Spider):
    name = 'tel'
    start_urls = 'https://my.tealiumiq.com/login/sso/'
    customer_settings = {
        'USER-AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15',
        'CONNECTION': 'keep-alive',
        'ACCEPT': 'application/json, text/javascript, */*; q=0.01'
    }

    def start_requests(self):
        yield scrapy.Request(
            self.start_urls,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('waitForLoadState', state='domcontentloaded'),
                    PageMethod("fill", selector='#email', value="incorrect@email.com"),
                    PageMethod('wait_for_selector', selector="span[data-i18n='login.form.btn_submit']", state='visible'),
                    PageMethod("click", selector="#submitBtn", button="middle", delay=2000, force=True),
                    PageMethod("waitForEvent", event="click"),
                    PageMethod("screenshot", path=Path(__file__).parent / "tealium.png", full_page=True),
                ]
            ),
            callback=self.parse
        )

    def parse(self, response):
        print(response)
I have attempted evaluate instead, but for some reason this will only do one event or the other. For example, it either fills the email or clicks, not one after the other or both.
"""(function() {
const setValue = Object.getOwnPropertyDescriptor(
window.HTMLInputElement.prototype,
"value"
).set;
const modifyInput = (name, value) => {
const input = document.getElementsByName(name)[0]
setValue.call(input, value)
input.dispatchEvent(new Event('input', { bubbles: true}))
};
modifyInput('email', "incorrect#email.com");
document.querySelector("#submitBtn").click();
}());"""
Expected output: an error message on the page saying the email is incorrect.
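For comparison, a minimal sketch of the same fill-and-click sequence written with the snake_case method names of Playwright's Python API (the camelCase names above, such as waitForLoadState, are not available on the Python Page object); the error-message selector is only a placeholder assumption:

import scrapy
from scrapy_playwright.page import PageMethod


class TelSketchSpider(scrapy.Spider):
    # A sketch only, assuming scrapy-playwright's download handler is enabled in settings.
    name = 'tel_sketch'

    def start_requests(self):
        yield scrapy.Request(
            'https://my.tealiumiq.com/login/sso/',
            meta={
                'playwright': True,
                'playwright_page_methods': [
                    PageMethod('wait_for_load_state', 'domcontentloaded'),
                    PageMethod('fill', '#email', 'incorrect@email.com'),
                    PageMethod('click', '#submitBtn'),
                    # Placeholder assumption: wait for whatever element renders the error text.
                    PageMethod('wait_for_selector', '.error-message', state='visible'),
                ],
            },
            callback=self.parse,
        )

    def parse(self, response):
        print(response.text[:500])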

How is this site forming the headers on a POST request?

I am trying to learn how the headers are being constructed when a zipcode is entered by the user and a "POST" command is issued (by clicking on the "Shop Now" button) from the following website:
I believe the interesting part of this "POST" request is how the site is forming the following headers but I can't figure out how it is doing it (my suspicion is that there is some JavaScript/Angular code that is responsible):
x-ccwfdfx7-a
x-ccwfdfx7-b
x-ccwfdfx7-c
x-ccwfdfx7-d
x-ccwfdfx7-f
x-ccwfdfx7-z
So I have tried to use the requests module to log in as a guest to learn more about how this flow works:
with requests.Session()
with cloudscraper.create_scraper()
So far all my attempts have FAILED. Here is my code:
import requests
from requests_toolbelt.utils import dump  # pip install requests_toolbelt
import cloudscraper  # pip install cloudscraper

# with requests.Session() as session:
with cloudscraper.create_scraper(
    browser={
        'custom': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
) as session:
    CITY = XXXXX
    ZIPCODE = XXXXX

    # get cookies
    url = 'http://www.peapod.com'
    res1 = session.get(url)
    session.headers['Referer'] = 'https://www.peapod.com/'

    # get more cookies
    url = 'http://www.peapod.com/login'
    res2 = session.get(url)

    # get more cookies
    url = 'https://www.peapod.com/ppd/bundles/js/ppdBundle.js'
    res3 = session.get(url)

    # get all the service locations
    response = session.get('https://www.peapod.com/api/v4.0/serviceLocations',
        params={
            'customerType': 'C',
            'zip': ZIPCODE
        }
    )

    try:
        loc_id = list(
            filter(
                lambda x: x.get('location', {}).get('city') == CITY,
                response.json()['response']['locations']
            )
        )[0]['location']['id']
    except IndexError:
        raise ValueError("Can't find City '{}' -> Zip {}".format(CITY, ZIPCODE))

    # login as guest
    response = session.post('https://www.peapod.com/api/v4.0/user/guest',
        json={
            'customerType': 'C',
            'cities': None,
            'email': None,
            'serviceLocationId': loc_id,
            'zip': ZIPCODE
        },
        params={
            'serviceLocationId': loc_id,
            'zip': ZIPCODE
        }
    )
This produces some sort of error message saying I'm blocked, which I believe is because I can't figure out how the browser constructs the ccwfdfx7 headers in the "POST" request (my suspicion is that there is some JavaScript/Angular code responsible for constructing these headers, but I can't find it and I'm hoping someone can help).
On the same computer, the Chrome browser is able to log in just fine.
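As a debugging aid rather than a fix for the generated headers, the dump utility that is already imported above can print exactly what the session sent and received, so it can be compared side by side with the browser's request in the network tab; a short sketch:

from requests_toolbelt.utils import dump

# Dump the full request/response cycle of the last call (all headers and
# cookies included) for comparison with what the browser sends.
print(dump.dump_all(response).decode('utf-8', errors='replace'))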

POST request takes too long with the fetch API

I have developed an HTTP server in Python 3 with http.server.
I have run into a problem: the server never finishes sending the POST response, so the browser keeps loading for a long time.
Which side (server or browser) causes this problem, and how do I fix it?
Python3.6.3
macOS 10.13
import http.server


class MyHandler(http.server.BaseHTTPRequestHandler):
    protocol_version = "HTTP/1.1"

    def do_POST(self):
        # Now this method just prints path and content-type.
        print("POSTED")
        content_type = self.headers["Content-Type"]
        print(content_type)
        print(self.path)
        if "multipart/form-data" in content_type:
            raw_data = self.rfile.read()
            self.send_response(200, self.responses[200][0])
            self.send_header("access-control-allow-origin", "*")
            self.send_header("connection", "close")
            self.end_headers()
            # WIP: do something...

    def do_GET(self):
        if self.path[0] == "/":
            self.path = self.path[1:]
        try:
            with open(self.path, "rb") as f:
                file_data = f.read()
        except FileNotFoundError:
            self.send_response(404, self.responses[404][0])
            self.end_headers()
            return
        content_length = len(file_data)
        self.send_response(200, self.responses[200][0])
        self.send_header("content-length", content_length)
        self.end_headers()
        self.wfile.write(file_data)


def parse_post():
    # WIP
    pass


httpd = http.server.HTTPServer(("", 6788), MyHandler)
print("Address:", "", "Port:", 6788)
httpd.serve_forever()
browser js:
let formdata = new FormData();
formdata.append("Hello", "World");

fetch("http://localhost:6788/nk", {
    method: "post",
    mode: "cors",
    body: formdata,
})
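One likely cause, offered as an assumption rather than a confirmed diagnosis: with protocol_version = "HTTP/1.1" the connection is kept alive, so self.rfile.read() with no size argument waits for the client to close the connection and blocks until the browser gives up. A sketch of a do_POST that reads exactly Content-Length bytes and declares the length of its own response:

import http.server


class FixedHandler(http.server.BaseHTTPRequestHandler):
    protocol_version = "HTTP/1.1"

    def do_POST(self):
        # Read only as many bytes as the client declared, instead of
        # blocking until the connection is closed.
        length = int(self.headers.get("Content-Length", 0))
        raw_data = self.rfile.read(length)

        body = b"ok"
        self.send_response(200)
        self.send_header("Access-Control-Allow-Origin", "*")
        # With keep-alive, Content-Length tells the browser the response is complete.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


if __name__ == "__main__":
    http.server.HTTPServer(("", 6788), FixedHandler).serve_forever()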

How to visit a page using requests with a cookie?

I want to visit zoomeye.org using the requests module; the cookie from Firebug is as follows:
__jsluid=470133a1338c0be13b6fdccf396772c3; csrftoken=WG6eSMS9XaLZfLjICiin8esg1qO3UOFl; Hm_lvt_e58da53564b1ec3fb2539178e6db042e=1448411456; Hm_lpvt_e58da53564b1ec3fb2539178e6db042e=1448505898; __jsl_clearance=1448505830.313|0|EwXSRp%2BrIEF5DR0E5WALlzLMV2Q%3D
The script to read the web page content:
import requests

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en;q=0.5",
    "Connection": "keep-alive",
    "Host": "www.zoomeye.org",
    "Referer": "https://www.zoomeye.org/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:41.0) Gecko/20100101 Firefox/41.0"
}

data = open("cookie.txt", "r").read()

cookieDict = {}
for item in data.split(";"):
    keyValue = item.split("=")
    cookieDict[keyValue[0]] = keyValue[1]

url = "https://www.zoomeye.org/search?q=apache"
r = requests.get(url, cookies=cookieDict, headers=headers)
print r.content
But i fail to read web page content, output as follows:
<script>var dc="";var t_d={hello:"world",t_c:function(x){if(x==="")return;if(x.s
lice(-1)===";"){x=x+" ";};if(x.slice(-2)!=="; "){x=x+"; ";};dc=dc+x;}};(function
(a){eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c
=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){
while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\
w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);
return p;}('b d=[5,4,0,1,2,3];b o=[];b p=0;g(b i=d.c;i--;){o[d[i]]=a[i]}o=o.m(\'
\');g(b i=0;i<o.c;i++){l(o.q(i)===\';\'){s(o,p,i);p=i+1}}s(o,p,o.c);j s(t,r,n){k
.h(t.y(r,n))};w("f.e=f.e.v(/[\\?|&]u-x/, \'\')",z);',36,36,'|||||||||||var|lengt
h||href|location|for|t_c||function|t_d|if|join||||charAt||||captcha|replace|setT
imeout|challenge|substring|1500'.split('|'),0,{}));})(['45 GMT;Path=/;', ' 26-No
v-15 03:52:', '__jsl_clearance=1448506365.', '687|0|rtcCTV', 'xuWxRiE8%2BC0', 'W
WncvYkCpQ%3D;Expires=Thu,']);document.cookie=dc;</script>
Where is the problem? If you know a better solution for this question, please tell me. Thanks.
For some reason the website does not like your user agent. Remove the user agent header and it will work.
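As a side note on the cookie parsing, independent of the user-agent fix above: Python 3's SimpleCookie handles the raw Cookie string without manual splitting and without keeping the space after each semicolon; a sketch:

from http.cookies import SimpleCookie

import requests

raw_cookie = open("cookie.txt", "r").read()

# SimpleCookie parses the "name=value; name=value" format directly.
cookie = SimpleCookie()
cookie.load(raw_cookie)
cookieDict = {name: morsel.value for name, morsel in cookie.items()}

r = requests.get("https://www.zoomeye.org/search?q=apache", cookies=cookieDict)
print(r.status_code)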

ExtJS, Flask and AJAX: cross-domain request

I'm developing a RESTful application with ExtJS (client) and Flask (server): client and server are linked by a protocol.
The problem comes when I try to do an AJAX request to the server, like this:
Ext.Ajax.request({
    url: 'http://localhost:5000/user/update/' + userId,
    method: 'POST',
    xmlData: xmlUser,
    disableCaching: false,
    headers: {
        'Content-Type': 'application/xml'
    },
    success: function (res) {
        // something here
    },
    failure: function (res) {
        // something here
    }
});
With the above request, the client is trying to update the user information.
Unfortunately, this is a cross-domain request (details).
The server handles that request as follows:
@app.route("/user/update/<user_id>", methods=['GET', 'POST'])
def user_update(user_id):
    return user_id
What I see on the browser console is an OPTIONS request instead of POST.
Then, I tried to start the Flask application on port 80, but that's not possible, obviously:
app.run(host="127.0.0.1", port=80)
In conclusion, I don't understand how the client can interact with the server if it cannot make any AJAX requests.
How can I get around this problem?
Here's an excellent decorator for CORS with Flask.
http://flask.pocoo.org/snippets/56/
Here's the code for posterity if the link goes dead:
from datetime import timedelta
from flask import make_response, request, current_app
from functools import update_wrapper


def crossdomain(origin=None, methods=None, headers=None,
                max_age=21600, attach_to_all=True,
                automatic_options=True):
    if methods is not None:
        methods = ', '.join(sorted(x.upper() for x in methods))
    if headers is not None and not isinstance(headers, basestring):
        headers = ', '.join(x.upper() for x in headers)
    if not isinstance(origin, basestring):
        origin = ', '.join(origin)
    if isinstance(max_age, timedelta):
        max_age = max_age.total_seconds()

    def get_methods():
        if methods is not None:
            return methods
        options_resp = current_app.make_default_options_response()
        return options_resp.headers['allow']

    def decorator(f):
        def wrapped_function(*args, **kwargs):
            if automatic_options and request.method == 'OPTIONS':
                resp = current_app.make_default_options_response()
            else:
                resp = make_response(f(*args, **kwargs))
            if not attach_to_all and request.method != 'OPTIONS':
                return resp

            h = resp.headers
            h['Access-Control-Allow-Origin'] = origin
            h['Access-Control-Allow-Methods'] = get_methods()
            h['Access-Control-Max-Age'] = str(max_age)
            if headers is not None:
                h['Access-Control-Allow-Headers'] = headers
            return resp

        f.provide_automatic_options = False
        return update_wrapper(wrapped_function, f)
    return decorator
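A usage sketch, assuming the route from the question (the crossdomain decorator goes below @app.route, and Content-Type is allowed because the ExtJS request sends application/xml, which triggers a preflight):

from flask import Flask

app = Flask(__name__)


@app.route("/user/update/<user_id>", methods=['GET', 'POST', 'OPTIONS'])
@crossdomain(origin='*', headers=['Content-Type'])
def user_update(user_id):
    return user_id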
You get around the problem by using CORS
http://en.wikipedia.org/wiki/Cross-origin_resource_sharing
The module Flask-CORS makes it extremely simple to perform cross-domain requests:
from flask import Flask
from flask_cors import CORS

app = Flask(__name__)
cors = CORS(app, resources={r"/api/*": {"origins": "*"}})
See also: https://pypi.python.org/pypi/Flask-Cors
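One caveat: the resources pattern has to match the path being requested, and the /api/* example above would not cover the /user/update/<user_id> route from the question; a sketch that does:

from flask import Flask
from flask_cors import CORS

app = Flask(__name__)
# Allow cross-origin requests for the user routes the ExtJS client calls;
# Flask-CORS answers the OPTIONS preflight automatically.
CORS(app, resources={r"/user/*": {"origins": "*"}})


@app.route("/user/update/<user_id>", methods=['GET', 'POST'])
def user_update(user_id):
    return user_id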
