How is this site forming the headers on a POST request? - javascript

I am trying to learn how the headers are constructed when a zipcode is entered by the user and a POST request is issued (by clicking the "Shop Now" button) on the following website: https://www.peapod.com
I believe the interesting part of this POST request is how the site forms the following headers, but I can't figure out how it does it (my suspicion is that some JavaScript/Angular code is responsible):
x-ccwfdfx7-a
x-ccwfdfx7-b
x-ccwfdfx7-c
x-ccwfdfx7-d
x-ccwfdfx7-f
x-ccwfdfx7-z
So I have tried to use the requests module to log in as a guest and learn more about how this flow works:
with requests.Session()
with cloudscraper.create_scraper()
So far all my attempts have failed. Here is my code:
import requests
from requests_toolbelt.utils import dump  # pip install requests_toolbelt
import cloudscraper  # pip install cloudscraper

# with requests.Session() as session:
with cloudscraper.create_scraper(
    browser={
        'custom': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
) as session:
    CITY = 'XXXXX'     # placeholder, redacted in the question
    ZIPCODE = 'XXXXX'  # placeholder, redacted in the question

    # get cookies
    url = 'http://www.peapod.com'
    res1 = session.get(url)
    session.headers['Referer'] = 'https://www.peapod.com/'

    # get more cookies
    url = 'http://www.peapod.com/login'
    res2 = session.get(url)

    # get more cookies
    url = 'https://www.peapod.com/ppd/bundles/js/ppdBundle.js'
    res3 = session.get(url)

    # get all the service locations
    response = session.get('https://www.peapod.com/api/v4.0/serviceLocations',
        params={
            'customerType': 'C',
            'zip': ZIPCODE
        }
    )

    try:
        loc_id = list(
            filter(
                lambda x: x.get('location', {}).get('city') == CITY,
                response.json()['response']['locations']
            )
        )[0]['location']['id']
    except IndexError:
        raise ValueError("Can't find City '{}' -> Zip {}".format(CITY, ZIPCODE))

    # login as guest
    response = session.post('https://www.peapod.com/api/v4.0/user/guest',
        json={
            'customerType': 'C',
            'cities': None,
            'email': None,
            'serviceLocationId': loc_id,
            'zip': ZIPCODE
        },
        params={
            'serviceLocationId': loc_id,
            'zip': ZIPCODE
        }
    )
This produces some sort of error message saying I'm blocked, which I believe is because I can't figure out how the browser constructs the ccwfdfx7 headers in the POST request (my suspicion is that some JavaScript/Angular code is responsible for constructing them, but I can't find it, and I'm hoping someone could help).
On the same computer, the Chrome browser is able to log in just fine.
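One way to see how those headers are built, without reverse-engineering the anti-bot JavaScript, is to let a real browser run the page's own scripts and then capture the request it sends. Below is a minimal sketch, assuming the selenium-wire package and a matching chromedriver are installed; the manual pause is just a placeholder for whatever interaction fires the guest-login POST:

from seleniumwire import webdriver  # pip install selenium-wire

driver = webdriver.Chrome()
try:
    driver.get('https://www.peapod.com/')
    # enter the zipcode and click "Shop Now" in the opened browser window
    input('Press Enter after the POST to /user/guest has fired... ')

    for request in driver.requests:
        if request.method == 'POST' and '/api/v4.0/user/guest' in request.url:
            # the x-ccwfdfx7-* headers are added by the page's own JavaScript
            for name, value in request.headers.items():
                if name.lower().startswith('x-ccwfdfx7'):
                    print(name, '=', value)
finally:
    driver.quit()

The captured values are typically generated per session by the protection script, so replaying them from requests is unlikely to keep working for long; the sketch is mainly useful for confirming which request carries them and which script sets them.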

Related

Web Scraping Journal "El Peruano" - Python/Scrapy

I'm trying to scrape some info from the "El Peruano" journal, but I can't. At first sight it looks like I have to:
El Peruano Website
Put a date in the form box.
Click the search button.
Follow all the links to get the "Title", "Resolution #", and "Body".
This is my code:
import scrapy

class SpiderPeruano(scrapy.Spider):
    name = "peruano"

    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]

    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }

    def parse_click(self, response):
        # i put a condition here but i think it is not necessary
        # button = response.xpath("//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']").get()
        # if button:
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata={"cddesde": "08/03/2022", "cdhasta:": "08/03/2022"},
            dont_click=True,
            dont_filter=True,
            callback=self.parse
        )

    def parse(self, response):
        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)

    def parse_link(self, response):
        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()

        yield {
            "title": title,
            "num": num,
            "body": body
        }

# call
# scrapy crawl peruano

# url = "https://diariooficial.elperuano.pe/normas"
# Form_BOX: "//form[@action]"
# Box_desde = "//form[@action]/input[@id='cddesde']"
# Box_hasta = "//form[@action]/input[@id='cdhasta']"
# Button = "//div[@id='busqueda']/form[@action]/button[@id='btnBuscar']"
# links = "//div[@class='ediciones_texto']/h5/a/@href"
# titles = "//div[@class='story']/h1[@class='sumilla']/text()"
# resolutionNum = "//div[@class='story']/h2[@class='resoluci-n']/text()"
# body = "//div[@class='story']/p/text()"
So, I need some help to figure out what I'm doing wrong in my code, because it runs without errors but doesn't get the data.
Thanks a lot for your time and help!
I found two mistakes:
First:
Scrapy gets the URL from start_urls and sends the response to parse (as the default callback), but you expect it in parse_click (to send the form). If I rename the functions, then it sends the form.
Second:
A small typo: in formdata= you use the string "cdhasta:" with a : at the end, and this causes problems.
import scrapy

class SpiderPeruano(scrapy.Spider):
    name = "peruano"

    start_urls = [
        "https://diariooficial.elperuano.pe/Normas"
    ]

    custom_settings = {
        "FEED_URI": "peruano.json",
        "FEED_FORMAT": "json",
        "FEED_EXPORT_ENCODING": "utf-8"
    }

    def parse(self, response):
        print('[parse] url:', response.url)

        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[@id='space_PortalNormasLegalesN']",
            formdata={"cddesde": "01/03/2022", "cdhasta": "03/03/2022", "btnBuscar": ""},
            dont_click=True,
            dont_filter=True,
            #headers={'Referer': "https://diariooficial.elperuano.pe/Normas", 'X-Requested-With': 'XMLHttpRequest'},
            callback=self.parse_result
        )

    def parse_result(self, response):
        print('[parse_result] url:', response.url)

        links = response.xpath("//div[@class='ediciones_texto']/h5/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_link)

    def parse_link(self, response):
        print('[parse_link] url:', response.url)

        title = response.xpath("//div[@class='story']/h1[@class='sumilla']/text()").get()
        num = response.xpath("//div[@class='story']/h2[@class='resoluci-n']/text()").getall()
        body = response.xpath("//div[@class='story']/p/text()").getall()

        yield {
            "title": title,
            "num": num,
            "body": body
        }

# --- run without project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})

c.crawl(SpiderPeruano)
c.start()
EDIT:
Meanwhile, I also tested it with requests, but I didn't try to get the links from the response to fetch the details.
import requests

# --- GET ---

headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}

url = 'https://diariooficial.elperuano.pe/Normas'

response = requests.get(url, headers=headers)
print(response)

# --- POST ---

url = 'https://diariooficial.elperuano.pe/Normas/Filtro?dateparam=03/08/2022 00:00:00'

params = {
    'cddesde': '01/03/2022',
    'cdhasta': '03/03/2022',
    # 'X-Requested-With': 'XMLHttpRequest',
}

headers = {
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    # 'Referer': "https://diariooficial.elperuano.pe/Normas",
    # 'X-Requested-With': 'XMLHttpRequest'
}

response = requests.post(url, data=params, headers=headers)
print(response)
print(response.text[:1000])
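To take that one step further and pull the detail links out of the POST response (something the original test did not do), here is a rough sketch with BeautifulSoup, assuming the returned HTML uses the same markup the Scrapy selectors target:

from urllib.parse import urljoin
from bs4 import BeautifulSoup  # pip install beautifulsoup4

# same elements as the spider's XPath: //div[@class='ediciones_texto']/h5/a/@href
soup = BeautifulSoup(response.text, 'html.parser')
links = [urljoin('https://diariooficial.elperuano.pe/', a.get('href'))
         for a in soup.select('div.ediciones_texto h5 a')]

for link in links:
    detail = requests.get(link, headers=headers)
    print(detail.status_code, link)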

When adding a Job to scheduler: Value cannot be null, Job class cannot be null?

My question is very similar to:
Quartz.net - "Job's key cannot be null"
However, it's a different setup, as I am using a REST API.
I am able to run a job when adding it through Startup.cs; however, when I call the API to add a job using JavaScript, it fails with the error below:
ERROR:
System.ArgumentNullException: Value cannot be null. (Parameter 'typeName')
at System.RuntimeType.GetType(String typeName, Boolean throwOnError, Boolean ignoreCase, StackCrawlMark& stackMark)
at System.Type.GetType(String typeName)
at Quartz.Web.Api.JobsController.AddJob(String schedulerName, String jobGroup, String jobName, String jobType, Boolean durable, Boolean requestsRecovery, Boolean replace) in E:\Amit\DotNet\QuartzApi\QuartzApi\Controllers\JobsController.cs:line 108
at lambda_method14(Closure , Object )
at Microsoft.AspNetCore.Mvc.Infrastructure.ActionMethodExecutor.AwaitableResultExecutor.Execute(IActionResultTypeMapper mapper, ObjectMethodExecutor executor, Object controller, Object[] arguments)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.<InvokeActionMethodAsync>g__Logged|12_1(ControllerActionInvoker invoker)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.<InvokeNextActionFilterAsync>g__Awaited|10_0(ControllerActionInvoker invoker, Task lastTask, State next, Scope scope, Object state, Boolean isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.Rethrow(ActionExecutedContextSealed context)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.Next(State& next, Scope& scope, Object& state, Boolean& isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.InvokeInnerFilterAsync()
--- End of stack trace from previous location ---
at Microsoft.AspNetCore.Mvc.Infrastructure.ResourceInvoker.<InvokeFilterPipelineAsync>g__Awaited|19_0(ResourceInvoker invoker, Task lastTask, State next, Scope scope, Object state, Boolean isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ResourceInvoker.<InvokeAsync>g__Logged|17_1(ResourceInvoker invoker)
at Microsoft.AspNetCore.Routing.EndpointMiddleware.<Invoke>g__AwaitRequestTask|6_0(Endpoint endpoint, Task requestTask, ILogger logger)
at Microsoft.AspNetCore.Authorization.AuthorizationMiddleware.Invoke(HttpContext context)
at Microsoft.AspNetCore.Diagnostics.DeveloperExceptionPageMiddleware.Invoke(HttpContext context)
HEADERS
=======
Accept: application/json
Accept-Encoding: gzip, deflate, br
Accept-Language: en-US,en;q=0.9
Connection: close
Content-Length: 83
Content-Type: application/json
Host: localhost:44379
Referer: https://localhost:44379/jobs.html
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36
sec-ch-ua: "Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"
sec-ch-ua-mobile: ?0
origin: https://localhost:44379
sec-fetch-site: same-origin
sec-fetch-mode: cors
sec-fetch-dest: empty
SETUP:
In Visual Studio, I created the Quartz REST API and the front end in a single project. Running the project loads the web page with the jobs, with the API running in the background.
All controller endpoints work except AddJob (i.e. get jobs, view job details, pause, resume, trigger, and delete all work).
Dependency:
Quartz.Extensions.Hosting 3.3.3
JobsController.cs
quartznet/JobsController.cs at main · quartznet/quartznet · GitHub
[HttpPut]
[Route("{jobGroup}/{jobName}")]
public async Task AddJob(string schedulerName, string jobGroup, string jobName, string jobType, bool durable, bool requestsRecovery, bool replace = false)
{
    var scheduler = await GetScheduler(schedulerName).ConfigureAwait(false);
    var jobDetail = new JobDetailImpl(jobName, jobGroup, Type.GetType(jobType), durable, requestsRecovery);
    await scheduler.AddJob(jobDetail, replace).ConfigureAwait(false);
}
HelloWorldJob.cs:
https://andrewlock.net/using-quartz-net-with-asp-net-core-and-worker-services/
Startup.cs (adds a job without the API and runs it with a trigger at startup):
void ConfigureHostQuartz(IServiceCollection services)
{
    services.AddQuartz(q =>
    {
        q.UseMicrosoftDependencyInjectionScopedJobFactory();

        var jobKey = new JobKey("HelloWorldJob");
        q.AddJob<HelloWorldJob>(opts => opts.WithIdentity(jobKey));

        q.AddTrigger(opts => opts
            .ForJob(jobKey)
            .WithIdentity("HelloWorldJob-trigger")
            .WithCronSchedule("0/5 * * * * ?"));
    });

    services.AddQuartzHostedService(
        q => q.WaitForJobsToComplete = true);
}
HTML/JavaScript front end:
Following this example:
Tutorial: Call an ASP.NET Core web API with JavaScript | Microsoft Docs
<form action="javascript:void(0);" method="POST" onsubmit="addJob()">
    <input type="text" id="add-name" placeholder="New job">
    <input type="submit" value="Add">
</form>
<script>
    function addJob() {
        const addNameTextbox = document.getElementById('add-name').value.trim();

        const item = {
            jobType: "HelloWorldJob",
            durable: true,
            requestsRecovery: false,
            replace: false
        };

        fetch(`${uri}/DEFAULT/${addNameTextbox}`, {
            method: 'PUT',
            headers: {
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(item)
        })
            .then(response => console.log(response))
            .then(() => {
                getJobs();
                addNameTextbox.value = '';
            })
            .catch(error => console.error('Unable to add job.', error));
    }
</script>
I have tried updating the API to include jobType in the URL; it then gives a different error:
Job class cannot be null
at Quartz.Impl.JobDetailImpl.set_JobType(Type value)
You need to supply an assembly-qualified name as the job type. The problem is here:
jobType: "HelloWorldJob",
jobType should be something like "MyNameSpace.JobType, MyAssembly". You can probably get this written to the console with Console.WriteLine(typeof(HelloWorldJob).AssemblyQualifiedName); you can ignore the version etc., only the type name with namespace and the assembly name are needed.
Please also note that your setup has security implications, as you allow CLR types to be passed from the UI.
API controller changes:
As mentioned by Marko above, jobType needs a fully qualified name; the assembly reference is, however, not necessary in my case, as the jobs are in the same assembly.
[HttpPut]
[Route("{jobGroup}/{jobName}/{jobType}/{replace}/new")]
public async Task NewJob(string schedulerName, string jobGroup,
    string jobName, string jobType, bool replace = false)
{
    // Note: a job added without a trigger must be durable.
    var scheduler = await GetScheduler(schedulerName).ConfigureAwait(false);
    var jobDetail = new JobDetailImpl(jobName, jobGroup,
        Type.GetType("QuartzApi.Jobs." + jobType), true, false);
    await scheduler.AddJob(jobDetail, replace).ConfigureAwait(false);
}
JavaScript fetch query changes:
Removed the JSON body and added the extra parameters to the URL. Note it's a job without a trigger. At a later stage jobType can be a variable; for now it's included in the fetch string.
function addJob() {
    const addNameTextbox = document.getElementById('add-name').value.trim();

    fetch(`${uri}/DEFAULT/${addNameTextbox}/HelloWorldJob/false/new`, {
        method: 'PUT',
        headers: {
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        }
    })
        .then(response => console.log(response))
        .then(() => {
            getJobs();
            addNameTextbox.value = '';
        })
        .catch(error => console.error('Unable to add job.', error));
}
Running the UI request to add a job now adds it without a trigger (to be worked on in a separate section). To confirm, I then ran an API request in the browser to fetch all jobs for the running scheduler using:
https://localhost:44379/api/schedulers/QuartzScheduler/jobs
resulting in:
[{"name":"HelloWorldJob","group":"DEFAULT"},{"name":"TestJob","group":"DEFAULT"}]
That implies a few things:
Passing a JSON object in the body does not associate it with the API function parameters; I need to add all parameters to the URL string to use them. Maybe there is a way to use body parameters.
Now that the class is correctly referenced in the API, I can keep passing just the class name through the UI, without the namespace and assembly, which keeps it secure since the class is defined in the project at build time.
Adding Console.WriteLine in the API function did not produce any output at runtime.

Get Android app Icon by package name from javascript code

I'm trying to get an application icon from my web JavaScript code using only the package name.
How can I fetch it from the Google Play Store?
Is scraping the only way? Is it dangerous?
Thanks.
If you want to get an application icon from the Google Play app info page, you just need to extract it with the correct selectors, but you need an application ID. To get one, you can search for what you want on the Google Play main page, take the first result (the most relevant to your search), and parse the app ID from it.
I'll show how you can do this in the code below (also check it in the online IDE):
const cheerio = require("cheerio");
const axios = require("axios");

const searchQuery = "asphalt 9"; // what you want to search

const mainOptions = {
  headers: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  }, // adding the User-Agent header as one way to prevent the request from being blocked
  hl: "en", // parameter defines the language to use for the Google search
  gl: "us", // parameter defines the country to use for the Google search
};

function getAppId() {
  const AXIOS_OPTIONS = {
    headers: mainOptions.headers,
    params: {
      q: searchQuery,
      c: "apps", // parameter defines category of the search; "apps" means the Apps & Games category
      hl: mainOptions.hl,
      gl: mainOptions.gl,
    },
  };

  return axios.get(`https://play.google.com/store/search`, AXIOS_OPTIONS).then(function ({ data }) {
    let $ = cheerio.load(data);

    const link = `https://play.google.com${$('[jscontroller="gKWqec"] .ULeU3b .Si6A0c')?.attr("href")}`;
    const appId = link.slice(link.indexOf("?id=") + 4);

    return appId;
  });
}

function getAppInfo(id) {
  const AXIOS_OPTIONS = {
    headers: mainOptions.headers,
    params: {
      id, // parameter defines the ID of a product you want to get the results for
      hl: mainOptions.hl,
      gl: mainOptions.gl,
    },
  };

  return axios.get(`https://play.google.com/store/apps/details`, AXIOS_OPTIONS).then(function ({ data }) {
    let $ = cheerio.load(data);

    return {
      thumbnail: $(".l8YSdd > img")?.attr("srcset")?.slice(0, -3),
    };
  });
}

getAppId().then(getAppInfo).then(console.log);
Output
{
"thumbnail": "https://play-lh.googleusercontent.com/PJo-zZiPokt4vUPri7-md-S-adydt9HPf9yfAcuKift7tYTC1cyrhpxmqFPQbuDRrDU=w240-h480-rw"
}
You can read more about scraping Google Play App info from my blog post Web scraping Google Play App Info with Nodejs.

Log into JavaScript login form requests

I am trying to log into this website that uses a JS based form. Is this even possible with the Python requests library?
payload = {
    '_username': 'xxx@xxx.com',
    '_password': 'xxx',
    '_remember_me': 'false'
}

with requests.Session() as s:
    p = s.post('https://www.lovoo.com/login_check', data=payload)
    r = s.get('https://www.lovoo.com/list/visits')
    print(r.text)
I search r.text afterwards with grep, but I can see I am still not logged in. Why?
You need to do an initial get to set some cookies and add some headers:
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

with requests.Session() as s:
    s.get("https://www.lovoo.com")
    p = s.post('https://www.lovoo.com/login_check', data=payload, headers=head)
    r = s.get('https://www.lovoo.com/list/visits')
If you print p.json() you will see a response like {"referer":"https:\/\/www.lovoo.com\/welcome\/login","success":true,"user":{}} which means you have successfully logged in.
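For example, a quick check on that JSON (reusing p from the snippet above):

result = p.json()
print(result)
if result.get('success'):
    print('Logged in, user:', result.get('user'))
else:
    print('Login failed')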

how to visit page using requests with cookie?

I want to visit zoomeye.org using the requests module; the cookie from Firebug is as follows:
__jsluid=470133a1338c0be13b6fdccf396772c3; csrftoken=WG6eSMS9XaLZfLjICiin8esg1qO3UOFl; Hm_lvt_e58da53564b1ec3fb2539178e6db042e=1448411456; Hm_lpvt_e58da53564b1ec3fb2539178e6db042e=1448505898; __jsl_clearance=1448505830.313|0|EwXSRp%2BrIEF5DR0E5WALlzLMV2Q%3D
The script to read the web page content:
import requests

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en;q=0.5",
    "Connection": "keep-alive",
    "Host": "www.zoomeye.org",
    "Referer": "https://www.zoomeye.org/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:41.0) Gecko/20100101 Firefox/41.0"
}

data = open("cookie.txt", "r").read()

cookieDict = {}
for item in data.split(";"):
    keyValue = item.split("=")
    cookieDict[keyValue[0]] = keyValue[1]

url = "https://www.zoomeye.org/search?q=apache"
r = requests.get(url, cookies=cookieDict, headers=headers)
print r.content
But I fail to read the web page content; the output is as follows:
<script>var dc="";var t_d={hello:"world",t_c:function(x){if(x==="")return;if(x.s
lice(-1)===";"){x=x+" ";};if(x.slice(-2)!=="; "){x=x+"; ";};dc=dc+x;}};(function
(a){eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c
=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){
while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\
w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);
return p;}('b d=[5,4,0,1,2,3];b o=[];b p=0;g(b i=d.c;i--;){o[d[i]]=a[i]}o=o.m(\'
\');g(b i=0;i<o.c;i++){l(o.q(i)===\';\'){s(o,p,i);p=i+1}}s(o,p,o.c);j s(t,r,n){k
.h(t.y(r,n))};w("f.e=f.e.v(/[\\?|&]u-x/, \'\')",z);',36,36,'|||||||||||var|lengt
h||href|location|for|t_c||function|t_d|if|join||||charAt||||captcha|replace|setT
imeout|challenge|substring|1500'.split('|'),0,{}));})(['45 GMT;Path=/;', ' 26-No
v-15 03:52:', '__jsl_clearance=1448506365.', '687|0|rtcCTV', 'xuWxRiE8%2BC0', 'W
WncvYkCpQ%3D;Expires=Thu,']);document.cookie=dc;</script>
Where is the problem? If you know a better solution for this question, please tell me. Thanks.
For some reason the website does not like your user agent. Remove the user agent header and it will work.
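In other words, roughly the same script with the User-Agent line left out; a minimal sketch, with the cookie handling kept as in the question:

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en;q=0.5",
    "Connection": "keep-alive",
    "Host": "www.zoomeye.org",
    "Referer": "https://www.zoomeye.org/",
    # "User-Agent" intentionally omitted
}

url = "https://www.zoomeye.org/search?q=apache"
r = requests.get(url, cookies=cookieDict, headers=headers)
print(r.content[:500])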
