How to open a password-protected image URL with Python? [duplicate]

I'm trying to do an HTTPS GET with basic authentication using Python. I'm very new to Python, and the guides seem to use different libraries to do things (http.client, httplib and urllib). Can anyone show me how it's done, and how to tell which standard library to use?

In Python 3 the following will work. I am using the lower-level http.client from the standard library. Also check out section 2 of RFC 2617 for details of basic authorization. This code won't check that the certificate is valid, but it will set up an HTTPS connection. See the http.client docs on how to do that.
from http.client import HTTPSConnection
from base64 import b64encode

# Authorization token: we need to base64-encode it
# and then decode it to ASCII, as Python 3 stores it as a byte string
def basic_auth(username, password):
    token = b64encode(f"{username}:{password}".encode('utf-8')).decode("ascii")
    return f'Basic {token}'

username = "user_name"
password = "password"

# This sets up the HTTPS connection
c = HTTPSConnection("www.google.com")
# then connect
headers = { 'Authorization' : basic_auth(username, password) }
c.request('GET', '/', headers=headers)
# get the response back
res = c.getresponse()
# at this point you could check the status etc
# this gets the page text
data = res.read()
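To also verify the server certificate, as the http.client docs describe, you can pass in an SSL context; a minimal sketch, assuming Python 3.4+ where ssl.create_default_context is available:
import ssl
from http.client import HTTPSConnection

# create_default_context() enables certificate and hostname verification
context = ssl.create_default_context()
c = HTTPSConnection("www.google.com", context=context)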

Use the power of Python and lean on one of the best libraries around: requests
import requests
r = requests.get('https://my.website.com/rest/path', auth=('myusername', 'mybasicpass'))
print(r.text)
The variable r (a requests Response) has many more attributes you can use. The best thing is to pop into the interactive interpreter and play around with it, and/or read the requests docs.
ubuntu#hostname:/home/ubuntu$ python3
Python 3.4.3 (default, Oct 14 2015, 20:28:29)
[GCC 4.8.4] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import requests
>>> r = requests.get('https://my.website.com/rest/path', auth=('myusername', 'mybasicpass'))
>>> dir(r)
['__attrs__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_content', '_content_consumed', 'apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'iter_content', 'iter_lines', 'json', 'links', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'status_code', 'text', 'url']
>>> r.content
b'{"battery_status":0,"margin_status":0,"timestamp_status":null,"req_status":0}'
>>> r.text
'{"battery_status":0,"margin_status":0,"timestamp_status":null,"req_status":0}'
>>> r.status_code
200
>>> r.headers
CaseInsensitiveDict({'x-powered-by': 'Express', 'content-length': '77', 'date': 'Fri, 20 May 2016 02:06:18 GMT', 'server': 'nginx/1.6.3', 'connection': 'keep-alive', 'content-type': 'application/json; charset=utf-8'})

Update: the OP uses Python 3, so here is an example using httplib2:
import httplib2
h = httplib2.Http(".cache")
h.add_credentials('name', 'password') # Basic authentication
resp, content = h.request("https://host/path/to/resource", "POST", body="foobar")
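Since the question asks about a GET, the same handle works for that too; a quick sketch:
resp, content = h.request("https://host/path/to/resource", "GET")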
The below works for Python 2.6:
I use pycurl a lot in production for a process which does upwards of 10 million requests per day.
You'll need to import the following first.
import pycurl
import cStringIO
import base64
Part of the basic authentication header consists of the username and password encoded as Base64.
headers = { 'Authorization' : 'Basic %s' % base64.b64encode("username:password") }
In the HTTP header you will see this line Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=. The encoded string changes depending on your username and password.
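For example, in a Python 2 shell (the encoded value is just the Base64 of the literal string username:password):
>>> import base64
>>> base64.b64encode("username:password")
'dXNlcm5hbWU6cGFzc3dvcmQ='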
We now need a place to write our HTTP response to and a curl connection handle.
response = cStringIO.StringIO()
conn = pycurl.Curl()
We can set various curl options. For a complete list of options, see this. The linked documentation is for the libcurl API, but the options do not change for other language bindings.
conn.setopt(pycurl.VERBOSE, 1)
conn.setopt(pycurl.HTTPHEADER, ["%s: %s" % t for t in headers.items()])
conn.setopt(pycurl.URL, "https://host/path/to/resource")
conn.setopt(pycurl.POST, 1)
If you do not need to verify the certificate, you can disable verification. Warning: this is insecure; it is similar to running curl -k or curl --insecure.
conn.setopt(pycurl.SSL_VERIFYPEER, False)
conn.setopt(pycurl.SSL_VERIFYHOST, False)
Register response.write as the write callback so the HTTP response is stored in our cStringIO buffer.
conn.setopt(pycurl.WRITEFUNCTION, response.write)
If you're making a POST request, set the request body:
post_body = "foobar"
conn.setopt(pycurl.POSTFIELDS, post_body)
Make the actual request now.
conn.perform()
Do something based on the HTTP response code.
http_code = conn.getinfo(pycurl.HTTP_CODE)
if http_code == 200:
    print response.getvalue()

A correct way to do basic auth in Python 3 with urllib.request, including certificate validation, follows.
Note that certifi is not mandatory. You can use your OS's CA bundle (likely *nix only) or distribute Mozilla's CA bundle yourself. Or, if the hosts you communicate with are just a few, concatenate a CA file yourself from those hosts' CAs, which can reduce the risk of a MitM attack caused by another corrupt CA.
#!/usr/bin/env python3
import urllib.request
import ssl
import certifi
context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
context.verify_mode = ssl.CERT_REQUIRED
context.load_verify_locations(certifi.where())
httpsHandler = urllib.request.HTTPSHandler(context = context)
manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
manager.add_password(None, 'https://domain.com/', 'username', 'password')
authHandler = urllib.request.HTTPBasicAuthHandler(manager)
opener = urllib.request.build_opener(httpsHandler, authHandler)
# Used globally for all urllib.request requests.
# If it doesn't fit your design, use opener directly.
urllib.request.install_opener(opener)
response = urllib.request.urlopen('https://domain.com/some/path')
print(response.read())
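If you go the self-assembled bundle route described above, point the context at your own file instead of certifi; a sketch (the file name is hypothetical):
context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
context.verify_mode = ssl.CERT_REQUIRED
context.load_verify_locations('my-ca-bundle.pem')  # CAs you concatenated yourself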

Based on @AndrewCox's answer, with some minor improvements:
from http.client import HTTPSConnection
from base64 import b64encode
client = HTTPSConnection("www.google.com")
user = "user_name"
password = "password"
headers = {
    "Authorization": "Basic {}".format(
        b64encode(bytes(f"{user}:{password}", "utf-8")).decode("ascii")
    )
}
client.request('GET', '/', headers=headers)
res = client.getresponse()
data = res.read()
Note that you should set the encoding explicitly if you use the bytes function instead of a b"" literal.
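For instance, these two forms produce the same value; only the bytes call needs the encoding argument:
>>> bytes("user:pass", "utf-8")
b'user:pass'
>>> b"user:pass"
b'user:pass'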

requests.get(url, auth=requests.auth.HTTPBasicAuth(username=token, password=''))
When authenticating with a token, the password should be ''.
It works for me.

Using only standard modules and no manual header encoding, which seems to be the intended and most portable way.
The concept of Python's urllib is to group the numerous attributes of the request into various managers/directors/contexts, which then process their parts:
import urllib.request, ssl
# to avoid verifying ssl certificates
httpsHa = urllib.request.HTTPSHandler(context= ssl._create_unverified_context())
# setting up realm+urls+user-password auth
# (top_level_url may be sequence, also the complete url, realm None is default)
top_level_url = 'https://ip:port_or_domain'
# of the std managers, this can send user+passwd in one go,
# not after HTTP req->401 sequence
password_mgr = urllib.request.HTTPPasswordMgrWithPriorAuth()
password_mgr.add_password(None, top_level_url, "user", "password", is_authenticated=True)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# create OpenerDirector
opener = urllib.request.build_opener(handler, httpsHa)
url = top_level_url + '/some_url?some_query...'
response = opener.open(url)
print(response.read())

GET & POST requests are usually used to submit forms. Here is a brief example of their usage.
views.py
def index(request):
    col1 = float(request.GET.get('col1'))
    # ... do something with col1 and return a response
index.html
<div class="form-group col-md-2">
    <label for="col1">Price</label>
    <input type="number" class="form-control" id="col1" name="col1">
</div>

Related

How to get Python to scrape web page generated by JavaScript files

I have a website I want to automate some actions on but the page is generated by 2 JavaScript files and is defined like this in the html:
<script src="/build/runtime.js"></script><script src="/build/app.js"></script>
runtime.js is about 70 lines and app.js is about 40k lines... I have no idea how to read the code as I don't know any JavaScript, and my Python knowledge is a mere atom more ;)
I'd share the particular site but the page is behind a login. So I've managed to get to the page using 2 different methods but can't find a way to press buttons within this next page generated by the JS.
Method 1 - Requests & BeautifulSoup. I got stuck on the JS bit, so I switched to Method 2.
import requests
from bs4 import BeautifulSoup
# Site & creds
LOGIN_URL = 'https://website.com/login'
USERNAME = 'user'
PASSWORD = 'pass'
# Pretend to be browser
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Start session
session = requests.session()
# Get login page
response = session.get(LOGIN_URL, headers=headers, verify=False)
# Get csrf token
soup = BeautifulSoup(response.content, 'html.parser')
csrf_token = (soup.find(id="login_form__token")["value"])
# Set creds with csfr token
payload = {
    'login_form[username]': USERNAME,
    'login_form[password]': PASSWORD,
    'login_form[login]': '',
    'login_form[_token]': csrf_token
}
# Login & do something else with cookies I don't understand
response = session.post(LOGIN_URL, data=payload, verify=False)
response = session.get('https://website.com/pageIWant', verify=False)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify())
Method 2 - Selenium & ChromeDriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
driver = webdriver.Chrome(options=options, executable_path='chromedriver.exe')
driver.get("https://website.com/login")
driver.find_element_by_id("login_form_username").send_keys('user')
driver.find_element_by_id("login_form_password").send_keys('pass')
driver.find_element_by_id("login_form_login").click()
driver.get("https://website.com/pageIWant")
html = driver.page_source
print(html)
So I thought Method 2 would make things easier, but I'm pretty much stuck at the same point. The generated page that I want contains buttons I need to press in order to access downstream pages. I've read a lot about accessing elements but can't see anything within this 40k lines' worth of JS gibberish. Where is a good place to start?
"Where is a good place to start?"
Regardless of how the page is generated (HTML or JS), ultimately what you have to address in Selenium is the page's live DOM. So "where to start" is inspecting the page's DOM in the browser dev tools and, from the DOM, figuring out how to find the button elements in Selenium.
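As a sketch (the selector is hypothetical; use whatever the DOM inspection reveals), an explicit wait is usually more reliable than calling find_element_* right after a page load:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
# wait until the button rendered by the JS is clickable, then click it
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#download")))
button.click()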

Can't remove CORS error even so I included all headers

I am writing a simple website with a JS client and a Python server side. I did everything to remove the CORS error but nothing works. I wrote all the needed headers but still get this error. The website should send a request to the server and get an answer.
Error:
Access to XMLHttpRequest at 'http://127.0.0.1:8000/' from origin 'null' has been blocked by CORS policy: No 'Access-Control-Allow-Origin' header is present on the requested resource.
myFile.html:
<!DOCTYPE html>
<html>
<head>
<title>requestJs</title>
</head>
<body>
<button class="myButton">SEND</button>
<script type="text/javascript">
let theButton = document.querySelector(".myButton");
theButton.addEventListener('click', function() {
    const xhr = new XMLHttpRequest();
    xhr.onload = function() {
        alert(`Status: ${xhr.status}; Result: ${xhr.response}`)
    };
    xhr.onerror = function() {
        alert('Request error');
    };
    xhr.open("GET", "http://127.0.0.1:8000/", true);
    xhr.send(2);
})
</script>
</body>
</html>
Server side:
import os
import http.server as httpserver

class CORSHTTPRequestHandler(httpserver.SimpleHTTPRequestHandler):
    def send_head(self):
        """Common code for GET and HEAD commands.
        This sends the response code and MIME headers.
        Return value is either a file object (which has to be copied
        to the outputfile by the caller unless the command was HEAD,
        and must be closed by the caller under all circumstances), or
        None, in which case the caller has nothing further to do.
        """
        path = self.translate_path(self.path)
        f = None
        if os.path.isdir(path):
            if not self.path.endswith('/'):
                # redirect browser - doing basically what apache does
                self.send_response(301)
                self.send_header("Location", self.path + "/")
                self.end_headers()
                return None
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                return self.list_directory(path)
        ctype = self.guess_type(path)
        try:
            # Always read in binary mode. Opening files in text mode may cause
            # newline translations, making the actual size of the content
            # transmitted *less* than the content-length!
            f = open(path, 'rb')
        except IOError:
            self.send_error(404, "File not found")
            return None
        self.send_response(200)
        self.send_header("Content-type", ctype)
        fs = os.fstat(f.fileno())
        self.send_header("Content-Length", str(fs[6]))
        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        # header names must not include a trailing colon
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET,HEAD,PUT,PATCH,POST,DELETE")
        self.send_header("Access-Control-Allow-Headers", "Content-Type, Access-Control-Allow-Origin, xxx")
        self.end_headers()
        return f

if __name__ == "__main__":
    import socketserver
    import sys

    PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 8000
    handler = CORSHTTPRequestHandler
    httpd = socketserver.TCPServer(("", PORT), handler)
    print(f"serving at port {PORT}")
    httpd.serve_forever()
Help me please, what is my problem?
This is not a comprehensive answer but it might help.
CORS is entirely a browser feature. You can turn it off in your browser. I suggest the first step therefore is to launch a CORS-free browser to test your app. Make sure not to open your banking page in this browser session though, it isn't safe!
google-chrome --user-data-dir=/var/tmp/Chrome --disable-web-security
If everything works, then the issue is just CORS.
If you are only running in a dev environment, you can even just do this every time.
If you are running in production, the easiest option is often just to use a gateway that fixes this stuff for you. That's how I got mine working.
If the above isn't good enough and you want to debug, remember that non-simple browser CORS requests are preceded by a preflight OPTIONS request. Sometimes that's where the problem comes in. Make sure your server is able to handle and respond to OPTIONS, and check that it is responding correctly.
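A minimal sketch of such a handler for the SimpleHTTPRequestHandler subclass above:
class CORSHTTPRequestHandler(httpserver.SimpleHTTPRequestHandler):
    def do_OPTIONS(self):
        # answer the preflight with the CORS headers and an empty body
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET,HEAD,PUT,PATCH,POST,DELETE")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()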

Headless javascript download with selenium

I'm trying to download files from http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html in a headless context. I have an account (they are free), but the site really doesn't make it easy; apparently it uses a chain of JavaScript forms/redirections. With Firefox I can use the element inspector, copy the URL of the file as cURL when the download starts, and use it in a headless machine to download the file, but so far all my attempts to get the file only in the headless machine have failed.
I've managed to get the login with:
#!/usr/bin/env python3
username="<my username>"
password="<my password>"
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
caps = DesiredCapabilities.PHANTOMJS
caps["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"
driver = webdriver.PhantomJS("/usr/local/bin/phantomjs")
driver.set_window_size(1120, 550)
driver.get("http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html")
print("loaded")
driver.find_element_by_name("agreement").click()
print("clicked agreement")
driver.find_element_by_partial_link_text("RPM installer").click()
print("clicked link")
driver.find_element_by_id("sso_username").send_keys(username)
driver.find_element_by_id("ssopassword").send_keys(password)
driver.find_element_by_xpath("//input[contains(@title,'Please click here to sign in')]").click()
print("submitted")
print(driver.get_cookies())
print(driver.current_url)
print(driver.page_source)
driver.quit()
I suspect the login worked, because in the cookies I see some data associated with my username, but in Firefox submitting the form results in a download starting after 3-4 redirections, while here I get nothing and the page_source and current_url still belong to the login page.
Maybe the site is actively blocking this kind of use, or maybe I'm doing something wrong. Any idea how to actually download the file?
Thanks to TheChetan's comment I got it working. I didn't use the javascript-blob route though, but the requests approach suggested by Tarun Lalwani in https://stackoverflow.com/a/46027215. It took me a while to realize I had to modify the user agent in the request too. Finally this works for me:
#!/usr/bin/env python3
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from requests import Session
from urllib.parse import urlparse
from os.path import basename
from hashlib import sha256
import sys
index_url = "http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html"
link_text = "RPM installer"
username="<my username>"
password="<my password>"
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"
# set up browser
caps = DesiredCapabilities.PHANTOMJS
caps["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS("/usr/local/bin/phantomjs")
driver.set_window_size(800,600)
# load index page and click through
driver.get(index_url)
print("loaded")
driver.find_element_by_name("agreement").click()
print("clicked agreement")
link = driver.find_element_by_partial_link_text(link_text)
sha = driver.find_element_by_xpath("//*[contains(text(), '{0}')]/following::*[contains(text(), 'sum:')]/following-sibling::*".format(link_text)).text
file_url = link.get_attribute("href")
filename = basename(urlparse(file_url).path)
print("filename: {0}".format(filename))
print("checksum: {0}".format(sha))
link.click()
print("clicked link")
driver.find_element_by_id("sso_username").send_keys(username)
driver.find_element_by_id("ssopassword").send_keys(password)
driver.find_element_by_xpath("//input[contains(@title,'Please click here to sign in')]").click()
print("submitted")
# we should be logged in now
def progressBar(title, value, endvalue, bar_length=60):
    percent = float(value) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    sys.stdout.write("\r{0}: [{1}] {2}%".format(title, arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()
# transfer the cookies to a new session and request the file
session = Session()
session.headers = {"user-agent": user_agent}
for cookie in driver.get_cookies():
    session.cookies.set(cookie["name"], cookie["value"])
driver.quit()
r = session.get(file_url, stream=True)
# now we should have gotten the url with param
new_url = r.url
print("final url {0}".format(new_url))
r = session.get(new_url, stream=True)
print("requested")
length = int(r.headers['Content-Length'])
title = "Downloading ({0})".format(length)
sha_file = sha256()
chunk_size = 2048
done = 0
with open(filename, "wb") as f:
    for chunk in r.iter_content(chunk_size):
        f.write(chunk)
        sha_file.update(chunk)
        done = done + len(chunk)
        progressBar(title, done, length)
print()
# check integrity
if (sha_file.hexdigest() == sha):
    print("checksums match")
    sys.exit(0)
else:
    print("checksums do NOT match!")
    sys.exit(1)
So in the end, the idea is to use selenium+phantomjs for logging in, and then reuse the cookies for a plain requests session.

JavaScript/Node + cURL

I have a line of cURL:
curl -X POST --form "file=@Calvin Harris - Thinking About You (Tez Cadey Remix)_165299184_soundcloud.mp3" https://api.idolondemand.com/1/api/async/recognizespeech/v1
I'm building a hybrid mobile app with Meteor/Ionic as the framework. Therefore, I have access to any Node library that leverages cURL.
Can anyone:
1) Suggest one of the many node-curl libraries
2) Show me how to properly output the above cURL line in the context of the right library?
My primary issue, the thing that is stopping me, is the --form flag. I've pored over several libraries/docs and none explicitly reference how to use this form flag. I cannot drop this flag; it's a requirement of the API.
You could just use node's fs and https APIs:
var fs = require('fs');
var https = require('https');

var rs = fs.createReadStream(
    'Calvin Harris - Thinking About You (Tez Cadey Remix)_165299184_soundcloud.mp3'
);
var req = https.request({
    hostname: 'api.idolondemand.com',
    path: '/1/api/async/recognizespeech/v1',
    method: 'POST'
}, function(res) {
    // do something when you get a response
});
rs.pipe(req);
(Note that curl's --form sends multipart/form-data; piping the file like this sends a raw body, so this only works if the API accepts that too.)
Or use the request module, as @ffk mentioned.

Update and render a value from Flask periodically

I want to display my CPU usage dynamically. I don't want to reload the page to see a new value. I know how to get the CPU usage in Python. Right now I render a template with the value. How can I continually update a page with a value from Flask?
@app.route('/show_cpu')
def show_cpu():
    cpu = getCpuLoad()
    return render_template('show_cpu.html', cpu=cpu)
Using an Ajax request
Python
@app.route('/_stuff', methods=['GET'])
def stuff():
    cpu = round(getCpuLoad())
    ram = round(getVmem())
    disk = round(getDisk())
    return jsonify(cpu=cpu, ram=ram, disk=disk)
Javascript
function update_values() {
    $SCRIPT_ROOT = {{ request.script_root|tojson|safe }};
    $.getJSON($SCRIPT_ROOT + "/_stuff",
        function(data) {
            $("#cpuload").text(data.cpu + " %");
            $("#ram").text(data.ram + " %");
            $("#disk").text(data.disk + " %");
        });
}
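The helpers getCpuLoad, getVmem and getDisk are not shown in the question; a possible implementation, assuming the psutil package (the original may differ), looks like this:
import psutil

def getCpuLoad():
    # percent CPU utilization since the previous call
    return psutil.cpu_percent(interval=None)

def getVmem():
    return psutil.virtual_memory().percent

def getDisk():
    return psutil.disk_usage('/').percent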
Using Websockets
project/app/views/request/websockets.py
# -*- coding: utf-8 -*-
# OS Imports
import json
# Local Imports
from app import sockets
from app.functions import get_cpu_load, get_disk_usage, get_vmem
@sockets.route('/_socket_system')
def socket_system(ws):
    """
    Returns the system information in JSON format:
    CPU, RAM, and disk usage
    """
    while True:
        message = ws.receive()
        if message == "update":
            cpu = round(get_cpu_load())
            ram = round(get_vmem())
            disk = round(get_disk_usage())
            ws.send(json.dumps(dict(received=message, cpu=cpu, ram=ram, disk=disk)))
        else:
            ws.send(json.dumps(dict(received=message)))
project/app/__init__.py
# -*- coding: utf-8 -*-
from flask import Flask
from flask_sockets import Sockets
app = Flask(__name__)
sockets = Sockets(app)
app.config.from_object('config')
from app import views
Using Flask-Sockets made my life a lot easier. Here is the launcher:
launchwithsockets.sh
#!/bin/sh
gunicorn -k flask_sockets.worker app:app
Finally, here is the client code :
custom.js
The code is a bit too long, so here it is.
Note that I'm NOT using things like socket.io; that's why the code is long. This code also tries to reconnect to the server periodically, and can stop trying to reconnect on a user action. I use the Messenger lib to notify the user that something went wrong. Of course it's a bit more complicated than using socket.io, but I really enjoyed coding the client side.
