Get audio source link from Website with python - javascript

I am writing a script to fetch audio source links from a website. By crawling the main page I get a list of the available links, but when I crawl the generated links I can't find the source. (It should be inside the href of an <audio> tag.)
Here is my code:
# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
def getHTML(st):
    # fetch the page for the given site URL (use the parameter, not the global)
    with urllib.request.urlopen(st + '/', timeout=100) as response:
        return response.read()
site = 'http://www.e-radio.gr'
soup = BeautifulSoup(getHTML(site), 'html.parser')
# Parse Main Page And get links
lst = list()
for a in soup.body.find_all('a', {'class': 'erplayer'}):
    item = a.get('href')
    if site in item:
        lst.append(item)
    else:
        lst.append(site + item)
print("\n".join(lst))
It seems the page doesn't load fully with urllib.request, so the audio source never appears in the HTML. What can I use instead of urllib.request so that it waits for the full page to load? What I had thought of was to use some external web browser to generate the HTML, but I don't know how to do that.

This is a bit tricky, but we can approach it step by step: first get the player's HTML by following the iframe link, then get the flash player link and follow it, and finally extract the link to the mp3 and download the stream. All of that under the same web-scraping session:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
def download_file(session, link, path):
    r = session.get(link, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)
base_url = "http://www.e-radio.gr"
url = "http://www.e-radio.gr/Rainbow-89-Thessaloniki-i92/live"
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'}
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    frame = soup.find(id="playerControls1")
    frame_url = urljoin(base_url, frame["src"])
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    link = soup.select_one(".onerror a")['href']
    flash_url = urljoin(response.url, link)
    response = session.get(flash_url)
    soup = BeautifulSoup(response.content, "html.parser")
    mp3_link = soup.select_one("param[name=flashvars]")['value'].split("url=", 1)[-1]
    print(mp3_link)
    download_file(session, mp3_link, "download.mp3")
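Note that the extracted link points to a live radio stream, so download_file as written keeps writing until the connection drops. A minimal variation that stops after a fixed amount of data (the 1 MB cap here is an arbitrary assumption) could look like:
def download_sample(session, link, path, max_bytes=1_000_000):
    # stream the audio and stop once max_bytes have been written
    r = session.get(link, stream=True)
    if r.status_code == 200:
        written = 0
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
                written += len(chunk)
                if written >= max_bytes:
                    break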

Related

Need help about sending variable from chrome extension to python

I wanted to make a small script that downloads "maps" automatically when the user opens the in-game link.
Once the link opens in Chrome, the extension gets the current URL and sends it to Python (this is where I'm stuck right now) and then closes the tab if successful (since it will fail if the Python script isn't running?).
Once in Python, I then proceed to download the map in question and add it to the Songs folder, where the only thing the user has to do is press F5.
Right now, I have these pieces of code:
Manifest.json:
{
    "name": "Osu!AltDownload",
    "version": "1.0",
    "description": "A requirement to make osu!AltDownload work",
    "permissions": ["tabs", "http://localhost:5000/"],
    "background": {
        "scripts": ["Osu!AltDownload.js"],
        "persistent": false
    },
    "manifest_version": 2
}
Osu!AltDownload.js
chrome.tabs.onUpdated.addListener(function (tabId, changeInfo, tab) {
    if (changeInfo.status == 'complete') {
        chrome.tabs.query({active: true, currentWindow: true}, tabs => {
            let url = tabs[0].url;
            // send the URL from inside the callback so it is in scope
            var xhr = new XMLHttpRequest();
            xhr.open("POST", "http://localhost:5000/", true);
            xhr.send(url);
        });
    }
})
The script that receives the links and downloads the "maps":
import browser_cookie3
import requests
from bs4 import BeautifulSoup as BS
import re
import os
def maplink(osupath):
    link = link  # obtain link from POST ?
    if link.split("/", 4)[:4] == ['https:', '', 'osu.ppy.sh', 'beatmapsets']:
        Download_map(osupath, link.split("#osu")[0])

def Download_map(osupath, link):
    cj = browser_cookie3.load()
    print("Downloading", link)
    headers = {"referer": link}
    with requests.get(link) as r:
        t = BS(r.text, 'html.parser').title.text.split("·")[0]
    with requests.get(link + "/download", stream=True, cookies=cj, headers=headers) as r:
        if r.status_code == 200:
            try:
                id = re.sub("[^0-9]", "", link)
                with open(os.path.abspath(osupath + "/Songs/" + id + " " + t + ".osz"), "wb") as otp:
                    otp.write(r.content)
            except:
                print("You either aren't connected on osu!'s website or you're limited by the API, in which case you now have to wait 1h and then try again.")
I would like to add that I used these lines of code in my extension:
var xhr = new XMLHttpRequest();
xhr.open("POST", "http://localhost:5000/",true);
xhr.send(url);
They come from one of my Google searches, but I don't really understand how to handle the POST request in Python with that, and I don't even know if I'm going the right way.
Some may say I haven't done much research on this subject, but out of about 50 Chrome tabs I haven't really found anything that gives me an idea of the right approach.
You have to run a web server to receive the HTTP requests.
You can use Flask for this.
from flask import Flask, request

app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        #print(request.form)
        print(request.data)
    return "OK"

if __name__ == '__main__':
    app.run(port=5000)
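To wire this up to the downloader from the script above, a minimal sketch could look like the following; it assumes maplink is adjusted to accept the link as a second parameter, and OSU_PATH is a placeholder you point at your osu! folder:
from flask import Flask, request

app = Flask(__name__)
OSU_PATH = r"C:\path\to\osu"  # placeholder - set this to your osu! folder

@app.route('/', methods=['POST'])
def index():
    link = request.data.decode('utf-8')  # the URL sent by xhr.send(url)
    maplink(OSU_PATH, link)              # assumes maplink(osupath, link) from the script above
    return "OK"

if __name__ == '__main__':
    app.run(port=5000)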
If you want to send only the URL, then you can even use GET instead of POST and send it as
http://localhost:5000/?data=your_url
where your_url is what you get from tabs[0].url.
xhr.open("GET", "http://localhost:5000/?data=" + url, true);
xhr.send(); // or maybe xhr.send(null);
And then you can get it with
from flask import Flask, request

app = Flask(__name__)

@app.route('/')
def index():
    print(request.args.get('data'))
    return "OK"

if __name__ == '__main__':
    app.run(port=5000)
EDIT:
Example which tests Flask directly from JavaScript when you visit http://localhost:5000/test
from flask import Flask, request

app = Flask(__name__)

@app.route('/')
def index():
    print(request.args.get('data'))
    return "OK"

@app.route('/test/')
def test():
    return """
<script>
var url = "https://stackoverflow.com/questions/65867136/need-help-about-sending-variable-from-chrome-extension-to-python/";
var xhr = new XMLHttpRequest();
xhr.open("GET", "http://localhost:5000/?data=" + url, true);
xhr.send();
</script>
"""

if __name__ == '__main__':
    app.run(port=5000)
Eventually I can test it with a bookmarklet
javascript:{window.location='http://localhost:5000/?data='+encodeURIComponent(window.location.href)}
which I put as the URL of a bookmark in my favorites, but this reloads the page;
or using the modern fetch() (instead of the old XMLHttpRequest()), which doesn't reload the page:
javascript:{fetch('http://localhost:5000/?data='+encodeURIComponent(window.location.href))}

How to scrape a dynamic JavaScript variable value using BeautifulSoup and Requests

I am scraping a login page, and I only need the var salt= variable inside a JavaScript tag.
This is the website: https://ib.muamalatbank.com/ib-app/loginpage
After reading all the answers here and using BeautifulSoup and requests, I can get these 2 variables (maybe because they are static):
var muserid='User ID must be filled';
var mpassword= 'Password must be filled';
But when I try to scrape var salt=, it gives me the value of every var.
(Screenshot: my result in Python)
I just need the var salt value only, with no quotation marks.
(Screenshot: the page source showing the var salt value)
I have already tried re.search, re.compile, and re.findall, but I am a newbie and keep getting an error like "Object cannot string....".
from bs4 import BeautifulSoup as bs
import requests
import re
import lxml
import json
URL = 'https://ib.muamalatbank.com/ib-app/loginpage'
REF = 'https://ib.muamalatbank.com'
HEADERS = {'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0', 'origin': URL, 'referer': REF}
s = requests.session()
soup = bs(s.get(URL, headers=HEADERS, timeout=5, verify=False).text,"html.parser")
script = soup.find_all("script")[11]
ambilteks = soup.find_all(text=re.compile("salt=(.*?)"))
print(ambilteks)
Note: 1) I need help, but I am not interested in using Selenium.
2) I have a fully working script in PHP-Laravel (I need it in Python), but I have no knowledge of Laravel; if anyone asks, I will share the Laravel code.
Please help me, thank you very much.
Try using re.compile and include the quotation marks in your regex, then extract the first result. Not tested against the page response; first verify the string is actually present in the response.
p = re.compile(r"var salt='(.*?)'")
res = p.findall(s.get(URL, headers=HEADERS, timeout=5, verify=False).text)[0]
print(res)
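For the "verify first" step, here is a small sketch; it assumes the salt really is emitted as a quoted literal in the initial HTML (if it is generated by JavaScript at runtime, requests alone will not see it):
html = s.get(URL, headers=HEADERS, timeout=5, verify=False).text
if "var salt" in html:
    m = re.search(r"var salt\s*=\s*'(.*?)'", html)
    if m:
        print(m.group(1))  # the salt value without quotation marks
else:
    print("var salt not found in the raw HTML")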

Page won't load after .click() even when it displays in the browser

I'm trying to scrape this page "https://www.seloger.com/list_beta.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&cp=75"
But when I use .click() on the first element of the search results, the page loads correctly in the browser, yet I can't get the body and all its children with the driver.find_element method, whereas navigating directly to the URL of the newly loaded page with driver.get lets me find them without problem.
driver.current_url gives me the first page; it's as if .click() didn't load anything, even though the page renders successfully in the browser.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
import time

def cssconvert(tag):
    # turn a space-separated class list into a CSS selector
    return '.' + tag.replace(' ', '.')
binary = r'C:\Program Files\Mozilla Firefox\firefox.exe'
options = Options()
options.set_headless(headless=False)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True #optional
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:\\Users\\chrys\\Desktop\\DataScientist\\Selenium\\geckodriver_true\\geckodriver.exe")
driver.get("https://www.seloger.com/list_beta.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&cp=75")
#time.sleep(2)
select = 'block__ShadowedBlock-sc-10w6hsj-0 ListContent__SmartClassifiedExtended-sc-1viyr2k-2 iddbNe classified__ClassifiedContainer-sc-1wmlctl-0 haLWMI Card__CardContainer-sc-7insep-7 jZkbME'
driver.find_element_by_css_selector(cssconvert(select)).click()
driver.find_element_by_id('js-descriptifBien')
241 raise exception_class(message, screen, stacktrace, alert_text)
--> 242 raise exception_class(message, screen, stacktrace)
243
244 def _value_or_default(self, obj, key, default):
NoSuchElementException: Message: Unable to locate element: [id="js-descriptifBien"]
If I now copy-paste the URL:
driver.get('https://www.seloger.com/annonces/viagers/appartement/paris-11eme-75/belleville-saint-maur/145504325.htm?projects=2,5&types=2,1&natures=1,2,4&places=[{cp:75}]&qsVersion=1.0&bd=ListToDetail')
driver.find_element_by_id('js-descriptifBien').text
it works.
As my purpose is to crawl all the elements from the search results, I would like to know how to deal with this.
You can read about selectors here and here.
The problem is that when you click a list item, it opens in a new tab. You have to switch to it before performing any action. Here is an example of how to navigate between windows:
current_window = driver.current_window_handle
handles_before = driver.window_handles
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".iddbNe"))).click()
# new_window_is_opened needs the list of handles taken before the click
wait.until(EC.new_window_is_opened(handles_before))
driver.switch_to.window(driver.window_handles[1])
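Once you are done with the detail page, you can close that tab and switch back before clicking the next result. A sketch of that, assuming wait is a WebDriverWait as in the fuller example below:
description = wait.until(EC.presence_of_element_located((By.ID, "js-descriptifBien"))).text
driver.close()                            # close the detail tab
driver.switch_to.window(current_window)   # back to the results list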
With Selenium, another solution is to collect all the links from the list first and navigate to them afterwards. For that you can use the .iddbNe a[name=classified-link] CSS selector. After collecting all links from all required pages, you can navigate to them and collect the data.
Here is an example of how to do it for the first page:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# ...
wait = WebDriverWait(driver, 20)
# you can put a loop here to go through all the pages you need and add to the list
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".iddbNe a[name=classified-link]")))
# grab the hrefs first, since the elements go stale once you navigate away
links = [element.get_attribute("href") for element in elements]
for link in links:
    driver.get(link)
    # get your data
A better solution for scraping is to use requests. The code below is a simple example of how you can get 100 results in JSON format. In the response you can find the total number of results and use it to loop and collect all the information you need:
import requests
headers = {
'sec-fetch-mode': 'cors',
'origin': 'https://www.seloger.com',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'ru,en-US;q=0.9,en;q=0.8,tr;q=0.7',
'pragma': 'no-cache',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/77.0.3865.90 Safari/537.36',
'content-type': 'application/json',
'accept': 'application/json',
'cache-control': 'no-cache',
'authority': 'www.seloger.com',
'referer': 'https://www.seloger.com/list_beta.htm?projects=2%2C5&types=2%2C1&natures=1%2C2%2C4&places=%5B%7Bcp'
'%3A75%7D%5D&qsVersion=1.0&LISTING-LISTpg=2',
'sec-fetch-site': 'same-origin',
'dnt': '1',
}
params = (
('from', '0'),
('size', '100'),
('isSeo', 'false'),
)
data = '{"idPublication":null,"projects":[2,5],"types":[2,1],"natures":[1,2,4],"places":[{"label":"Paris",' \
'"cities":null,"districts":null,"countries":null,"divisions":null,"subDivisions":["75"],"fakeCities":null}],' \
'"searchAreas":null,"isochronePoints":null,"proximities":null,"withGeoloc":null,"price":null,' \
'"groundSurface":null,"surface":null,"bedrooms":[],"rooms":[],"bedroom":null,"room":null,"sort":null,' \
'"floor":null,"lastFloor":null,"hearth":null,"guardian":null,"view":null,"balcony":null,"pool":null,' \
'"lift":null,"terrace":null,"cellar":null,"south":null,"parking":null,"box":null,"parquet":null,"locker":null,' \
'"furnished":null,"disabledAccess":null,"alarm":null,"toilet":null,"bathtub":null,"shower":null,"hall":null,' \
'"livingRoom":null,"diningRoom":null,"kitchen":null,"heating":null,"unobscured":null,"picture":null,' \
'"exclusiveness":null,"priceChange":null,"privateSeller":null,"video":null,"vv":null,"enterprise":null,' \
'"garden":null,"basement":null,"groundFloor":null,"houseboat":null} '
response = requests.post('https://www.seloger.com/list/api/externaldata', headers=headers, params=params, data=data)
print(response.text)
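A sketch of the pagination loop follows; the key names for the total count and the result list in the JSON are assumptions, so inspect response.json() first and adjust them:
payload = response.json()
total = payload.get('total', 0)        # assumption: check the real key name in the response
results = payload.get('items', [])     # assumption: check the real key name in the response
for offset in range(100, total, 100):
    page = requests.post('https://www.seloger.com/list/api/externaldata',
                         headers=headers,
                         params={'from': str(offset), 'size': '100', 'isSeo': 'false'},
                         data=data)
    results.extend(page.json().get('items', []))
print(len(results), "listings collected")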

How do I grab the table content from a webpage with javascript using python?

I'd like to grab the table content from this page. The following is my code, and I got NaN (without the data). How come the numbers are not showing up? How do I grab the table with the corresponding data? Thanks.
You can get a nice json format from the api:
import requests
import pandas as pd
url = 'https://api.blockchain.info/stats'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
params = {'cors': 'true'}
data = requests.get(url, headers=headers, params=params).json()
# if you want it as a table
df = pd.DataFrame(data.items())
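For readability you might also name the columns (the names here are just my choice, not anything the API defines):
df = pd.DataFrame(data.items(), columns=['stat', 'value'])
print(df.head())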
Option 2:
Let the page fully render. There is a better way to wait with Selenium, but I just quickly threw a 5-second wait in there to show:
from selenium import webdriver
import pandas as pd
import time
url = 'https://www.blockchain.com/stats'
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
browser.get(url)
time.sleep(5)
dfs = pd.read_html(browser.page_source)
print(dfs[0])
browser.close()
Output:
0 1 2 3
0 Blocks Mined 150 150 NaN
1 Time Between Blocks 9.05 minutes 9.05 minutes NaN
2 Bitcoins Mined 1,875.00000000 BTC 1,875.00000000 BTC NaN
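The "better way to wait" mentioned above is an explicit wait. A sketch using WebDriverWait instead of the fixed sleep; the plain table tag used as the condition is an assumption about the page:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

url = 'https://www.blockchain.com/stats'
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
browser.get(url)
# wait until a table is actually present instead of sleeping a fixed 5 seconds
WebDriverWait(browser, 15).until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
dfs = pd.read_html(browser.page_source)
print(dfs[0])
browser.close()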

Python-Requests, how to dynamically receive the url?

I have looked all over Stack Overflow but I have not found an answer to this question.
How can a python script dynamically receive a url based on a javascript function call?
For example, in this Stack Overflow question (Code reproduced below) how could I dynamically receive the url (which is hardcoded in this case) if the name of the python file was abc.py and I called
xhttp = new XMLHttpRequest(); and then
xhttp.open("GET", "abc.py?token=123", true); in some html file with javascript?
from urllib.parse import urlparse
from urllib.parse import parse_qs
from urllib.parse import urlencode
url = 'http://example.com?param1=a&token=TOKEN_TO_REPLACE&param2=c'
o = urlparse(url)
query = parse_qs(o.query)
if query.get('token'):
query['token'] = ['NEW_TOKEN', ]
new_query = urlencode(query, doseq=True)
url.split('?')[0] + '?' + new_query
>>> http://example.com?param2=c&param1=a&token=NEW_TOKEN
You need to use a framework to do this; a Python script alone, with no networking/socket functionality, cannot communicate with the front end (the JS side). Below is a simple backend that receives your JavaScript request using bottle.py.
Here is a simple implementation that receives a POST request from the client and does the logic you need; the updated URL is returned to the calling code.
Note that the request is a POST and the data is JSON containing the token and the url.
from bottle import post, run, request
import json
from urllib.parse import urlparse
from urllib.parse import parse_qs
from urllib.parse import urlencode

def replace_token(data):
    url = data['url']
    token = data['token']
    o = urlparse(url)
    query = parse_qs(o.query)
    if query.get('token'):
        query['token'] = [token]
    new_query = urlencode(query, doseq=True)
    return url.split('?')[0] + '?' + new_query

@post('/token')
def index():
    data = json.load(request.body)
    return replace_token(data)

run(host='localhost', port=8080, debug=True)
Then you can test it simply by using curl:
curl -X POST http://localhost:8080/token -d '{"token":"NEW_TOKEN", "url":"http://example.com?param1=a&token=TOKEN_TO_REPLACE&param2=c"}'
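Or, the equivalent test from Python with requests (a sketch mirroring the curl call above):
import requests

payload = {"token": "NEW_TOKEN",
           "url": "http://example.com?param1=a&token=TOKEN_TO_REPLACE&param2=c"}
r = requests.post("http://localhost:8080/token", json=payload)
print(r.text)  # the url with the token replaced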
