I want to get the webpage using pyqt5.
The url is https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html.
The webpage will generate two values with javascript.
Just input 5 in the text box and press the red button.
Two values in red will be returned.
Please refer to the image.
The code below is used to get the webpage.
However, I waited a long time and got no response.
What should I change in my code?
Thank you very much.
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
import pandas as pd
class Render(QWebEngineView):
    """Load *url* in a QWebEngineView, drive the page with JavaScript and capture its HTML.

    Constructing an instance starts the Qt event loop and blocks until
    ``callable`` calls ``app.quit()``; the captured markup is then stored in
    ``self.html``.
    """

    def __init__(self, url):
        self.html = None  # set by callable() once toHtml() delivers the markup
        self.first_pass = True  # True until the first loadFinished has been handled
        # The application object is created before the view's own __init__ runs.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        # Blocks here until self.app.quit() is called.
        self.app.exec_()

    def _load_finished(self, result):
        """Handle loadFinished: first emission runs the JS, later ones grab the HTML."""
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            # NOTE(review): this branch needs a second loadFinished emission.
            # Running JavaScript does not by itself reload the page, so this may
            # never be reached and exec_() would then never return — confirm.
            self._second_finished()

    def _first_finished(self):
        """Fill the distance box with "5" and invoke the page's CheckUserWhere()."""
        self.page().runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.page().runJavaScript("void(0)")
        self.page().runJavaScript("CheckUserWhere();")

    def _second_finished(self):
        """Request the page's HTML; the result is delivered to ``callable``."""
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the HTML passed by toHtml() and stop the event loop."""
        self.html = data
        self.app.quit()
# Page whose two result values are produced by JavaScript.
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
# Render.__init__ runs the Qt event loop, so this line blocks until it exits.
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
# Pick the <div id="divResult"> element out of the captured markup.
element = soup.find('div', {'id':"divResult"})
# Parse any <table> inside that element with pandas.
df = pd.read_html(str(element))
It seems that you have several misconceptions:
When js is executed, the page is not reloaded, so the _second_finished function will never be called.
If you do not want to show the window then it is better to use QWebEnginePage.
Considering the above the html that is obtained is:
<div class="p8-5" id="divResult" style="display:block;">
<div align="center" display="block" id="rsloading" style="display: block;">
<img src="//img2.soufunimg.com/qyb/loading.gif"/>
正在为您加载数据...
</div>
<table border="0" cellpadding="0" cellspacing="0" class="tablebox01" display="none" id="tbResult" style="display: none;" width="600">
<tbody><tr>
<td style="width:260px;"><span class="gray8">建设用地面积:</span>14748平方米</td>
<td style="width:340px;"><span class="gray8">所在城市:</span>山西省 长治市 </td>
</tr>
<tr>
<td><span class="gray8">规划建筑面积:</span>51617平方米</td>
<td><span class="gray8">土地评估楼面价:</span><b class="redc00 font14" id="_bpgj">867.61</b> 元/平方米</td>
</tr>
<tr>
<td><span class="gray8">容积率:</span>大于1并且小于或等于3.5</td>
<td><span class="gray8">土地评估总价:</span><b class="redc00 font14" id="_bSumPrice">4478.34</b> 万元</td>
</tr>
<tr>
<td><span class="gray8">规划用途:</span>住宅用地</td>
<td><span class="gray8">推出楼面价:</span>27.51元/平方米</td>
</tr>
</tbody></table>
</div>
So the simplest thing to do is to filter by the ids "_bpgj" and "_bSumPrice"
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from bs4 import BeautifulSoup
class Render(QtWebEngineWidgets.QWebEnginePage):
    """Window-less page loader that runs the page's JavaScript and captures its HTML.

    Constructing an instance starts the Qt event loop and blocks until the
    HTML has been captured or the load has failed. On success the markup is
    in ``self.html``; on a failed load ``self.html`` stays ``""``.
    """

    def __init__(self, url):
        self.html = ""
        self.first_pass = True
        self.app = QtWidgets.QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._load_finished)
        # Echo load progress (0-100) to stdout so a slow load is visible.
        self.loadProgress.connect(print)
        self.load(QtCore.QUrl(url))
        # Blocks until self.app.quit() is called.
        self.app.exec_()

    def _load_finished(self, result):
        """Run the page script on a successful load; otherwise stop the event loop."""
        if result:
            self.call_js()
        else:
            # Without this, a failed load would leave exec_() blocking forever
            # because callable() (the only other quit() site) is never invoked.
            self.app.quit()

    def call_js(self):
        """Enter "5" in the distance box, trigger CheckUserWhere() and request the HTML."""
        self.runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.runJavaScript("void(0)")
        self.runJavaScript("CheckUserWhere();")
        self.toHtml(self.callable)

    def callable(self, data):
        """Store the HTML delivered by toHtml() and stop the event loop."""
        self.html = data
        self.app.quit()
url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
# Render blocks until the page HTML has been captured.
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
# Both values live in <b> tags that are identified by their id attribute.
_bpgj, _bSumPrice = (
    soup.find('b', id=tag_id).string for tag_id in ("_bpgj", "_bSumPrice")
)
print(_bpgj, _bSumPrice)
Output:
867.61 4478.34
Related
import requests
from bs4 import BeautifulSoup
# Earlier request targets, left disabled:
# raw = requests.get("https://www.daum.net")
# raw = requests.get("http://127.0.0.1:5000/emp")
# Fetch the KOSPI listing page and raise on an HTTP error status.
response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
# Decode the body as EUC-KR; must be set before .text is read.
response.encoding= 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
# Every element with class "st2" that is a descendant of a <tr>.
result = bs.select("tr .st2")
<tr>
<td width='92' class='st2'><a href="javascript:goPrice('000020&MSid=&msPortfolioID=')" title='000020'>somethinbg</a></td>
<td width='60' align='right'>15,100</td>
<td width='40' align='right'><span class='t_12_blue'>▼300</span></td>
</tr>
I want to get data from a web page by using BeautifulSoup.
But I need to access the parent node of the element that has the class `st2`.
However, I am finding it really hard to do.
This is the code:
So, how can I get the data from the parent `<tr>` of the element that has `class='st2'`?
here is the example
You can access the parent element in BeautifulSoup via the element's parent attribute. Since you asked for a list of elements, this has to be done in an iteration.
I'm assuming here that you want to extract each row in the table.
import requests
from bs4 import BeautifulSoup
response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
# One inner list per matched cell: the texts of all <td> elements that are
# direct children of that cell's parent row.
result = [
    [cell.text for cell in marked.parent.findChildren('td', recursive=False)]
    for marked in bs.select('tr .st2')
]
The result is:
[
['동화약품', '15,100', '▼300'],
['유한양행', '61,100', '▼400'],
['유한양행우', '58,900', '▲300'],
...
]
I want to scrape the table on the following website
https://www.hkab.org.hk/DisplayInterestSettlementRatesAction.do
However, it has a very complicated query
I tried the following code, but I can't find the table I want.
# Page that displays the interest settlement rates.
url = "https://www.hkab.org.hk/DisplayInterestSettlementRatesAction.do"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
# First <table> whose class is "etxtmed" in the HTML returned by the request.
table = soup.find('table',{'class':'etxtmed'})
And the table result is:
<table border="0" cellpadding="4" cellspacing="0" class="etxtmed" width="100%">
<tr>
<td height="30" valign="top">Home
</td>
<td align="right" class="etxtsml" valign="top">
</td>
</tr>
</table>
How can I get the values of the table? I can't find them in the parsed result.
Some comments say that the table is generated by JavaScript. Is there any way to get the table values other than BeautifulSoup?
I tracked down where the data is loaded from and found the URL that serves it :).
import requests
from bs4 import BeautifulSoup
import csv
r = requests.get(
    'https://www.hkab.org.hk/hibor/listRates.do?lang=en&Submit=Detail')
# Fail loudly on an HTTP error instead of scraping an error page into the CSV.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# Maturity labels: right-aligned cells; the first two matches are skipped
# (presumably header cells — verify against the page).
mat = [cell.text.strip()
       for cell in soup.findAll('td', {'align': 'right'})[2:]]
# Rates: centre-aligned cells 3..10 (eight values), kept as found on the page.
hk = [cell.text
      for cell in soup.findAll('td', {'align': 'middle'})[3:11]]

# Pair each maturity with its rate; zip stops at the shorter list.
data = list(zip(mat, hk))

# newline='' lets the csv module control line endings itself.
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Maturity', 'HKD Interest\nSettlement Rate'])
    writer.writerows(data)
print("Operation Completed")
Output: Click Here
I am trying to refresh the content of a table every few seconds in my HTML page using JavaScript. I keep getting a 500 (internal server error) when it tries to refresh the div. Could someone explain why this is not working? I have used this: Refresh div using JQuery in Django while using the template system
as a reference to what I was doing. The page loads perfectly the first time just fails to refresh.
Here is my code:
urls.py
url(r'^specialScoreboard/$', views.specialScoreboard.as_view(), name='specialScoreboard'),
url(r'^specialScoreboardDiv/$', views.specialScoreboardDiv , name='specialScoreboardDiv'),
views.py
class specialScoreboard(generic.ListView):
    """Scoreboard page for staff users: the active game's teams, highest score first.

    Requests are redirected to ``CTF:no_active_game`` when there is no active
    game or the user is not staff.
    """
    template_name = 'CTF/specialScoreboard.html'
    context_object_name = 'teams'

    # NOTE(review): the two lines below are inert comments here; they may have
    # been decorators in the original file — confirm.
    #method_decorator(login_required)
    #method_decorator(never_ever_cache)
    def dispatch(self, request, *args, **kwargs):
        """Serve the list only to staff while a game is active; otherwise redirect."""
        # getAnyActiveGame has to be *called*: the bare function object is
        # always truthy, so the "is there an active game" check never failed.
        # Assumes it returns a falsy value when no game is active — confirm.
        if getAnyActiveGame() and request.user.is_staff:
            return super(specialScoreboard, self).dispatch(request, *args, **kwargs)
        return HttpResponseRedirect(reverse('CTF:no_active_game'))

    def get_queryset(self):
        """
        Return the active game's teams ordered by score, highest first.
        """
        game = getAnyActiveGame()
        teams = get_teams_from_game(game)
        return sorted(teams, key=lambda a: a.get_score(game), reverse=True)
# Renders only the scoreboard fragment, for the periodic AJAX refresh.
# NOTE(review): this function is routed from urls.py as a view but declares no
# parameters, so it cannot accept the request object Django passes to views.
def specialScoreboardDiv():
    game = getAnyActiveGame()
    teams = get_teams_from_game(game)
    # Highest score first.
    sortedList = sorted(teams, key=lambda a: a.get_score(game), reverse=True)
    return render_to_response('CTF/specialscoreboardDiv.html' , {'sortedList' :sortedList})
scoreboardRefresh.js + scoreboardDiv.html
<script>
var scoreboardURL = '{% url '
CTF: specialScoreboardDiv ' %}';
function refresh() {
$.ajax({
url: scoreboardURL,
success: function(data) {
$('#scoreboardDiv').html(data);
}
});
};
$(document).ready(function($) {
refresh();
setInterval("refresh()", 3000);
})
</script>
<div class="panel panel-info">
<div class="panel-heading">Scoreboard</div>
<div class="panel-body">
<div class="table-striped">
<table id="scoreboardDiv" class="table table-hover">
<thead>
<tr>
<th>#</th>
<th>Team Name</th>
<th>Score</th>
</tr>
</thead>
<tbody>
{% for team in teams %}
<tr>
<td>{{forloop.counter}}</td>
<td>{{team.name}}</td>
<td>{{team|getScoreTeam}}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
I can't seem to be able to format the error, here is a picture of it: http://i.imgur.com/Yc11juA.png
http://i.imgur.com/QluqZyc.png
http://imgur.com/QluqZyc
Your Django view takes no arguments, but Django always passes the request object to a view as its first argument. From the error in the screenshot you provided in the comments, it looks like this is your problem.
I think your error will be fixed by making your view function take this argument:
def specialScoreboardDiv(request):
game = getAnyActiveGame()
...
The following page gives access to product details by executing a Javascript request:
http://www.ooshop.com/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3
Each product has the following element:
<a id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_lbVisu" class="prodimg" href="javascript:__doPostBack('ctl00$cphC$pn3T1$ctl01$rp$ctl00$ctl00$lbVisu','')"><img id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_iVisu" title="Visualiser la fiche détail" class="image" onerror="this.src='/Media/images/null.gif';" src="Media/ProdImages/Produit/Vignettes/3270190199359.gif" alt="Dés de jambon" style="height:70px;width:70px;border-width:0px;margin-top:15px"></a>
I tried to use FormRequest from the Scrapy library to crawl these pages, but it does not seem to work:
<python>
import scrapy
from scrapy.http import FormRequest
from JStest.items import JstestItem
class ooshoptest2(scrapy.Spider):
    """Open each product's detail view by replaying its ASP.NET ``__doPostBack`` link."""
    name = "ooshoptest2"
    allowed_domains = ["ooshop.com"]
    start_urls = ["http://www.ooshop.com/courses-en-ligne/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3"]

    def parse(self, response):
        """Yield one form POST per product link found on the listing page."""
        # XPath attributes are addressed with '@', not '#'.
        path = '//div[@class="blockInside"]//ul/li/a'
        for balise in response.xpath(path):
            href = balise.xpath('@href').extract_first(default='')
            if '__doPostBack' not in href:
                continue  # not a postback link
            # href looks like: javascript:__doPostBack('<target>','')
            # The target is the text between the first pair of single quotes
            # and must be sent WITHOUT the surrounding quotes.
            target = href.split("'")[1]
            data = {'__EVENTTARGET': target, '__EVENTARGUMENT': ''}
            # from_response() copies the page form's hidden inputs into the
            # POST, which a hand-built FormRequest would omit.
            yield FormRequest.from_response(
                response,
                formdata=data,
                callback=self.parse_level1,
                dont_click=True,
                dont_filter=True)

    def parse_level1(self, response):
        """Print the product pop-up markup (``None`` if absent) and yield an empty item."""
        path = '//div[@class="popContent"]'
        test = response.xpath(path).extract_first()
        print(test)
        item = JstestItem()
        yield item
Does anyone know how to make this work?
Many thanks!
I will try to simplify my situation:
I have two PCs. PC1 (the "server") is not connected to the web, while PC2 (the "client") can browse the web; the two can communicate with each other.
PC1 hosts my Django project, the DB, etc., while from PC2 I can access my project via the browser, and so on.
I would like to achieve this goal:
PC1 takes the date and time from PC2 automatically, and every action on PC1 refers to the date and time of PC2.
| PC1 (server-side) | <--datetime--- | PC2 (client-side) |<----- datetime from web
Actually I can take the client datetime with some js:
datetime.js
$(document).ready(function () {
    // Repaint the element with id "data" using the browser's local date/time.
    function myTimer() {
        var now = new Date();
        // Built but not displayed yet: "Y-M-D" and "H:M:S" without zero padding.
        var strDate = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join("-");
        var strTime = [now.getHours(), now.getMinutes(), now.getSeconds()].join(":");
        document.getElementById("data").innerHTML = now.toLocaleString();
    }

    // Tick once per second.
    var myVar = setInterval(myTimer, 1000);
});
models.py
from django.db import models
class ProvaTime(models.Model):
    """A short text entry with an optional author and a creation timestamp."""
    somewords = models.CharField(max_length=30)
    # Nullable in the database, but blank=False makes it required in forms.
    myauthor = models.ForeignKey('auth.User', null=True, blank=False)
    # No default is declared, so a value must be assigned before saving.
    created_date = models.DateTimeField() # how tell it to pick js timedate?

    def save(self, *args, **kwargs):
        # Delegates straight to Model.save(); no extra behaviour is added here.
        super(ProvaTime, self).save(*args, **kwargs)
forms.py
from django import forms
from .models import ProvaTime
class ProvaTimeForm(forms.ModelForm):
    """ModelForm for ProvaTime that exposes only the ``somewords`` field."""

    class Meta:
        model = ProvaTime
        # myauthor and created_date are not part of the form.
        fields = ('somewords',)
views.py
def testime(request):
    """List all ProvaTime rows (newest first) and handle the creation form.

    GET renders an empty form. POST validates the form and, if valid, saves a
    new row owned by the requesting user. Both render
    ``provapp/testime.html`` with ``form`` and ``elements``.
    """
    elements = ProvaTime.objects.filter().order_by('-created_date')
    if request.method == 'POST':
        form = ProvaTimeForm(request.POST)
        if form.is_valid():
            obj = form.save(commit=False)
            # The model field is ``myauthor``; assigning ``obj.author`` only set
            # a stray Python attribute and the author was never stored.
            obj.myauthor = request.user
            # NOTE(review): created_date has no default and is not a form
            # field, so it still has to be assigned before this save().
            obj.save()
    else:
        form = ProvaTimeForm()
    return render(request, 'provapp/testime.html', {'form': form, 'elements': elements})
def myviewtomodel(request): # how to do?
provapp/testime.html
<form action="/testime/" method="post">
{% csrf_token %}
{{ form.as_table }}
<input type="submit" value="LIST" />
</form>
<br />
<br />
<div class="table-responsive">
<table class="table table-bordered">
<tr class="info">
<td width="15%" align="center">SOMETHING WORDS</td>
<td width="15%" align="center"> CREATION DATETIME </td>
<td width="15%" align="center"> AUTHOR </td>
<td>
</td>
{% for stuff in elements %}
<div class="stuff">
<tr>
<td>{{stuff.somewords}}</td>
<td>{{stuff.created_date}}</td>
<td>{{stuff.myauthor}}</td>
</div>
{% endfor %}
</table>
</div>
How can I send JS datetime to models.py?
You should pass the javascript date to your models explicitly.
model.created_date = date_from_javascript
If created_date has a default value such as timezone.now, it will use the server's time whenever nothing is passed to it.
ex.
$.ajax({
url: '/myview/',
data: {'javascript_date' : javascript_date},
method: 'post',
....
});
in your django view:
def MyView(request):
    """Create a MyModel row whose ``created_at`` is the client-supplied date.

    Expects a POST field named ``javascript_date``. Responds with 400 when the
    field is missing or empty, and with 204 once the row has been saved.
    """
    from django.http import HttpResponse

    javascript_date = request.POST.get('javascript_date')
    if not javascript_date:
        # A missing key used to surface as an unhandled server error.
        return HttpResponse("javascript_date is required", status=400)
    m = MyModel(created_at=javascript_date)
    m.save()
    # A Django view must return a response object; falling off the end
    # returns None, which Django rejects.
    return HttpResponse(status=204)
I would recommend not saving the client time directly in the database. When there are multiple clients in different timezones, a client will see the wrong time for a record that was created in another timezone. I suggest you store the offset value in the session (or in the database) and use it when displaying data.
Now let's make an ajax request from the home page (or another page) which sends the offset from the browser to the server.
$(document).ready(function () {
    var now = new Date();
    // Difference between UTC and local time, in minutes (UTC minus local).
    var value = now.getTimezoneOffset();
    $.ajax({
        url: "your-url",
        type: "post", // or "get"
        // Send the offset as a string: jQuery runs non-string data through
        // $.param(), which turns a bare number into an empty body, and a
        // numeric 0 (UTC) is skipped entirely because it is falsy.
        data: String(value),
        success: function (data) {
            console.log(data);
        }
    });
});
Add an extra field to the model to store that offset, and a property method which will later calculate the user's local time.
class ProvaTime(models.Model):
....
tz_offset = models.DecimalField(default=Decimal('0.0'),max_digits=3, decimal_places=1)
@property
def get_local_time(self):
    """Return ``created_date`` shifted into the client's local time.

    ``tz_offset`` holds the value posted by the browser's
    ``Date.getTimezoneOffset()``, i.e. minutes of (UTC - local), so the
    local time is the stored time minus that many minutes.
    """
    import datetime

    # timedelta's first positional argument is days, so the unit has to be
    # named explicitly. float() is needed because the field yields a Decimal.
    # NOTE(review): the field is declared with max_digits=3, decimal_places=1,
    # which cannot hold offsets of 100 minutes or more — widen it.
    return self.created_date - datetime.timedelta(minutes=float(self.tz_offset))
And here is the view that handles the ajax request:
import json
def post(request):
    """Store the posted timezone offset on every ProvaTime row of the current user.

    The request body is expected to be the offset in minutes as plain text.
    Always answers with JSON: ``{"success": ...}`` on success, or
    ``{"error": ...}`` with status 405 (not a POST) or 400 (body is not a
    number).
    """
    from decimal import Decimal, InvalidOperation

    data = {}
    # request.POST is a QueryDict, not a callable; check the HTTP method instead.
    if request.method != 'POST':
        data['error'] = 'POST required'
        return HttpResponse(json.dumps(data), content_type="application/json", status=405)

    try:
        # request.body is bytes; convert it to the Decimal the field expects.
        tz_offset = Decimal(request.body.decode('utf-8').strip())
    except (UnicodeDecodeError, InvalidOperation) as e:
        # str(e): an exception object itself is not JSON-serialisable.
        data['error'] = str(e)
        return HttpResponse(json.dumps(data), content_type="application/json", status=400)

    for prova_time in ProvaTime.objects.filter(myauthor=request.user):
        prova_time.tz_offset = tz_offset
        prova_time.save()
    data['success'] = 'Success'
    return HttpResponse(json.dumps(data), content_type="application/json")
Now displaying the data:
def myviewtomodel(request):
    """Render every ProvaTime row into ``myapp/index.html``, served as XHTML."""
    return render(
        request,
        'myapp/index.html',
        {'elements': ProvaTime.objects.all()},
        content_type="application/xhtml+xml",
    )
Showing time in template(using get_local_time).
<div class="table-responsive">
<table class="table table-bordered">
<tr class="info">
<td width="15%" align="center">SOMETHING WORDS</td>
<td width="15%" align="center"> CREATION DATETIME </td>
<td width="15%" align="center"> AUTHOR </td>
<td>
</td>
{% for stuff in elements %}
<div class="stuff">
<tr>
<td>{{stuff.somewords}}</td>
<td>{{stuff.get_local_time}}</td>
<td>{{stuff.myauthor}}</td>
</div>
{% endfor %}
</table>
</div>