How to access 'parent' from BeautifulSoup using select or find? - javascript

I want to get data from this page using BeautifulSoup, but I need to access the parent node of the element I matched, and it's proving really hard to do. This is the code:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
result = bs.select("tr .st2")

Here is an example of the markup:

<tr>
<td width='92' class='st2'><a href="javascript:goPrice('000020&MSid=&msPortfolioID=')" title='000020'>something</a></td>
<td width='60' align='right'>15,100</td>
<td width='40' align='right'><span class='t_12_blue'>▼300</span></td>
</tr>

How can I get the data from the parent <tr> whose <td> has class 'st2'?

You can access the parent element in BeautifulSoup via the element's parent attribute. Since you selected a list of elements, this has to be done in a loop.
I'm assuming here that you want to extract each row of the table.
import requests
from bs4 import BeautifulSoup
response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
html = response.text
bs = BeautifulSoup(html, 'html.parser')
result = [[child.text for child in elem.parent.findChildren('td', recursive=False)]
          for elem in bs.select('tr .st2')]
The result is:
[
['동화약품', '15,100', '▼300'],
['유한양행', '61,100', '▼400'],
['유한양행우', '58,900', '▲300'],
...
]
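Equivalently, you can name the parent you want with find_parent, which reads a little clearer than bare .parent; a minimal sketch of the same extraction:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://vip.mk.co.kr/newSt/rate/item_all.php?koskok=KOSPI&orderBy=upjong")
response.raise_for_status()
response.encoding = 'EUC-KR'
bs = BeautifulSoup(response.text, 'html.parser')

rows = []
for cell in bs.select('tr .st2'):
    row = cell.find_parent('tr')  # nearest enclosing <tr> of the matched cell
    rows.append([td.get_text(strip=True) for td in row.find_all('td', recursive=False)])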

Related

Insert nested XML nodes if node doesn't exist using Groovy/JavaScript

Below is the XML I am working with
<Root>
<Employee>
<EmployeeNumber>10105950</EmployeeNumber>
<LastName>Myers Jr</LastName>
<FirstName>Ezell</FirstName>
<MiddleName/>
<JobTitle>Tech Shield Strapper</JobTitle>
<Manager>N</Manager>
<ManagerID>50501273</ManagerID>
<BusinessEmail>10105950#LPnoemail.com</BusinessEmail>
<Location>TX06</Location>
<CostCenter>10200011</CostCenter>
<HomeLocationCode>TX06 10200011</HomeLocationCode>
<cust_HomeLocationCode>
<cust_LocationCode>30</cust_LocationCode>
</cust_HomeLocationCode>
<Username>myers10105950</Username>
<CostCenterName>JASPER OSB Finishing Techshield</CostCenterName>
<WorkContract>A2</WorkContract>
<HireDate>11/14/2002</HireDate>
<PreferredLanguage>1706</PreferredLanguage>
<PreferredName>Ezell</PreferredName>
</Employee>
<Employee>
<EmployeeNumber>10105951</EmployeeNumber>
<LastName>Bean</LastName>
<FirstName>David</FirstName>
<MiddleName>A</MiddleName>
<JobTitle>Sr. FP&A Analyst</JobTitle>
<Manager>N</Manager>
<ManagerID>10129003</ManagerID>
<BusinessEmail>10105951#LPnoemail.com</BusinessEmail>
<Location>TX06</Location>
<CostCenter>11000152</CostCenter>
<HomeLocationCode>TX06 11000152</HomeLocationCode>
<Username>beanda</Username>
<CostCenterName>OSB Finance</CostCenterName>
<WorkContract>A2</WorkContract>
<HireDate>07/05/1994</HireDate>
<PreferredLanguage>1706</PreferredLanguage>
<PreferredName>David</PreferredName>
</Employee>
</Root>
I need help writing a Groovy script or JavaScript to evaluate the XML. If <cust_HomeLocationCode> and its contents exist, then skip it; otherwise insert the node with a <cust_LocationCode> value of '1'.
So far I have only been able to add the <cust_HomeLocationCode> node itself, using the code below:
import com.sap.gateway.ip.core.customdev.util.Message;
import java.util.HashMap;
import groovy.xml.XmlUtil;
import groovy.util.*;

def Message processData(Message message) {
    def body = message.getBody(java.lang.String);
    def root = new XmlParser().parseText(body)
    root.each { Employee ->
        Employee.appendNode("cust_HomeLocationCode", "newfield")
    }
    message.setBody(XmlUtil.serialize(root));
    return message;
}
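Not a full answer, but a minimal Groovy sketch of the missing conditional, building on the snippet above (untested; it assumes the same SAP CPI Message API and imports, and relies on XmlParser returning a NodeList for emp.cust_HomeLocationCode, which is falsy when empty):

def Message processData(Message message) {
    def body = message.getBody(java.lang.String)
    def root = new XmlParser().parseText(body)
    root.Employee.each { emp ->
        // insert only when the node is missing
        if (!emp.cust_HomeLocationCode) {
            def home = emp.appendNode('cust_HomeLocationCode')
            home.appendNode('cust_LocationCode', '1')
        }
    }
    message.setBody(XmlUtil.serialize(root))
    return message
}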

How to create language selection wrapper from a gist script?

I have a Gist whose files are written in different languages, all doing the same thing.
So I would like to create a language select option similar to the Google Docs documentation.
Is it possible to create such a wrapper class that accepts a Gist script tag and displays it like that?
As with embedding a single file, I tried different query strings such as <script src="https://gist.github.com/gistid.js?language=python">, but none of them work.
This is the processing code I ended up with.
With some CSS + JavaScript hide-and-toggle logic, it would work like the Google Docs documentation.
I'd appreciate it if anyone updated this answer with the CSS or JS.
import html
from itertools import islice

import requests
from bs4 import BeautifulSoup

# git_credential (auth headers) and gist (a result holder with
# .file_names and .html) are defined elsewhere in my code.

def render_gist_by_file(gist_id):
    result = requests.get(f'https://gist.github.com/{gist_id}.js', headers=git_credential)
    if result.text.startswith("<!DOCTYPE html>"):  # gist not found
        return None
    result = result.text
    # unescape the JS string literal that document.write() receives
    result = result.replace("\\/", "/").replace("\\&", "&").replace("\\$", "$").replace("\\<", "<").replace("\\`", "`").replace("\\n", "\n").replace('\\"', '"')
    result = html.unescape(result)
    result = result.split("document.write('")[-1][:-3]
    bs = BeautifulSoup(result, "html.parser")
    for tag in bs.find_all(class_="gist"):
        file_box = tag.find(class_="file-box")
        root = tag.find(class_="file-box")
        toggle_div = bs.new_tag('div', attrs={"class": "gist-meta"})
        for i, d in enumerate(tag.find_all(class_="file")):
            d["class"] = f"file gist-toggle gist-id-{gist_id}"
            if i != 0:
                file_box.append(d)  # combine into the first table
        for d in tag.find_all(class_="gist-meta"):
            siblings = list(d.next_elements)
            file_id, file_name = siblings[4].attrs["href"].split("#")[-1], siblings[5]
            gist.file_names.append(file_name)
            toggle_a = bs.new_tag('a', attrs={"id": file_id, "class": f"gist-toggle gist-id-{gist_id}", "onclick": f"toggle('gist-id-{gist_id}', '{file_id}')", "style": "padding: 0 18px"})
            toggle_a.append(file_name)
            toggle_div.append(toggle_a)
            d.extract()  # remove bottom nav
        root.insert(0, toggle_div)
        for d in islice(tag.find_all(class_="gist-file"), 1, None):
            d.extract()  # remove all but the first
    gist.html = str(bs)
    return gist
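A hypothetical call, with a placeholder gist id:

g = render_gist_by_file("username/0123456789abcdef")  # placeholder id
if g is not None:
    print(g.html)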

Beautiful soup complicated query question

I want to scrape the table on the following website:
https://www.hkab.org.hk/DisplayInterestSettlementRatesAction.do
However, it is behind a very complicated query.
I tried the following code, but I can't find the table I want.
import requests
from bs4 import BeautifulSoup

url = "https://www.hkab.org.hk/DisplayInterestSettlementRatesAction.do"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('table', {'class': 'etxtmed'})
And the table result is:
<table border="0" cellpadding="4" cellspacing="0" class="etxtmed" width="100%">
<tr>
<td height="30" valign="top">Home
</td>
<td align="right" class="etxtsml" valign="top">
</td>
</tr>
</table>
How can I get the values of the table? I can't find them in the response.
Some comments say the table is generated by JavaScript; any suggestions for getting the table values, instead of plain BeautifulSoup?
I tracked down where the data is loaded from and found the URL :).
import requests
from bs4 import BeautifulSoup
import csv

r = requests.get(
    'https://www.hkab.org.hk/hibor/listRates.do?lang=en&Submit=Detail')
soup = BeautifulSoup(r.text, 'html.parser')

mat = []
hk = []
for item in soup.findAll('td', {'align': 'right'})[2:]:
    item = item.text.strip()
    mat.append(item)
for item in soup.findAll('td', {'align': 'middle'})[3:11]:
    item = item.text
    hk.append(item)

data = []
for item in zip(mat, hk):
    data.append(item)

with open('output.csv', 'w+', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Maturity', 'HKD Interest\nSettlement Rate'])
    writer.writerows(data)

print("Operation Completed")
The maturity/rate pairs are written to output.csv.

How to get the webpage using pyqt5?

I want to get a webpage using PyQt5.
The URL is https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html.
The webpage generates two values with JavaScript: just enter 5 in the text box and press the red button, and the two values in red are returned. Please refer to the image.
The code below is used to get the webpage, but I have waited a long time and there is no response.
What should I change in my code?
Thank you very much.
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
import pandas as pd

class Render(QWebEngineView):
    def __init__(self, url):
        self.html = None
        self.first_pass = True
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            self._second_finished()

    def _first_finished(self):
        self.page().runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.page().runJavaScript("void(0)")
        self.page().runJavaScript("CheckUserWhere();")

    def _second_finished(self):
        self.page().toHtml(self.callable)

    def callable(self, data):
        self.html = data
        self.app.quit()

url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
element = soup.find('div', {'id':"divResult"})
df = pd.read_html(str(element))
It seems that you have several misconceptions:

- When the JS is executed, the page is not reloaded, so the _second_finished function will never be called.
- If you do not want to show the window, it is better to use QWebEnginePage.

Considering the above, the HTML that is obtained is:
<div class="p8-5" id="divResult" style="display:block;">
<div align="center" display="block" id="rsloading" style="display: block;">
<img src="//img2.soufunimg.com/qyb/loading.gif"/>
正在为您加载数据...
</div>
<table border="0" cellpadding="0" cellspacing="0" class="tablebox01" display="none" id="tbResult" style="display: none;" width="600">
<tbody><tr>
<td style="width:260px;"><span class="gray8">建设用地面积:</span>14748平方米</td>
<td style="width:340px;"><span class="gray8">所在城市:</span>山西省 长治市 </td>
</tr>
<tr>
<td><span class="gray8">规划建筑面积:</span>51617平方米</td>
<td><span class="gray8">土地评估楼面价:</span><b class="redc00 font14" id="_bpgj">867.61</b> 元/平方米</td>
</tr>
<tr>
<td><span class="gray8">容积率:</span>大于1并且小于或等于3.5</td>
<td><span class="gray8">土地评估总价:</span><b class="redc00 font14" id="_bSumPrice">4478.34</b> 万元</td>
</tr>
<tr>
<td><span class="gray8">规划用途:</span>住宅用地</td>
<td><span class="gray8">推出楼面价:</span>27.51元/平方米</td>
</tr>
</tbody></table>
</div>
So the simplest thing to do is to filter by the ids "_bpgj" and "_bSumPrice":
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from bs4 import BeautifulSoup

class Render(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self, url):
        self.html = ""
        self.first_pass = True
        self.app = QtWidgets.QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._load_finished)
        self.loadProgress.connect(print)
        self.load(QtCore.QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        if result:
            self.call_js()

    def call_js(self):
        self.runJavaScript('document.getElementById("txtDistance").value = "5";')
        self.runJavaScript("void(0)")
        self.runJavaScript("CheckUserWhere();")
        self.toHtml(self.callable)

    def callable(self, data):
        self.html = data
        self.app.quit()

url = "https://land.3fang.com/LandAssessment/b6d8b2c8-bd4f-4bd4-9d22-ca49a7a2dc1f.html"
web = Render(url)
soup = BeautifulSoup(web.html, 'html.parser')
_bpgj = soup.find('b', {'id':"_bpgj"}).string
_bSumPrice = soup.find('b', {'id':"_bSumPrice"}).string
print(_bpgj, _bSumPrice)
Output:
867.61 4478.34

Scrape Javascript-generated page using Scrapy

The following page gives access to product details by executing a Javascript request:
http://www.ooshop.com/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3
Each product has the following element:
<a id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_lbVisu" class="prodimg" href="javascript:__doPostBack('ctl00$cphC$pn3T1$ctl01$rp$ctl00$ctl00$lbVisu','')"><img id="ctl00_cphC_pn3T1_ctl01_rp_ctl00_ctl00_iVisu" title="Visualiser la fiche détail" class="image" onerror="this.src='/Media/images/null.gif';" src="Media/ProdImages/Produit/Vignettes/3270190199359.gif" alt="Dés de jambon" style="height:70px;width:70px;border-width:0px;margin-top:15px"></a>
I tried to use FormRequest from the Scrapy library to crawl these pages, but it does not seem to work:
import scrapy
from scrapy.http import FormRequest
from JStest.items import JstestItem

class ooshoptest2(scrapy.Spider):
    name = "ooshoptest2"
    allowed_domains = ["ooshop.com"]
    start_urls = ["http://www.ooshop.com/courses-en-ligne/ContentNavigation.aspx?TO_NOEUD_IDMO=N000000013143&FROM_NOEUD_IDMO=N000000013131&TO_NOEUD_IDFO=81080&NOEUD_NIVEAU=2&UNIVERS_INDEX=3"]

    def parse(self, response):
        URL = response.url
        path = '//div[@class="blockInside"]//ul/li/a'
        for balise in response.xpath(path):
            jsrequest = response.urljoin(balise.xpath('@href').extract()[0])
            js = "'" + jsrequest[25:-5] + "'"
            data = {'__EVENTTARGET': js, '__EVENTARGUMENT': ''}
            yield FormRequest(url=URL,
                              method='POST',
                              callback=self.parse_level1,
                              formdata=data,
                              dont_filter=True)

    def parse_level1(self, response):
        path = '//div[@class="popContent"]'
        test = response.xpath(path)[0].extract()
        print(test)
        item = JstestItem()
        yield item
Does anyone know how to make this work?
Many thanks!
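Not a verified fix, but one likely culprit: an ASP.NET __doPostBack needs the page's hidden state fields (__VIEWSTATE, __EVENTVALIDATION, ...) posted back with the request, and __EVENTTARGET should be the raw control id, not a re-quoted slice of the href. FormRequest.from_response copies those hidden fields from the page's form automatically; a sketch of parse along those lines (untested against this site, using the XPath from the question):

def parse(self, response):
    for balise in response.xpath('//div[@class="blockInside"]//ul/li/a'):
        href = balise.xpath('@href').extract_first()
        # "javascript:__doPostBack('ctl00$cphC$...','')" -> "ctl00$cphC$..."
        target = href.split("'")[1]
        yield FormRequest.from_response(
            response,
            formdata={'__EVENTTARGET': target, '__EVENTARGUMENT': ''},
            callback=self.parse_level1,
            dont_filter=True)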
