Scraping a website with content added via javascript with PySide in Python

Scraping a website with content added via javascript with PySide in Python - javascript

I am trying to scrape data from a website using PyQt4 in Python. However, this website adds the data I'm interested in via JavaScript. Is there a way to ask Selenium to wait for the data before returning it? So far, we've tried:
import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup
def test():
    """Print a fixed marker string (wired to the timer's timeout in Render)."""
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("coucou")
class Render(QWebPage):
    """Load a URL in a QWebPage and keep its main frame for later inspection.

    Constructing an instance creates the QApplication, starts loading ``url``
    and blocks inside the Qt event loop until ``loadFinished`` fires.
    """

    def __init__(self, url):
        # The application object is created first, before the QWebPage
        # base-class initialiser runs.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        # NOTE(review): this single-shot 10 s timer is configured and wired to
        # test(), but nothing in this class ever calls start() on it.
        self.timerScreen = QTimer()
        self.timerScreen.setInterval(10000)
        self.timerScreen.setSingleShot(True)
        self.timerScreen.timeout.connect(test)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        """Remember the main frame and leave the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
def main():
    """Render the target URL, print its HTML and inspect the <body> element."""
    url = '[redacted]'
    r = Render(url)
    html = r.frame.toHtml()

    # Re-parse the captured HTML in a second page with images and plugins off.
    page = QWebPage()
    page.settings().setAttribute(QWebSettings.AutoLoadImages, False)
    page.settings().setAttribute(QWebSettings.PluginsEnabled, False)
    page.mainFrame().setHtml(html)
    dom = page.mainFrame().documentElement()
    li = dom.findFirst("body")
    print(html)
    if not li.isNull():
        # Read but not used further in this snippet.
        classe = li.attribute("class")
        text = li.toPlainText()


main()
Unfortunately, the content of the page doesn't show the relevant data.
We're using Python 2.7.5 and PySide 1.2.2.
Thanks in advance.

Related

How do I add a js map to Pyqt5?

I want to add the map provided by Marinetraffic to pyqt5. When I add the HTML codes provided by MarineTraffic to my own program, it doesn't work.
The map I want to add:
MarineTraffic Map JS
from PyQt5 import QtCore, QtGui, QtWidgets, QtWebEngineWidgets, QtWebChannel
class Backend(QtCore.QObject):
    """QObject exposing a string ``value`` property with a change signal."""

    # Emitted with the new string every time ``value`` is assigned.
    valueChanged = QtCore.pyqtSignal(str)

    def __init__(self, parent=None):
        super().__init__(parent)
        self._value = ""

    # The decorators had been turned into plain comments, which made the
    # second ``value`` definition silently replace the first one.
    @QtCore.pyqtProperty(str)
    def value(self):
        """Return the currently stored string."""
        return self._value

    @value.setter
    def value(self, v):
        """Store ``v`` and emit ``valueChanged`` with it."""
        self._value = v
        self.valueChanged.emit(v)
class Widget(QtWidgets.QWidget):
    """Widget stacking a web view above a label fed by a Backend object."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.webEngineView = QtWebEngineWidgets.QWebEngineView()
        self.label = QtWidgets.QLabel(alignment=QtCore.Qt.AlignCenter)

        lay = QtWidgets.QVBoxLayout(self)
        lay.addWidget(self.webEngineView, stretch=1)
        lay.addWidget(self.label, stretch=1)

        # Every value change updates the label and is echoed by foo_function.
        backend = Backend(self)
        backend.valueChanged.connect(self.label.setText)
        backend.valueChanged.connect(self.foo_function)

        # Register the backend under the name "backend" on the page's channel.
        self.channel = QtWebChannel.QWebChannel()
        self.channel.registerObject("backend", backend)
        self.webEngineView.page().setWebChannel(self.channel)

        path = "index.html"
        self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(path))

    @QtCore.pyqtSlot(str)
    def foo_function(self, value):
        """Print the value received from the backend."""
        print(value)
if __name__ == "__main__":
    import sys

    # Create the application, show the widget and hand control to Qt.
    app = QtWidgets.QApplication(sys.argv)
    w = Widget()
    w.show()
    sys.exit(app.exec_())
When I run it, I get a connection failed error.
I get the same error with every method I found while searching. What am I doing wrong? Can you help?

Read the documentation for QtCore.QUrl.fromLocalFile :
QtCore.QUrl.fromLocalFile
"A file URL with a relative path only makes sense if there is a base URL to resolve it against."
So we add the base path:
import os
...
path = os.getcwd() + "\\index.html"
self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(path))
Added path compatibility between os (edited)
from pathlib import Path
...
base_path = Path(Path.cwd())
full_path = base_path.joinpath('index.html')
self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(str(full_path)))

Scrapy only scraping the first two pages

I'm trying to scrape a website but need to use Splash on all pages because their content is created dynamically. Right now it renders only the first 2 pages, even though there are 47 pages in total.
Here's the code:
import scrapy
from scrapy.http import Request
from scrapy_splash import SplashRequest
class JobsSpider(scrapy.Spider):
    """Spider that renders the job search pages through Splash.

    Each listing page is rendered by a Lua script; every job link found on it
    is fetched with a plain request and parsed by ``parse_car``.
    """

    name = 'jobs'
    start_urls = ['https://jobs.citizensbank.com/search-jobs']

    def start_requests(self):
        """Request each start URL through Splash's ``execute`` endpoint."""
        # Lua source is kept exactly as written; it loads the page, waits 3 s
        # and returns the rendered HTML.
        filters_script = """function main(splash)
assert(splash:go(splash.args.url))
splash:wait(3)
return splash:html()
end"""
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='execute',
                                args={'lua_source': filters_script})

    def parse(self, response):
        """Queue every job link on the page, then request the next page."""
        # XPath attribute tests use '@'; the '#' characters here were damage.
        cars_urls = response.xpath(
            './/section[@id="search-results-list"]/ul/li/a/@href').extract()
        for car_url in cars_urls:
            absolute_car_url = response.urljoin(car_url)
            yield scrapy.Request(absolute_car_url,
                                 callback=self.parse_car)

        # Loads the page, clicks the "next" link and returns the new URL + HTML.
        script_at_page_1 = """function main(splash)
assert(splash:go(splash.args.url))
splash:wait(3)
next_button = splash:select("a[class=next]")
next_button.mouse_click()
splash:wait(3)
return {
url = splash:url(),
html = splash:html()
}
end"""
        # The second script had exactly the same text as the first one.
        script_at_page_2 = script_at_page_1

        # Compare URLs by value: 'is not' tests object identity, which is
        # true for two equal strings that are distinct objects.
        if response.url != self.start_urls[0]:
            script = script_at_page_2
        else:
            script = script_at_page_1
        yield SplashRequest(url=response.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': script})

    def parse_car(self, response):
        """Extract title, location and id from a job detail page."""
        jobtitle = response.xpath('//h1[@itemprop="title"]/text()').extract_first()
        location = response.xpath('//span[@class="job-info"]/text()').extract_first()
        jobid = response.xpath('//span[@class="job-id job-info"]/text()').extract_first()
        yield {'jobtitle': jobtitle,
               'location': location,
               'jobid': jobid}
I've played with it in every way I could think of, but it didn't work.
I'm new to scrapy so any help appreciated.

I think you do not need to use Splash for this. If you look at the network tab of your browser inspector you will see it is making requests to this URL under XHR:
https://jobs.citizensbank.com/search-jobs/results?ActiveFacetID=0&CurrentPage=3&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf=
Try making requests to this URL and change the page each time. If you have trouble you may need to look at the headers of the XHR request and replicate them as well. If you click the link the JSON will load in your browser. So just set page 1 as your start_url and override start_requests as follows:
start_urls = ['https://jobs.citizensbank.com/search-jobs/results?ActiveFacetID=0&CurrentPage={}&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf=']
def start_requests(self):
    """Request result pages 1..num_pages by filling the page placeholder."""
    num_pages = 10
    # range() excludes its stop value, so add 1 to include the last page.
    for page in range(1, num_pages + 1):
        yield scrapy.Request(self.start_urls[0].format(page), callback=self.parse)
It's also worth noting that you can set the RecordsPerPage parameter. You may be able to set it higher and possibly get all records on one page, or at least make fewer requests to get all records.

Video is freezed while video streaming

I found the following code for streaming video over a socket in Python 2.7. When I run it, the video freezes at the very beginning on the server side (it is shown in a web browser). I debugged the code and found that, in streamer.py, the third while loop never terminates because its condition, while len(data) < msg_size:, is always satisfied. In other words, len(data) is always less than msg_size. As a result, streamer.py never returns the image to server.py. Could anyone help me solve this issue?
The server.py is:
from flask import Flask, render_template, Response
from streamer import Streamer
app = Flask(__name__)
def gen():
    """Yield multipart JPEG chunks from a Streamer on localhost:8089."""
    streamer = Streamer('localhost', 8089)
    streamer.start()
    while True:
        # Frames are only emitted while the streamer reports a client.
        if streamer.client_connected():
            yield (b'--frame\r\n'b'Content-Type: image/jpeg\r\n\r\n' +
                   streamer.get_jpeg() + b'\r\n\r\n')
@app.route('/')
def index():
    """Serve the index.html template."""
    return render_template('index.html')
@app.route('/video_feed')
def video_feed():
    """Stream the frames produced by gen() as a multipart response."""
    # The mimetype literal had been split across two lines, which left the
    # string unterminated; it is a single literal again.
    return Response(gen(), mimetype='multipart/x-mixed-replace; boundary=frame')
if __name__ == '__main__':
    # Run the Flask app on localhost with threading enabled.
    app.run(host='localhost', threaded=True)
The streamer.py is:
import threading
import socket
import struct
import StringIO
import json
import numpy
class Streamer (threading.Thread):
    """Thread that accepts a TCP client and keeps the latest frame as JPEG.

    Each message is read as a struct-packed "L" length prefix followed by a
    JSON string that wraps a numpy-saved frame.
    """

    def __init__(self, hostname, port):
        threading.Thread.__init__(self)
        self.hostname = hostname
        self.port = port
        self.connected = False
        self.jpeg = None

    def run(self):
        """Listen on (hostname, port) and decode frames until stopped."""
        self.isRunning = True
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        print('Socket created')
        s.bind((self.hostname, self.port))
        print('Socket bind complete')
        data = ""
        payload_size = struct.calcsize("L")
        s.listen(10)
        print('Socket now listening')
        while self.isRunning:
            conn, addr = s.accept()
            print('while 1...')
            while True:
                data = conn.recv(4096)
                print('while 2...')
                if data:
                    # Split the length prefix from the start of the payload.
                    packed_msg_size = data[:payload_size]
                    data = data[payload_size:]
                    msg_size = struct.unpack("L", packed_msg_size)[0]
                    while len(data) < msg_size:# the infinite loop is here(my problem)!
                        data += conn.recv(4096)
                        print ("lenght of data is " , len(data) )
                        print ("message size is " , msg_size )
                    frame_data = data[:msg_size]
                    #frame_data = data[:len(data)]
                    memfile = StringIO.StringIO()
                    memfile.write(json.loads(frame_data).encode('latin-1'))
                    memfile.seek(0)
                    frame = numpy.load(memfile)
                    # NOTE(review): cv2 is not imported in the visible import
                    # list of this snippet.
                    ret, jpeg = cv2.imencode('.jpg', frame)
                    self.jpeg = jpeg
                    self.connected = True
                    print('recieving...')
                else:
                    # An empty read ends this client session.
                    conn.close()
                    self.connected = False
                    print('connected=false...')
                    break
        self.connected = False

    def stop(self):
        """Ask the run loop to exit after the current iteration."""
        self.isRunning = False

    def client_connected(self):
        """Return True while a frame source is considered connected."""
        return self.connected

    def get_jpeg(self):
        """Return the most recent JPEG frame as bytes."""
        return self.jpeg.tobytes()
Client.py is:
import socket
import sys
import pickle
import struct
import StringIO
import json
import time
# cv2 and np are used below but were missing from the import list above.
import cv2
import numpy as np

cap = cv2.VideoCapture(0)
clientsocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
clientsocket.connect(('localhost', 8089))
while cap.isOpened():
    ret, frame = cap.read()
    # Serialise the frame with numpy, wrap it in JSON and send it with a
    # struct-packed "L" length prefix.
    memfile = StringIO.StringIO()
    np.save(memfile, frame)  # was the undefined name 'fravidme'
    memfile.seek(0)
    data = json.dumps(memfile.read().decode('latin-1'))
    clientsocket.sendall(struct.pack("L", len(data)) + data)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
I want to show the video captured by my laptop's camera on a client machine in the same network. I expect a continuous video stream, but in the browser I only see a single image that never updates.

As I analyzed this code I noticed that the default implementation for sending OpenCV frames over the network was not working. I decided to replace it with ZeroMQ implementation I have used before. You can check out the linked question for a deeper explanation of how the streaming works. I have neatly packaged it into classes, with unit tests and documentation as SmoothStream check it out too.
Coming back to the question, here is the working code.
client.py
import base64
import cv2
import zmq
context = zmq.Context()
footage_socket = context.socket(zmq.PUB)
footage_socket.connect('tcp://localhost:5555')
camera = cv2.VideoCapture(0)  # init the camera
while True:
    try:
        grabbed, frame = camera.read()  # grab the current frame
        frame = cv2.resize(frame, (640, 480))  # resize the frame
        encoded, buffer = cv2.imencode('.jpg', frame)
        # Publish the JPEG bytes as base64 text.
        jpg_as_text = base64.b64encode(buffer)
        footage_socket.send(jpg_as_text)
    except KeyboardInterrupt:
        # Ctrl-C releases the camera and stops the loop.
        camera.release()
        cv2.destroyAllWindows()
        break
server.py
from flask import Flask, render_template, Response
from streamer import Streamer
app = Flask(__name__)
def gen():
    """Yield multipart JPEG chunks from a Streamer bound to port 5555."""
    streamer = Streamer('*', 5555)
    streamer.start()
    while True:
        # Frames are only emitted while the streamer reports a client.
        if streamer.client_connected():
            yield (b'--frame\r\n'b'Content-Type: image/jpeg\r\n\r\n' + streamer.get_jpeg() + b'\r\n\r\n')
@app.route('/')
def index():
    """Serve the index.html template."""
    return render_template('index.html')
@app.route('/video_feed')
def video_feed():
    """Stream the frames produced by gen() as a multipart response."""
    return Response(gen(), mimetype='multipart/x-mixed-replace; boundary=frame')
if __name__ == '__main__':
    # Run the Flask app on localhost with threading enabled.
    app.run(host='localhost', threaded=True)
streamer.py
import base64
import threading
import cv2
import numpy as np
import zmq
class Streamer(threading.Thread):
    """Thread that receives base64 JPEG frames on a ZeroMQ SUB socket."""

    def __init__(self, hostname, port):
        threading.Thread.__init__(self)
        self.hostname = hostname
        self.port = port
        self.connected = False
        self.jpeg = None

    def run(self):
        """Bind the SUB socket and keep the latest decoded frame as JPEG."""
        self.isRunning = True
        context = zmq.Context()
        footage_socket = context.socket(zmq.SUB)
        footage_socket.bind('tcp://{}:{}'.format(self.hostname, self.port))
        # Subscribe with an empty prefix. A plain unicode literal replaces
        # np.unicode, an alias that newer NumPy releases no longer provide.
        footage_socket.setsockopt_string(zmq.SUBSCRIBE, u'')
        while self.isRunning:
            frame = footage_socket.recv_string()
            img = base64.b64decode(frame)
            # frombuffer replaces the deprecated binary mode of np.fromstring.
            npimg = np.frombuffer(img, dtype=np.uint8)
            source = cv2.imdecode(npimg, 1)
            ret, jpeg = cv2.imencode('.jpg', source)
            self.jpeg = jpeg
            self.connected = True
        self.connected = False

    def stop(self):
        """Ask the run loop to exit after the current iteration."""
        self.isRunning = False

    def client_connected(self):
        """Return True once at least one frame has been received."""
        return self.connected

    def get_jpeg(self):
        """Return the most recent JPEG frame as bytes."""
        return self.jpeg.tobytes()
I understand that copy-pasting entire .py files is probably not the best way to post an answer here, but this is a complex question with a lot of moving parts and I honestly could not think of a better way to help the OP.

Splash (+scrapy) does not render web page correctly

I'm using Scrapy + Splash and I have problems downloading this page: http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText=
It seems that Splash cannot execute the javascript correctly.
Here is a stripped-down, working, self-contained version of my program (sorry if it is not stripped down as far as it could be):
# -*- coding: utf-8 -*-
import base64
import io
import os
import sys

import scrapy
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
from scrapy_splash import SplashRequest
def saveFile(ss, fileNameExt, folderName):
    """Write the text ``ss`` to ``folderName/fileNameExt``.

    Returns ``fileNameExt`` unchanged.
    """
    # The context manager closes the file even if the write raises.
    with open(folderName + '/' + fileNameExt, 'w') as f:
        f.write(ss)
    return fileNameExt
def savePng(png_bytes, fileNameExt, folderName):
    """Write the bytes ``png_bytes`` to ``folderName/fileNameExt``.

    Returns ``fileNameExt`` unchanged.
    """
    # Binary mode; the context manager closes the file even on error.
    with open(folderName + '/' + fileNameExt, 'wb') as f:
        f.write(png_bytes)
    return fileNameExt
def savePageOriginalInFolder(response, folderName, chiave='pag1'):
    """Save ``response.data[chiave]`` as ``site.html`` inside ``folderName``.

    Returns the file name reported by saveFile.
    """
    fileName = "site.html"
    testo = response.data[chiave].decode('utf8')
    return saveFile(testo, fileName, folderName)


def savePagePng(response, folderName, pngDataName):
    """Decode the base64 PNG under ``pngDataName`` and save it as ``site.png``.

    Returns the file name reported by savePng, or None when ``response`` has
    no ``data`` attribute.
    """
    fileName = 'site.png'
    if hasattr(response, 'data'):
        png_bytes = base64.b64decode(response.data[pngDataName])
        return savePng(png_bytes, fileName, folderName)
class GenericoSpider(scrapy.Spider):
    """Spider that renders one page through Splash and saves HTML and PNG."""

    name = 'provaAbb'

    def asSplashRequest(self, url, callback, id_elenco="no_id", id_sessione="no_id_sessione"):
        """Build a SplashRequest that runs ``self.script`` on ``url``."""
        return SplashRequest(
            url=url,
            endpoint='execute',
            args={'lua_source': self.script, 'id_elenco': id_elenco, 'id_sessione': id_sessione},
            callback=callback,
        )

    outDir = name  # take the output folder name from the spider's name
    db_name = ""

    def start_requests(self):
        """Request the job-list page through Splash."""
        sito = 'http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='
        yield self.asSplashRequest(sito, self.parse_list, 'id_mio_elenco')

    # Lua source kept exactly as written: loads the page, waits 10 s, then
    # returns the HTML, a PNG screenshot, the cookies and the request args.
    script = """
function main(splash)
local url = splash.args.url
splash:set_viewport_size(1280, 2500)
splash:init_cookies(splash.args.cookies)
assert(splash:go(url))
assert(splash:wait(10))
return {
url = splash:url(),
pag1 = splash:html(),
png1 = splash:png(),
id_elenco = splash.args.id_elenco,
id_sessione = splash.args.id_sessione,
cookies = splash:get_cookies(),
tt = splash.args
}
end
"""

    def parse_list(self, response):
        """Save every ``pag*`` entry as HTML and every ``png*`` entry as PNG."""
        for ss in response.data:
            # Only keys of at least four characters are considered.
            if len(ss) < 4:
                continue
            prefix = ss[0:3]
            if prefix == 'pag':
                savePageOriginalInFolder(response, self.outDir, ss)
            elif prefix == 'png':
                savePagePng(response, self.outDir, ss)
A part of the settings.py
# Downloader middlewares with their order values.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Spider middlewares with their order values.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Splash-aware duplicate filter and HTTP cache storage classes.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Result: as you can see, the spinner is still shown in the list area and the page numbers are not loaded. (Increasing the wait time in the Lua script did not solve the problem.)

how to get the final result of html page by pyqt?

Recently I have been trying to crawl data from Google search results, and PyQt seems to be a good module for executing the JavaScript in a page and getting the final HTML. It seems to work for other websites, but for Google search it always fails. I followed the example here:
http://webscraping.com/blog/Scraping-JavaScript-webpages-with-webkit/
The code is:
import sys
import time
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Load a URL in a QWebPage and keep its main frame once loading ends.

    Constructing an instance blocks inside the Qt event loop until
    ``loadFinished`` fires.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        """Remember the main frame and leave the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
url1 = 'http://www.google.com/search?start=0&client=firefox-a&q=adidas&safe=off&pws=0&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2002%2Ccd_max%3A1%2F1%2F2001&filter=0&num=10&access=a&oe=UTF-8&ie=UTF-8'
url2 = 'http://www.google.com/search?start=0&client=firefox-a&q=adidas&safe=off&pws=0&tbs=cdr%3A1%2Ccd_min%3A1%2F1%2F2009%2Ccd_max%3A7%2F1%2F2009&filter=0&num=10&access=a&oe=UTF-8&ie=UTF-8'
# Render the first URL and dump the resulting HTML to page.html.
r = Render(url1)
html = r.frame.toHtml()
print(type(html))
# The context manager closes the file even if the write raises.
with open('page.html', 'w') as outfile:
    outfile.write(html.toUtf8())
print('finished!')
However, the result of url1 and url2 always get the same result, and the result is just the same when I disable the javascript in chrome. So how should we deal with it? How can we fetch the final html of Google search?

import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Load a URL in a QWebPage and keep its main frame once loading ends.

    Constructing an instance blocks inside the Qt event loop until
    ``loadFinished`` fires.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        """Remember the main frame and leave the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render the example page and read the HTML of its main frame.
url = 'http://webscraping.com'
r = Render(url)
html = r.frame.toHtml()
Source: http://webscraping.com/blog/Scraping-JavaScript-webpages-with-webkit/

Develop Reference

JavaScript is the programming language of the Web.

Scraping a website with content added via javascript with PySide in Python - javascript

Related

How do I add a js map to Pyqt5?

Scrapy only scraping the first two pages

Video is freezed while video streaming

Splash (+scrapy) does not render web page correctly

how to get the final result of html page by pyqt?

Categories

Resources