Splash (+scrapy) does not render web page correctly - javascript

Im' using Scrapy + Splash, I have problems downloading this page: http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText=
It seems that Splash cannot execute the javascript correctly.
Here is a stripped down, working, self contanied, version of my program (sorry if not stripped down at best)
# -*- coding: utf-8 -*- import scrapy from scrapy_splash import SplashRequest from scrapy.selector import Selector from scrapy.http import HtmlResponse import sys import io import os import base64
def saveFile(ss, fileNameExt, folderName):
f = open(folderName + '/' + fileNameExt, 'w')
f.write(ss)
f.close()
return fileNameExt
def savePng(png_bytes, fileNameExt, folderName):
f = open( folderName +'/' + fileNameExt, 'wb')
f.write(png_bytes)
f.close()
return fileNameExt
def savePageOriginalInFolder(response, folderName, chiave='pag1'):
fileName = "site.html"
testo = response.data[chiave].decode('utf8')
return saveFile(testo, fileName, folderName) def savePagePng(response, folderName, pngDataName):
fileName = 'site.png'
if hasattr(response, 'data'):
png_bytes = base64.b64decode(response.data[pngDataName])
return savePng(png_bytes, fileName, folderName)
class GenericoSpider(scrapy.Spider):
name = 'provaAbb'
def asSplashRequest(self, url, callback, id_elenco="no_id", id_sessione="no_id_sessione"):
return SplashRequest(
url = url,
endpoint='execute',
args={'lua_source': self.script, 'id_elenco': id_elenco, 'id_sessione': id_sessione},
callback=callback,
)
outDir = name # prendo in nome della cartella dal nome dello spider
db_name = ""
def start_requests(self):
sito = 'http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='
yield self.asSplashRequest(sito, self.parse_list, 'id_mio_elenco')
script = """
function main(splash)
local url = splash.args.url
splash:set_viewport_size(1280, 2500)
splash:init_cookies(splash.args.cookies)
assert(splash:go(url))
assert(splash:wait(10))
return {
url = splash:url(),
pag1 = splash:html(),
png1 = splash:png(),
id_elenco = splash.args.id_elenco,
id_sessione = splash.args.id_sessione,
cookies = splash:get_cookies(),
tt = splash.args
}
end
"""
def parse_list(self, response):
for ss in response.data:
if len(ss) >= 4:
if ss[0:3] == 'pag':
fileName = savePageOriginalInFolder(response, self.outDir, ss)
elif ss[0:3] == 'png':
fileName = savePagePng(response, self.outDir,ss)
A part of the settings.py
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, }
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, }
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Result, as you can see there is the spinner in the list area and page numbers are not loaded. (augmenting wait time in lua did not solve the problem)

Related

Ajax file download sets filename to random string of characters

I am trying to download a file using ajax. The download works correctly, however, the downloaded filename is set to a random string of characters. I don't think this is relevant since the backend is working, but I'm using django
js/html:
<script>
function downloadFile(){
var filename = 'data.txt';
$.ajax({
url: 'downloadFile',
data: {'filename': filename},
success: function(blob, status, xhr){
var disposition = xhr.getResponseHeader('Content-Disposition');
if (disposition && disposition.indexOf('attachment') != -1){
var filenameRegex = /filename[^;=\n]*=((['"]).*?\2|[^;\n]*)/;
var matches = filenameRegex.exec(disposition);
if (matches != null && matches[1]){
filename = matches[1].replace(/['"]/g,'');
}
}
var bd = [];
bd.push(blob);
var typename = "application/" + filename;
var downloadURL = window.URL.createObjectURL(new Blob(bd, {type: typename}));
var a = document.createElement("a");
a.href = downloadURL;
document.body.append(a);
a.click();
}
});
}
</script>
...
<button id="downloadFile" type="button" onclick="downloadFile()"><i class="fa fa-download"></i></button>
...
django views.py:
import pathlib
import os
def downloadFile(request):
fname = request.GET.get('filename')
fpath = os.path.join(<local_filesystem_path>, fname)
# code to generate file here
if pathlib.Path(fpath).exists():
file_download = open(fpath, 'rb')
response = HttpResponse(file_download, content_type='application/{}'.format(fname))
response['Content-Disposition'] = 'attachment; filename="{}"'.format(fname)
return response
and urls.py:
from django.urls import path
from . import views
urlpatterns = [
path('', views.index, name='index'),
path('downloadFile', views.downloadFile, name='downloadFile')
]
When I click the download button, everything works correctly except that my file has been renamed to a random string of 8 characters. Each time I re-download it, the string changes, but it's always 8 characters long. I'm guessing that something is happening where my browser thinks that the filename is unset so it's assigning a random string. But I'd like to set it to something I've specified - how can I do this?
Try adding a download attribute to the the <a> tag that you create. It allows you to set the file name.
https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#attr-download

How do I add a js map to Pyqt5?

I want to add the map provided by Marinetraffic to pyqt5. When I add the HTML codes provided by MarineTraffic to my own program, it doesn't work.
The map I want to add:
MarineTraffic Map JS
from PyQt5 import QtCore, QtGui, QtWidgets, QtWebEngineWidgets, QtWebChannel
class Backend(QtCore.QObject):
valueChanged = QtCore.pyqtSignal(str)
def __init__(self, parent=None):
super().__init__(parent)
self._value = ""
#QtCore.pyqtProperty(str)
def value(self):
return self._value
#value.setter
def value(self, v):
self._value = v
self.valueChanged.emit(v)
class Widget(QtWidgets.QWidget):
def __init__(self, parent=None):
super().__init__(parent)
self.webEngineView = QtWebEngineWidgets.QWebEngineView()
self.label = QtWidgets.QLabel(alignment=QtCore.Qt.AlignCenter)
lay = QtWidgets.QVBoxLayout(self)
lay.addWidget(self.webEngineView, stretch=1)
lay.addWidget(self.label, stretch=1)
backend = Backend(self)
backend.valueChanged.connect(self.label.setText)
backend.valueChanged.connect(self.foo_function)
self.channel = QtWebChannel.QWebChannel()
self.channel.registerObject("backend", backend)
self.webEngineView.page().setWebChannel(self.channel)
path = "index.html"
self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(path))
#QtCore.pyqtSlot(str)
def foo_function(self, value):
print(value)
if __name__ == "__main__":
import sys
app = QtWidgets.QApplication(sys.argv)
w = Widget()
w.show()
sys.exit(app.exec_())
When I run it, I get a connection failed error.
As a result of my searches, I get the same error in all the methods I tried, where am I doing wrong, can you help?
Read the documentation for QtCore.QUrl.fromLocalFile :
QtCore.QUrl.fromLocalFile
"A file URL with a relative path only makes sense if there is a base URL to resolve it against."
So we add the base path:
import os
...
path = os.getcwd() + "\\index.html"
self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(path))
Added path compatibility between os (edited)
from pathlib import Path
...
base_path = Path(Path.cwd())
full_path = base_path.joinpath('index.html')
self.webEngineView.setUrl(QtCore.QUrl.fromLocalFile(str(full_path)))

How to serve a Python pickle file from a HTML page

I am trying to allow for downloading a Python pickle file from a Flask app through
import pickle
from flask import Flask, render_template_string
app = Flask(__name__)
template = """
<button onclick="download_file()" data-trigger-update-context="false">Download</button>
<script>
function download_file() {
mime_type = '{{ mime_type }}';
var blob = new Blob(['{{ file_content }}'], { type: mime_type });
var dlink = document.createElement('a');
dlink.download = 'pickle.pkl';
dlink.href = window.URL.createObjectURL(blob);
dlink.onclick = function (e) {
// revokeObjectURL needs a delay to work properly.
var that = this;
setTimeout(function () {
window.URL.revokeObjectURL(that.href);
}, 1500);
};
document.body.appendChild(dlink);
dlink.click();
dlink.remove();
}
</script>
"""
#app.route("/")
def download():
return render_template_string(
template,
file_content=pickle.dumps("text"),
mime_type="application/octet-stream",
)
While downloading the file works fine, the downloaded file itself seems corrupted as I get the following error while reading it
Python 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 23:03:20)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.14.0 -- An enhanced Interactive Python. Type '?' for help.
In [1]: import pickle
In [2]: with open("pickle.pkl", "rb") as f:
...: pickle.load(f)
...:
---------------------------------------------------------------------------
UnpicklingError Traceback (most recent call last)
<ipython-input-2-b5282f4164d8> in <module>
1 with open("pickle.pkl", "rb") as f:
----> 2 pickle.load(f)
3
UnpicklingError: unpickling stack underflow
Any hint on the issue with the download script?
Thanks for your help.
Basically, you need to change just two things:
In the download() method, you need to convert the serialized bytes to a list of Integers.
Then, you need to change the JavaScript code to read this list of numbers.
So, your code should look like this:
import pickle
from flask import Flask, render_template_string
app = Flask(__name__)
template = """
<button onclick="download_file()" data-trigger-update-context="false">Download</button>
<script>
function download_file() {
let bytes_array = new Uint8Array({{file_content}}); //<--- add this
mime_type = '{{ mime_type }}';
var blob = new Blob([bytes_array], { type: mime_type }); //<-- change this
var dlink = document.createElement('a');
dlink.download = 'pickle.pkl';
dlink.href = window.URL.createObjectURL(blob);
dlink.onclick = function (e) {
// revokeObjectURL needs a delay to work properly.
var that = this;
setTimeout(function () {
window.URL.revokeObjectURL(that.href);
}, 1500);
};
document.body.appendChild(dlink);
dlink.click();
dlink.remove();
}
</script>
"""
#app.route("/")
def download():
return render_template_string(
template,
file_content=list(pickle.dumps("text")), # change this
mime_type="application/octet-stream",
)
if __name__ == '__main__':
app.run(debug = True)
Now, you can read the pickled file using pickle.load() just like so:
import pickle
with open("pickle.pkl", 'rb') as fin:
print(pickle.load(fin))
# prints: text

Video is freezed while video streaming

I find the following code for streaming video over a socket in python2.7. When I run it, the video will be freeze at the beginning in the server-side (It shows the video in a web browser). I debugged the code and understood that in the streamer.py, the third while loop condition creates an infinite loop because of the condition while len(data) < msg_size: is always satisfied. In other words, len(data) is always less than msg_size.So, the streamer.py does not return the image to the server.py. Could anyone help me to solve this issue?
The server.py is:
from flask import Flask, render_template, Response
from streamer import Streamer
app = Flask(__name__)
def gen():
streamer = Streamer('localhost', 8089)
streamer.start()
while True:
if streamer.client_connected():
yield (b'--frame\r\n'b'Content-Type: image/jpeg\r\n\r\n' +
streamer.get_jpeg() + b'\r\n\r\n')
#app.route('/')
def index():
return render_template('index.html')
#app.route('/video_feed')
def video_feed():
return Response(gen(), mimetype='multipart/x-mixed-replace;
boundary=frame')
if __name__ == '__main__':
app.run(host='localhost', threaded=True)
The streamer.py is:
import threading
import socket
import struct
import StringIO
import json
import numpy
class Streamer (threading.Thread):
def __init__(self, hostname, port):
threading.Thread.__init__(self)
self.hostname = hostname
self.port = port
self.connected = False
self.jpeg = None
def run(self):
self.isRunning = True
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
print 'Socket created'
s.bind((self.hostname, self.port))
print 'Socket bind complete'
data = ""
payload_size = struct.calcsize("L")
s.listen(10)
print 'Socket now listening'
while self.isRunning:
conn, addr = s.accept()
print 'while 1...'
while True:
data = conn.recv(4096)
print 'while 2...'
if data:
packed_msg_size = data[:payload_size]
data = data[payload_size:]
msg_size = struct.unpack("L", packed_msg_size)[0]
while len(data) < msg_size:# the infinite loop is here(my problem)!
data += conn.recv(4096)
print ("lenght of data is " , len(data) )
print ("message size is " , msg_size )
frame_data = data[:msg_size]
#frame_data = data[:len(data)]
memfile = StringIO.StringIO()
memfile.write(json.loads(frame_data).encode('latin-1'))
memfile.seek(0)
frame = numpy.load(memfile)
ret, jpeg = cv2.imencode('.jpg', frame)
self.jpeg = jpeg
self.connected = True
print 'recieving...'
else:
conn.close()
self.connected = False
print 'connected=false...'
break
self.connected = False
def stop(self):
self.isRunning = False
def client_connected(self):
return self.connected
def get_jpeg(self):
return self.jpeg.tobytes()
Client.py is:
import socket
import sys
import pickle
import struct
import StringIO
import json
import time
cap=cv2.VideoCapture(0)
clientsocket=socket.socket(socket.AF_INET,socket.SOCK_STREAM)
clientsocket.connect(('localhost',8089))
while(cap.isOpened()):
ret,frame=cap.read()
memfile = StringIO.StringIO()
np.save(memfile, fravidme)
memfile.seek(0)
data = json.dumps(memfile.read().decode('latin-1'))
clientsocket.sendall(struct.pack("L", len(data))+data)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cap.release()
I want to show the video captured by my laptop's camera on a client machine in the same network. I expect video stream but in the browser, I just watch an image and it does not update continuously.
As I analyzed this code I noticed that the default implementation for sending OpenCV frames over the network was not working. I decided to replace it with ZeroMQ implementation I have used before. You can check out the linked question for a deeper explanation of how the streaming works. I have neatly packaged it into classes, with unit tests and documentation as SmoothStream check it out too.
Coming back to the question, here is the working code.
client.py
import base64
import cv2
import zmq
context = zmq.Context()
footage_socket = context.socket(zmq.PUB)
footage_socket.connect('tcp://localhost:5555')
camera = cv2.VideoCapture(0) # init the camera
while True:
try:
grabbed, frame = camera.read() # grab the current frame
frame = cv2.resize(frame, (640, 480)) # resize the frame
encoded, buffer = cv2.imencode('.jpg', frame)
jpg_as_text = base64.b64encode(buffer)
footage_socket.send(jpg_as_text)
except KeyboardInterrupt:
camera.release()
cv2.destroyAllWindows()
break
server.py
from flask import Flask, render_template, Response
from streamer import Streamer
app = Flask(__name__)
def gen():
streamer = Streamer('*', 5555)
streamer.start()
while True:
if streamer.client_connected():
yield (b'--frame\r\n'b'Content-Type: image/jpeg\r\n\r\n' + streamer.get_jpeg() + b'\r\n\r\n')
#app.route('/')
def index():
return render_template('index.html')
#app.route('/video_feed')
def video_feed():
return Response(gen(), mimetype='multipart/x-mixed-replace; boundary=frame')
if __name__ == '__main__':
app.run(host='localhost', threaded=True)
streamer.py
import base64
import threading
import cv2
import numpy as np
import zmq
class Streamer(threading.Thread):
def __init__(self, hostname, port):
threading.Thread.__init__(self)
self.hostname = hostname
self.port = port
self.connected = False
self.jpeg = None
def run(self):
self.isRunning = True
context = zmq.Context()
footage_socket = context.socket(zmq.SUB)
footage_socket.bind('tcp://{}:{}'.format(self.hostname, self.port))
footage_socket.setsockopt_string(zmq.SUBSCRIBE, np.unicode(''))
while self.isRunning:
frame = footage_socket.recv_string()
img = base64.b64decode(frame)
npimg = np.fromstring(img, dtype=np.uint8)
source = cv2.imdecode(npimg, 1)
ret, jpeg = cv2.imencode('.jpg', source)
self.jpeg = jpeg
self.connected = True
self.connected = False
def stop(self):
self.isRunning = False
def client_connected(self):
return self.connected
def get_jpeg(self):
return self.jpeg.tobytes()
I understand that copy-pasting entire .py files are probably not the best way to post an answer here, but this is a complex question with a lot of moving parts and I honestly could not think of a better way to help the OP.

Internal server error 500 in python

I have checked all cgi related thing, but still I am getting this error.
I tried to run other sample program which get response from .py file. Which works fine, but for this code only it gives :
GET http://localhost/testapp/demos/classifier_demo.py?query= 500 (Internal Server Error)x.ajaxTransport.n.send # jquery-1.10.2.min.js:6x.extend.ajax # jquery-1.10.2.min.js:6call_fun # demo.html:16
jquery-1.10.2.min.js:6 GET http://localhost/testapp/demos/classifier_demo.py?query=EASTERN. 500 (Internal Server Error)x.ajaxTransport.n.send # jquery-1.10.2.min.js:6x.extend.ajax # jquery-1.10.2.min.js:6call_fun # demo.html:16
P.S I have already set the permission 777
I am calling python script using jquery every 5 seconds.
<script>
$( document ).ready(function() {
function call_fun() {
var text = $('textarea#trans').val();
//alert(text)
var data = {"query" : text};
//alert(data);
$.ajax({
url: "classifier_demo.py",
type: "POST",
data: data,
success: function(response) {
console.log(response)
}
})
}
setInterval(call_fun, 5000);
});
</script>
here is the python code:
#!/usr/bin/python
"""
Using the words as features removing stopwords
"""
from sklearn.utils import check_random_state
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.externals import joblib
from sklearn.feature_extraction.text import FeatureHasher
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from time import time
import pprint, pickle
import gearman
import nltk, json, cgi
import re
import os
import sys
"""
This class will train and test the data and will give polarity for various emotions
"""
class SentimentAnalyzer(object):
"""
Init for SentimentAnalyzer
"""
def __init__(self):
self.root_dir = os.getcwd()
self.trainClassifier()
"""
Function to fetch the data from cache
#cache <dict> consist of training data
"""
def fetch_data(self, cache, data_home=None, subset='train', categories=None,
shuffle=True, random_state=42):
if subset in ('train', 'test'):
data = cache[subset]
else:
raise ValueError(
"subset can only be 'train', 'test' or 'all', got '%s'" % subset)
if shuffle:
random_state = check_random_state(random_state)
indices = np.arange(data.target.shape[0])
random_state.shuffle(indices)
data.filenames = data.filenames[indices]
data.target = data.target[indices]
# Use an object array to shuffle: avoids memory copy
data_lst = np.array(data.data, dtype=object)
data_lst = data_lst[indices]
data.data = data_lst.tolist()
return data
"""
For custom tokenizing the text, removed stop words from text
#text <type 'str'> text which needs to get tokenized
#return <type 'str'> tokens
"""
def token_ques(self, text):
things_to_replace = ['?']
#wh_tags = ['WP','WRB','MD','WDT']
things_to_replace += stopwords.words('english')
#wh_word = None
for tok in text.split('\n'):
original_query = tok
query_pos_tags = nltk.pos_tag(word_tokenize(tok))
for word in things_to_replace:
tok = tok.lower()
tok = re.sub("\s"+word+"\s|\s?"+"\?"+"$",' ',tok)
tok = tok.strip(" ")
tok = tok.lstrip(" ")
tok = tok.rstrip(" ")
for word in word_tokenize(tok):
yield word.lower()
"""
Train classifier
"""
def trainClassifier(self):
try:
t1 = time()
start_time = time()
self.hasher = FeatureHasher(input_type='string',non_negative=True)
self.clf = MultinomialNB(alpha=0.001)
self.hasher = FeatureHasher(input_type='string',non_negative=True)
self.clf = MultinomialNB(alpha=0.001)
data_folder = self.root_dir + "/emotions"
train_dataset = load_files(data_folder)
print("Time taken to load the data=>", time()-start_time)
print("data loaded")
cache = dict(train=train_dataset)
self.data_train = self.fetch_data(cache, subset='train')
try:
X_train = pickle.load(open("x_result.pickle", "rb" ) )
y_train = pickle.load(open("y_result.pickle", "rb" ) )
self.clf.fit(X_train, y_train)
except:
print "Updating the classifier"
training_data = []
for text in self.data_train.data:
#text = self.modifyQuery(text.decode('utf-8','ignore'))
text = text.decode('utf-8','ignore')
training_data.append(text)
raw_X = (self.token_ques(text) for text in training_data) #Type of raw_X <type 'generator'>
#X_train = self.vectorizer.fit_transform(raw_X)
X_train = self.hasher.transform(raw_X)
y_train = self.data_train.target
readx = open('x_result.pickle', 'wb')
pickle.dump(X_train, readx)
readx.close()
readY = open('y_result.pickle', 'wb')
pickle.dump(y_train, readY)
readY.close()
self.clf.fit(X_train, y_train)
print("Classifier tained ...")
print("time taken=>", time()-t1)
except Exception:
import traceback
print traceback.format_exc()
"""
Function to test classifier
"""
def testClassifier(self, query):
try:
result = {}
#To replace NE
#query = self.modifyQuery(query)
test_data = [query]
raw_X = (self.token_ques(text) for text in test_data)
X_test = self.hasher.transform(raw_X)
#X_test = self.vectorizer.fit_transform(raw_X)
pred = self.clf.predict(X_test)
print("pred=>", pred)
self.categories = self.data_train.target_names
for doc, category in zip(test_data, pred):
print('%r => %s' % (doc, self.categories[category]))
index = 1
predict_prob = self.clf.predict_proba(X_test)
final_result = []
for doc, category_list in zip(test_data, predict_prob):
# print('\n\n')
category_list = sorted(enumerate(category_list), key=lambda x:x[1], reverse=True)
i = 0
for val in category_list:
if float(val[1]) > float(0.05):
# print('%r => %s => %s' % (doc, self.categories[val[0]], str(val[1])))
result = {}
result[self.categories[val[0]]] = "%0.2f"%(float(val[1]) * 100)+"%"
final_result.append(result)
index += 1
except Exception:
import traceback
print traceback.format_exc()
import json
# print result
# print final_result
return final_result
if __name__ == '__main__':
fs = cgi.FieldStorage()
text = fs['query'].value
#query = fs.getvalue(query)
#query = raw_input("Please enter the text to process:")
query = "Love you man"
result = { "result" : text}
#result = SentimentAnalyzer().testClassifier(query)
json_result = json.dumps( result )
print json_result

Categories

Resources