Link sniffing - HTTP GET response codes wrong - JavaScript

I have the task of going through around 200k links and checking the status code of their responses. Anything other than 2xx means a problem, and that link has to be checked manually (added to a DB later).
The links come from a DB and are a mix of http and https; some of them are not valid (e.g. ww.fyxs.d). I get them as JSON, something like this:
{
"id": xxx,
"url": xxxx
}
I went with a really simple solution which unfortunately doesn't work.
I take the links from a JSON file and, starting from the back, send an http/https.get request, wait for the response, check and process the status code, then move on to the next link after removing the previous one from the list to preserve memory. The problem is that I keep getting 4xx almost all the time, while a GET from a REST client returns 200 OK.
I only need the status code and have no interest in the body, hence the HEAD method. I also tried method: 'GET' - still wrong status codes - and http/https.request - with that I don't even get a response.
Here is my code:
var https = require('https');
var http = require('http');
var urlMod = require('url');

var links = require('./links_to_check.json').links_to_check;

var callsRemaining = links.length;
var current = links.length - 1;

startQueue();

function startQueue(){
    getCode(links[current].url);
    current--;
}

function getCode(url){
    var urlObj = urlMod.parse(url);
    var options = {
        method: 'HEAD',
        hostName: urlObj.host,
        path: urlObj.path
    };
    var httpsIndex = url.indexOf('https');
    if(httpsIndex > -1 && httpsIndex < 5){
        https.get(options, function(response){
            proccessResponse(response.statusCode);
        }).on('error', (e) => {
            startQueue();
        });
    } else {
        if(url.indexOf('http:') < 0) return;
        http.get(options, function(response){
            proccessResponse(response.statusCode);
        }).on('error', (e) => {
            startQueue();
        });
    }
}

function proccessResponse(responseCode){
    console.log("response => " + responseCode);
    if(responseCode != 200){
        errorCount++;
    }
    ResponseReady();
}

function ResponseReady(){
    --callsRemaining;
    if(callsRemaining <= 0){
        //Proccess error when done
    }
    links.pop();
    startQueue();
}
I would really appreciate some help - once this works I will publish it as a module, so anyone who needs to check a set of links can just use it :)
After that I was thinking of using async.map and splitting the links into chunks so the analysis runs in parallel and finishes faster. The current process, written in shell, takes around 36 hours.
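Roughly, the parallel version I have in mind would be something like this - an untested sketch assuming the async module's eachLimit and the same links_to_check.json format:

// Untested sketch: check each link with a HEAD request, at most 20 in flight.
var async = require('async');
var http = require('http');
var https = require('https');
var urlMod = require('url');

var links = require('./links_to_check.json').links_to_check;
var failed = []; // links that need a manual check

function checkLink(link, done) {
    var urlObj = urlMod.parse(link.url);
    if (!urlObj.protocol || !urlObj.hostname) {
        failed.push({ id: link.id, status: 'invalid url' });
        return done();
    }
    var lib = urlObj.protocol === 'https:' ? https : http;
    var req = lib.request({
        method: 'HEAD',
        hostname: urlObj.hostname, // note: the option key is all lower-case
        path: urlObj.path
    }, function (res) {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            failed.push({ id: link.id, status: res.statusCode });
        }
        res.resume(); // discard whatever body there is
        done();
    });
    req.on('error', function () {
        failed.push({ id: link.id, status: 'unreachable' });
        done();
    });
    req.end(); // without end() the request is never sent
}

async.eachLimit(links, 20, checkLink, function () {
    console.log('Checked ' + links.length + ' links, ' + failed.length + ' need a manual check');
});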

Related

Polling with Flask and AJAX: Why is the server response not showing up in HTML?

I'm making a web application with Flask. The user submits a form which is passed to the server. The value is handed to a Python script I wrote that checks whether a username is taken on a number of web forums.
As the results come in they are passed back to the page with AJAX, and I want to update the webpage as the data arrives. I understand that websockets would probably be more efficient, but this is just for practice and I want to learn how to do it with polling.
main.py:
@app.route('/search', methods=['GET', 'POST'])
def search_form():
    #print(request.form)
    x = request.form['id']
    a = Vbulletin(x)

    def result_gen():
        return a.reg_ver()
    result_gen()

    def generate_resp():
        with app.app_context():
            for text in result_gen():
                print(text)
                text2 = json.dumps(text)
                #print(text2)
                yield (text2)
                sleep(1)
    return app.response_class(generate_resp(), mimetype='application/json')

app.run()
forumsearch.js:
$(document).ready(function(){
    $("#submit").on('click', function(e){
        e.preventDefault();
        req = $.ajax({
            type: "POST",
            url: "/search",
            data: { id: $("#searchinput").val() },
        });
        req.done(function(temp){
            var latest = document.getElementById('latest');
            var output = document.getElementById('output');

            var xhr = new XMLHttpRequest();
            xhr.open('GET', '/search');
            xhr.send();

            var position = 0;

            function handleNewData() {
                var messages = xhr.responseText.split('\n');
                messages.slice(position, -1).forEach(function(value) {
                    latest.textContent = value;
                    var item = document.createElement('li');
                    item.textContent = value;
                    output.appendChild(item);
                });
                position = messages.length - 1;
            }

            var timer;
            timer = setInterval(function() {
                handleNewData();
                if (xhr.readyState == XMLHttpRequest.DONE) {
                    clearInterval(timer);
                    latest.textContent = 'Done';
                }
            }, 1000);
        });
    });
});
The issue I'm having is that the results from the POST request show up in the browser when I look at the response in the network tab, but the webpage itself is not updated. There are a couple of things I can see that could be the issue, but I'm unsure how to proceed.
The first is that the XHR 'GET' request is never sent. The 'POST' request is sent and its response reaches the client, but the 'GET' request after it is never sent.
Maybe I'm not understanding polling, but don't I need to make a GET request to poll the server? Or do I just need to handle the POST request?
Another problem I see is that the response I'm getting from the 'POST' request produces a syntax error.
The response is this:
"https://www.bimmerboost.com/register.php , user found!!! I SAID USER FOUND!!!""http://www.vbforums.com/register.php , user found!!! I SAID USER FOUND!!!"
and the error in Firefox is this:
SyntaxError: JSON.parse: unexpected non-whitespace character after
JSON data at line 1 column 80 of the JSON data
How can I fix this error, and what do I need to change to get the server response to show up in html?
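For reference, here is an untested sketch of what I understand polling the POST response directly would look like - a raw XMLHttpRequest instead of the second GET. It assumes the Flask generator yields newline-terminated chunks (e.g. yield text2 + '\n') so the split('\n') logic has delimiters to work with:

// Untested sketch: poll the POST response itself instead of opening a second GET.
$(document).ready(function () {
    $("#submit").on('click', function (e) {
        e.preventDefault();

        var latest = document.getElementById('latest');
        var output = document.getElementById('output');

        var xhr = new XMLHttpRequest();
        xhr.open('POST', '/search');
        xhr.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
        xhr.send('id=' + encodeURIComponent($("#searchinput").val()));

        var position = 0;
        var timer = setInterval(function () {
            // responseText grows while the streamed response is still arriving
            var messages = xhr.responseText.split('\n');
            messages.slice(position, -1).forEach(function (value) {
                latest.textContent = value;
                var item = document.createElement('li');
                item.textContent = value;
                output.appendChild(item);
            });
            position = messages.length - 1;

            if (xhr.readyState === XMLHttpRequest.DONE) {
                clearInterval(timer);
                latest.textContent = 'Done';
            }
        }, 1000);
    });
});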

Using XMLHttpRequest() PUT with GAE Python

I am currently trying to write some JavaScript to interact with an API that I deployed on GAE (using Python), using XMLHttpRequest(). GET requests work with no issue, but the PUT is giving me a lot of trouble.
Interestingly, the PUT request works fine from a test HTTP site (https://www.hurl.it/), yet I receive a status value of 0 every time I try it from my own JavaScript code. Below are snippets of my GAE and JavaScript code.
(NOTE - using PUT for this call is a requirement.)
Any guidance would be appreciated!
GAE (Server):
def put(self):
    # Save variables for update
    cardkey = self.request.get('key', default_value=None)
    ident = self.request.get('ident', default_value=None)
    brand = self.request.get('brand', default_value=None)
    year = self.request.get('year', default_value=None)
    player = self.request.get('player', default_value=None)

    # If card key is provided then update card
    if cardkey:
        # Get card
        card_to_update = ndb.Key(db_models.Card, int(cardkey)).get()
        if ident:
            card_to_update.ident = ident
        if brand:
            card_to_update.brand = brand
        if year:
            card_to_update.year = year
        if player:
            card_to_update.player = player

        # Save changes and print update to requester
        card_to_update.put()
        card_dict_format = card_to_update.to_dict()
        self.response.write(json.dumps(card_dict_format))
        return
    # If card key is not provided send error
    else:
        self.response.write('key not provided. must provide key for update.')
        return
And the Javascript from my webpage:
<script>
window.onload = function()
{
    var myRequest = new XMLHttpRequest();
    var url = 'http://cs496-assignment3-mastrokn.appspot.com/updatecard';
    var param = 'key=5636318331666432';
    myRequest.open('put', url);
    myRequest.onreadystatechange = function()
    {
        if ((myRequest.readyState == 4) && (myRequest.status == 200))
        {
            // var myArr = JSON.parse(myRequst.responseText);
            // myFunction(myArr);
            document.getElementById("viewCards").innerHTML = myRequest.status;
        }
        else
        {
            document.getElementById("viewCards").innerHTML = myRequest.status;
        }
    }
    myRequest.send(param);
}
</script>
First, your onreadystatechange() handler should look like this:
myRequest.onreadystatechange = function()
{
    if (myRequest.readyState == 4) // Don't do anything until readyState == 4
    {
        if (myRequest.status == 200) // Check for status == 200
        {
            document.getElementById("viewCards").innerHTML = myRequest.status;
        }
        else // All other status codes
        {
            document.getElementById("viewCards").innerHTML =
                'readyState=' + myRequest.readyState
                + ' status=' + myRequest.status
                + ' status text=' + myRequest.statusText;
        }
    }
}
Then, from the docs:
If you end up with an XMLHttpRequest having status=0 and
statusText=null, it means that the request was not allowed to be
performed. It was UNSENT.
To see what went wrong, check the javascript console in your browser for an error, e.g.:
[Error] XMLHttpRequest cannot load
http://cs496-assignment3-mastrokn.appspot.com/updatecard. Origin
http://localhost:4567 is not allowed by Access-Control-Allow-Origin.
(4.htm, line 0)
When I run the code above and send the XMLHttpRequest to my own local server, the PUT request succeeds with a status code of 200.
Lastly, I have doubts about the server code you posted because I don't know of any framework where you return None from a request handler--rather you return some string or a response object. Yet, using other means to make a PUT request to your url returns a 200 status code. Is that really your server code? What framework are you using?

PhantomJS 2.0.0 - Select: Invalid argument error

I wrote a PhantomJS program (source below) to send some requests and measure the time taken, in order to compare the performance of 2.0.0 and 1.9.8. I get the links from the sitemap.xml files of the sites I hardcode in the "links" array.
The script starts with a few URLs in the "links" array. The function gatherLinks() gathers more URLs from the sitemap.xml of the URLs already in "links". Once the array has enough URLs (decided by the variable "limit"), request() is called for each URL to send a request and fetch the response. The time taken for each response is reported, and the total time taken by the program is reported when it ends.
When run with PhantomJS 2.0.0, after some 65 requests the program (the page.open() call in the request function) starts outputting the following:
select: Invalid argument
select: Invalid argument
select: Invalid argument
select: Invalid argument
select: Invalid argument
.
.
.
.
When run using PhantomJS 1.9.8, it crashes after about 200 requests with the following error.
"PhantomJS has crashed. Please read the crash reporting guide at https://github.com/ariya/phantomjs/wiki/Crash-Reporting and file a bug report at https://github.com/ariya/phantomjs/issues/new with the crash dump file attached: /tmp/2A011800-3367-4B4A-A945-3B532B4D9B0F.dmp"
I tried to send the crash report but their guide is not very useful for me.
It's not the specific URLs I use; I have tried other URLs with the same results.
Is there something wrong with my program? I am using OSX.
var system = require('system');
var fs = require('fs');

var links = [];
links = [
    "http://somesite.com",
    "http://someothersite.com",
    .
    .
    .
];

var index = 0, fail = 0, limit = 300;
finalTime = Date.now();

var gatherLinks = function(link){
    var page = require('webpage').create();
    link = link + "/sitemap.xml";
    console.log("Fetching links from " + link);

    page.open(link, function(status){
        if(status != "success"){
            console.log("Sitemap Request FAILED, status: " + status);
            fail++;
            return;
        }

        var content = page.content;
        parser = new DOMParser();
        xmlDoc = parser.parseFromString(content, 'text/xml');
        var loc = xmlDoc.getElementsByTagName('loc');

        for(var i = 0; i < loc.length; i++){
            if(links.length < limit){
                links[links.length] = loc[i].textContent;
            } else {
                console.log(links.length + " Links prepared. Starting requests.\n");
                index = 0;
                request();
                return;
            }
        }

        if(index >= links.length){
            index = 0;
            console.log(links.length + " Links prepared\n\n");
            request();
        }
        gatherLinks(links[index++]);
    });
};

var request = function(){
    t = Date.now();
    var page = require('webpage').create();
    page.open(links[index], function(status) {
        console.log('Loading link #' + (index + 1) + ': ' + links[index]);
        console.log("Time taken: " + (Date.now() - t) + " msecs");

        if(status != "success"){
            console.log("Request FAILED, status: " + status);
            fail++;
        }

        if(index >= links.length-1){
            console.log("\n\nAll links done, final time taken: " + (Date.now() - finalTime) + " msecs");
            console.log("Requests sent: " + links.length + ", Failures: " + fail);
            console.log("Success ratio: " + ((links.length - fail)/links.length)*100 + "%");
            phantom.exit();
        }

        index++;
        request();
    });
}

gatherLinks(links[0]);
After playing around with the program, I couldn't find any particular pattern to the problems described above. For 2.0.0, I only once succeeded in sending 300 requests without an error; I have tried all different combinations of URLs, and the program usually fails somewhere between request 50 and 80. I keep a log of the URLs that failed, and all of them load fine when I send a single request from another PhantomJS program. 1.9.8 is much more stable and the crash described above is not very frequent, but again I couldn't find any pattern to the crashing; it still crashes once in a while.
There are lots of problems with your code. The main one is probably that you're creating a new page for every single request and never closing it afterwards. I think you're running out of memory.
I don't see a reason to create a new page for every request, so you can easily reuse a single page for all requests. Simply move the line var page = require('webpage').create(); to the global scope out of gatherLinks() and request(). If you don't want to do that, then you can call page.close() after you're done with it, but keep the asynchronous nature of PhantomJS in mind.
If the reason to use multiple page objects was to prevent cache re-use for later requests, then I have to tell you that this doesn't solve that problem. page objects in a single PhantomJS process can be regarded as tabs or windows and they share cookies and cache. If you want to isolate every request, then you will need to run every request in its own process for example through the use of the Child Process Module.
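For illustration, here is a trimmed sketch of request() with page.close() added (the timing and reporting details from the question are omitted):

// Sketch: same request() as in the question, but each page is closed when done.
var request = function () {
    var t = Date.now();
    var page = require('webpage').create();
    page.open(links[index], function (status) {
        console.log('Loading link #' + (index + 1) + ': ' + links[index]);
        console.log("Time taken: " + (Date.now() - t) + " msecs");
        page.close(); // release the page before moving on

        if (index >= links.length - 1) {
            console.log("All links done");
            phantom.exit();
        }
        index++;
        request();
    });
};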
There is another problem with your code. You probably wanted to write the following in gatherLinks():
if(index >= links.length){
    index = 0;
    console.log(links.length + " Links prepared\n\n");
    request();
    return; // ##### THIS #####
}
gatherLinks(links[index++]);

nodejs: node-http-proxy and harmon: rewriting the HTML response from the endpoint instead of the 302 redirect response

I'm using nodejs with node-http-proxy along with harmon. I am using harmon to rewrite the proxied response to include a javascript file and a css file. When I set the target of the proxy to be http://nodejs.org or anything other than localhost, I receive a 301 or 302 redirect. The script is rewriting the 301 response instead of the fully proxied response. How can I use harmon to rewrite the end response instead of the 302 response?
Here is the example of the script I am running from the harmon example folder:
var http = require('http');
var connect = require('connect');
var httpProxy = require('http-proxy');

var selects = [];
var simpleselect = {};

//<img id="logo" src="/images/logo.svg" alt="node.js">
simpleselect.query = 'img';
simpleselect.func = function (node) {
    //Create a read/write stream with the outer option
    //so we get the full tag and we can replace it
    var stm = node.createStream({ "outer" : true });

    //variable to hold all the info from the data events
    var tag = '';

    //collect all the data in the stream
    stm.on('data', function(data) {
        tag += data;
    });

    //When the read side of the stream has ended..
    stm.on('end', function() {
        //Print out the tag; you can also parse it or regex if you want
        process.stdout.write('tag: ' + tag + '\n');
        process.stdout.write('end: ' + node.name + '\n');

        //Now on the write side of the stream write some data using .end()
        //N.B. if end isn't called it will just hang.
        stm.end('<img id="logo" src="http://i.imgur.com/LKShxfc.gif" alt="node.js">');
    });
}
selects.push(simpleselect);

//
// Basic Connect App
//
var app = connect();

var proxy = httpProxy.createProxyServer({
    target: 'http://nodejs.org'
})

app.use(require('../')([], selects, true));
app.use(
    function (req, res) {
        proxy.web(req, res);
    }
);
The problem is that a lot of sites are now redirecting HTTP to HTTPS.
nodejs.org is one of those.
I have updated the sample https://github.com/No9/harmon/blob/master/examples/doge.js to show how the http-proxy needs to be configured to deal with HTTPS.
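The gist of that change (a sketch of the idea, not the exact contents of doge.js) is to target the HTTPS origin directly so harmon rewrites the page itself rather than the redirect:

// Sketch only: proxy the HTTPS origin directly instead of the HTTP address
// that answers with a 301/302.
var proxy = httpProxy.createProxyServer({
    target: 'https://nodejs.org',
    changeOrigin: true // send the target's host name in the Host header
});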
If you still have problems with other arbitrary redirects please log an issue on harmon.
Thanks

How to stop CasperJS execution and let the user input some value and then continue to execute?

I'm using PhantomJS and CasperJS to automate some of my tasks. In one of the tasks, I need to manually provide CAPTCHA strings before I can actually work on the task. What I can think of is to capture a screenshot of the web page, manually check the captured image and save the CAPTCHA string into a text file, and then use the file system module in CasperJS to read that value and continue the process. I want to know the best way to do this kind of task.
Because of the structured manner/control flow of CasperJS compared to PhantomJS, such a task is not easy.
1. Pull approach (file polling)
Let's say there is a secondary program (type 1) which handles showing the CAPTCHA, receiving the input and writing a text file with the CAPTCHA input. All that CasperJS can handle is to write the CAPTCHA screenshot to disk and wait for the file with the "parsed" text.
var fs = require("fs"),
    captchaFile = "cfile.png",
    parsedFile = "pfile.txt";

casper.waitForCaptcha = function(captchaFile, parsedFile){
    casper.then(function(){
        this.captureSelector(captchaFile, "someSelectorOfTheCaptcha");
    });
    casper.waitFor(function check(){
        return fs.exists(parsedFile);
    }, function then(){
        // do something on time
        // check if correct...
        if (!correct) {
            fs.remove(captchaFile);
            fs.remove(parsedFile);
            this.waitForCaptcha(captchaFile, parsedFile);
            // Problem: the secondary process needs to sense that a new CAPTCHA is presented
        }
    }, function onTimeout(){
        // do something when failed
    }, 60000); // 1min should suffice as a timeout
    return this;
};

casper.start(url).waitForCaptcha(captchaFile, parsedFile).run();
This code assumes that you want to retry when the CAPTCHA was wrong, but not if the minute deliberately passed without the decoded file. This is a pull process by polling if files are already there.
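For completeness, here is a hypothetical sketch of what such a secondary program (type 1) could look like as a plain Node.js script; the file names match the snippet above, everything else is an assumption:

// Hypothetical "type 1" helper (runs outside CasperJS): waits for a new CAPTCHA
// screenshot, asks the user for the text, and writes the parsed file.
var fs = require('fs');
var readline = require('readline');

var captchaFile = 'cfile.png';
var parsedFile = 'pfile.txt';

fs.watchFile(captchaFile, function (curr, prev) {
    if (curr.mtime <= prev.mtime) { return; } // no new screenshot yet
    var rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    rl.question('Open ' + captchaFile + ' and type the CAPTCHA: ', function (answer) {
        fs.writeFileSync(parsedFile, answer.trim());
        rl.close();
    });
});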
2. Push approach
A push process is also possible where the secondary program (type 2) sends requests to the CasperJS process by utilizing the PhantomJS webserver module. Because there will be two concurrent control flows, the CasperJS part needs to wait a long time, but as soon as a request is received with the decoded words the waiting can be broken with unwait.
var server = require('webserver').create(),
    fs = require("fs"),
    captchaFile = "cfile.png";

function neverendingWait(){
    this.wait(5000, neverendingWait);
}

casper.checkCaptcha = function(captchaFile, phantomPort, secondaryPort){
    // here the CAPTCHA is saved to disk but it can also be set directly if captured through casper.captureBase64
    this.captureSelector(captchaFile, "someSelectorOfTheCaptcha");

    // send request to the secondary program from the page context
    // (the port is passed into the page context explicitly)
    this.evaluate(function(file, port){
        __utils__.sendAJAX("http://localhost:" + port + "/", "POST", {file: file}, true);
    }, captchaFile, secondaryPort);

    // start the server to receive solved CAPTCHAs
    server.listen(phantomPort, {
        'keepAlive': true
    }, function (request, response) {
        console.log('Request received at ' + new Date());

        if (request.post) { // is there a response?
            this.then(function(){
                // check if it is correct by reading request.post ...
                if (!correct){
                    response.statusCode = 404;
                    response.headers = {
                        'Cache': 'no-cache',
                        'Content-Type': 'text/plain;charset=utf-8'
                    };
                    response.close();
                    server.close();
                    this.checkCaptcha(captchaFile, phantomPort, secondaryPort);
                } else {
                    response.statusCode = 200;
                    response.headers = {
                        'Cache': 'no-cache',
                        'Content-Type': 'text/plain;charset=utf-8'
                    };
                    response.close();
                    server.close();
                    this.unwait(); // abort the neverendingWait
                }
            });
        } else {
            response.statusCode = 404;
            response.headers = {
                'Cache': 'no-cache',
                'Content-Type': 'text/plain;charset=utf-8'
            };
            response.close();
            server.close();
            this.checkCaptcha(captchaFile, phantomPort, secondaryPort);
        }
    });
    return this;
};

casper.start(url).then(function(){
    this.checkCaptcha(captchaFile, 8080, 8081);
}).then(neverendingWait).then(function(){
    // Do something here when the captcha is successful
}).run();
