Intermittent behavior in my AJAX, Greasemonkey script - javascript

I've a small Greasemonkey script that doesn't include any random part, but its results change with each page reload.
I'm a noob and I'm probably doing something wrong, but I don't know what. I hope you'll be able to help me.
The code is too large and too poorly written to be reproduced here, so I'll try to sum up my situation:
I have a list of links which have href=javascript:void(0) and onclick=f(link_id).
f(x) makes an XML HTTP request to the server, and returns the link address.
My script is meant to precompute f(x) and change the href value when the page loads.
I have a function wait() that waits for the page to load, then a function findLinks() that gets the nodes that are to be changed (with xpath).
Then a function sendRequest() that sends the xhr to the server. And, finally handleRequest() that asynchronously (r.onreadystatechange) retrieves the response, and sets the nodes previously found.
Do you see anything wrong with this idea?
Using a network analyzer, I can see that the request is always sent fine, and the response also.
Sometimes the href value is changed, but sometimes for some links it isn't and remains javascript:void(0).
I really don't see why it works only half the time...
function getUrlParameterFromString(urlString, name) {
name = name.replace(/[\[]/, "\\\[").replace(/[\]]/, "\\\]");
var regexS = "[\\?&]" + name + "=([^&#]*)";
var regex = new RegExp(regexS);
var results = regex.exec(urlString);
if (results == null) {
return "";
} else {
return results[1];
}
}
function getUrlParameter(name) {
return getUrlParameterFromString(window.location.href, name);
}
function wait() {
var findPattern = "//a";
var resultLinks = document.evaluate(findPattern, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
if (resultLinks == null || resultLinks.snapshotLength == 0) {
return setTimeout(_wait, 100);
} else {
for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
var node = resultLinks.snapshotItem(i);
var s = node.getAttribute('onclick');
var linkId = s.substring(2, s.length - 1); // f(x)->x
sendRequest(linkId, node);
}
}
}
function sendRequest(linkId, nodeToModify) {
window.XMLHttpRequest ? r = new XMLHttpRequest : window.ActiveXObject && (r = new ActiveXObject("Microsoft.XMLHTTP"));
if (r) {
r.open("POST", "some_url", !0);
r.onreadystatechange = function () {
handleRequest(nodeToModify, linkId, r);
}
r.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
r.send(linkId);
}
}
function handleRequest(nodeToModify, num, r) {
if (r.readyState == 4) {
if (r.status == 200) {
console.log('handleRequest() used');
var a = r.responseText;
if (a == null || a.length < 10) {
sendRequest(num, nodeToModify);
} else {
var url = unescape((getUrlParameterFromString(a, "url")).replace(/\+/g, " "));
nodeToModify.setAttribute('href', url);
nodeToModify.setAttribute('onclick', "");
}
} else {
alert("An error occurred: " + r.statusText)
}
}
}
wait();

It looks like that script will change exactly 1 link. Look-up "closures"; this loop:
for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
var node = resultLinks.snapshotItem(i);
var s = node.getAttribute('onclick');
var linkId = s.substring(2, s.length - 1); // f(x)->x
sendRequest(linkId, node);
}
needs a closure so that sendRequest() gets the correct values. Otherwise, only the last link will be modified.
Try:
for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
var node = resultLinks.snapshotItem(i);
var s = node.getAttribute('onclick');
var linkId = s.substring(2, s.length - 1); // f(x)->x
//-- Create a closure so that sendRequest gets the correct values.
( function (linkId, node) {
sendRequest (linkId, node);
}
)(linkId, node);
}

Related

count total number of word in pdf book using node js? [duplicate]

I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.

Javascript global variables and references to them and their parts

I'm writing little snippets to learn more about using Javascript with API's, and have stumbled onto another problem I can't figure out on my own. I have a global variable (object?) "coins", read in from the API, and its' data field "symbol". I can use "symbol" to reference the data held there, in part of my code, without any errors. Later in the code, I use it again, and I get an error about it being undefined, despite the fact that the values returned from using it, are both defined, and, what I expected. While we are at it, maybe someone can tell me why I assign values to global variables (declared outside of all of the functions), but the variables when called, are "undefined". To see it in action, visit www.mattox.space/XCR and open up dev tools.
/*
FLOW:
get ALL coins, store NAME and SYMBOL into an object.
loop over the names object comparing to $SYMBOL text from form, return the NAME when found.
hit the API again, with the $NAME added to the URL.
create a table row.
insert data from second API hit, into table row
SOMEWHERE in there, do the USD conversion from BTC.
*/
//var name = getName();
var bitcoinValue = 0;
var coins = new Array;
var form = ""; // Value pulled from the form
var symbol = ""; // "id" on the table
var id = ""; // value pulled from the table at coins[i].id matched to coins[i].symbol
var formSym = "";
var formUSD = 0;
var formBTC = 0;
var form24h = 0;
function run() {
getFormData();
allTheCoins("https://api.coinmarketcap.com/v1/ticker/");
testGlobal();
}
function testGlobal() {
console.log("These are hopefully the values of the global variables");
console.log(formSym + " testGlobal");
console.log(formUSD + " testGlobal");
console.log(formBTC + " testGlobal");
console.log(form24h + " testGlobal");
}
function getFormData(){ //This function works GREAT!
form = document.getElementById("symbol").value //THIS WORKS
form = form.toUpperCase(); //THIS WORKS
}
function allTheCoins(URL) {
var tickerRequest = new XMLHttpRequest();
tickerRequest.open('GET', URL);
tickerRequest.send();
tickerRequest.onload = function() {
if (tickerRequest.status >= 200 && tickerRequest.status < 400) {
var input = JSON.parse(tickerRequest.responseText);
for(var i in input)
coins.push(input[i]);
testFunction(coins);
}
else {
console.log("We connected to the server, but it returned an error.");
}
console.log(formSym + " allTheCoins!"); // NOPE NOPE NOPE
console.log(formUSD) + " allTheCoins!"; // NOPE NOPE NOPE
console.log(formBTC + " allTheCoins!"); // NOPE NOPE NOPE
console.log(form24h + " allTheCoins!"); // NOPE NOPE NOPE
}
}
function testFunction(coins) {
for (var i = 0; i < coins.length; i++) {
if (coins[i].symbol == form) { // But right here, I get an error.
formSym = coins[i].name;
formUSD = coins[i].price_usd;
formBTC = coins[i].price_btc;
form24h = coins[i].percent_change_24h;
console.log(formSym + " testFunction");
console.log(formUSD + " testFunction");
console.log(formBTC + " testFunction");
console.log(form24h + " testFunction");
//DO EVERYTHING RIGHT HERE! On second thought, no, this needs fixed.
}
else if (i > coins.length) {
formSym = "Error";
formUSD = 0;
formBTC = 0;
form24h = 0;
}
}
}
/*
if (24h >= 0) {
colorRED
}
else {
colorGreen
}
*/
here is a possible way of doing it that you can get inspired by. its based on a httpRequest promise that set the headers and method.
let allTheCoins = obj => {
return new Promise((resolve, reject) => {
let xhr = new XMLHttpRequest();
xhr.open(obj.method || obj.method, obj.url);
if (obj.headers) {
Object.keys(obj.headers).forEach(key => {
xhr.setRequestHeader(key, obj.headers[key]);
});
}
xhr.onload = () => {
if (xhr.status >= 200 && xhr.status < 300) {
resolve(xhr.response);
} else {
reject(xhr.statusText);
}
};
xhr.onerror = () => reject(xhr.statusText);
xhr.send(obj.body);
});
};
allTheCoins({
url: "https://api.coinmarketcap.com/v1/ticker/",
method: "GET",
headers: {"Accept-Encoding": "gzip"}
})
.then(data => {
ParseCoins(data);
})
.catch(error => {
console.log("We connected to the server, but it returned an error.");
});
function ParseCoins(data) {
const coins = JSON.parse(data);
const form = getFormVal();/*retrieve form val*/
const id = getTableId(); /*retrieve table id*/
const bitcoinValue = getBitcoinVal();/*retrieve bitcoin Value*/
const final_result = [];
for (let i = 0, len = coins[0].length; i < len; i++) {
const coin = coins[0][i];
for (let ii in coin) {
if (coin.hasOwnProperty(ii)) {
if (coin[ii].symbol == form) {
let element = {
formSym: coin[ii].name,
formUSD: coin[ii].price_usd,
formBTC: coin[ii].price_btc,
form24h: coin[ii].percent_change_24h
};
final_result.push(element);
}
}
}
}
coontinueElseWhere(final_result);
}

'Juggling Async' - Why does my solution not return anything at all?

After asking a question and getting a very helpful answer on what the 'Async Juggling' assignment in learnyounode was asking me to do, I set out to implement it myself.
The problem is, my setup isn't having any success! Even though I've referred to other solutions out there, my setup simply isn't returning any results when I do a learnyounode verify myscript.js.
GIST: jugglingAsync.js
var http = require('http');
var app = (function () {
// Private variables...
var responsesRemaining,
urls = [],
responses = [];
var displayResponses = function() {
for(var iterator in responses) {
console.log(responses[iterator]);
}
};
// Public scope...
var pub = {};
pub.main = function (args) {
responsesRemaining = args.length - 2;
// For every argument, push a URL and prep a response.
for(var i = 2; i < args.length; i++) {
urls.push(args[i]);
responses.push('');
}
// For every URL, set off an async request.
for(var iterator in urls) {
var i = iterator;
var url = urls[i];
http.get(url, function(response) {
response.setEncoding('utf8');
response.on('data', function(data) {
if(response.headers.host == url)
responses[i] += data;
});
response.on('end', function() {
if(--responsesRemaining == 0)
displayResponses();
});
});
}
};
return pub;
})();
app.main(process.argv);
Question: What am I doing wrong?
This line
for(var iterator in urls) {
doesn't do what you think it does. It actually loops over the properties of urls (see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...in). Instead, you have to do something like
for(var i = 0; i < urls.length; i++) {
var url = urls[i];
...
}
or
urls.forEach(function(url, index) {
...
});
In addition to not properly looping through the arrays inside the app module, I was also not properly concatenating data returned from the response.on('data') event. Originally I was doing...
responses[index] += data;
Instead, the correct thing to do was:
responses[index] = responses[index] + data;
Changing that, as well as the things noted by #arghbleargh got the 'Async Juggling' to fully verify!
I have tested my code and it all worked:
~ $ node juggling_async.js site1 site2 site3 site4 ...
The JS code does not limit only to three sites.
var http = require('http');
// Process all the site-names from the arguments and store them in sites[].
// This way does not limit the count to only 3 sites.
var sites = [];
(function loadSites() {
for(var i = 2, len = process.argv.length; i < len; ++i) {
var site = process.argv[i];
if(site.substr(0, 6) != 'http://') site = 'http://' + site;
sites.push(site);
}
})();
var home_pages = [];
var count = 0;
function httpGet(index) {
var home_page = '';
var site = sites[index];
http.get(site, function(res) {
res.setEncoding('utf8');
res.on('data', function(data) {
home_page += data;
});
res.on('end', function() {
++count;
home_pages[index] = home_page;
if(count == sites.length) {
// Yahoo! We have reached the last one.
for(var i = 0; i < sites.length; ++i) {
console.log('\n############ Site #' + (+i+1) + ': ' + sites[i]);
console.log(home_pages[i]);
console.log('============================================\n');
}
}
});
})
.on('error', function(e) {
console.log('Error at loop index ' + inddex + ': ' + e.message);
})
;
}
for(var i = 0; i < sites.length; ++i) {
httpGet(i);
}

Javascript webworker won't load XML file via XMLHttpRequest

I'm am strugling to get a webworker to load an XML file from the same domain on the side of my main page, any help would be greatly appreciated.
function readXML(){
var xhr = new XMLHttpRequest(); //Only for FF
xhr.open("GET","../db/pointer.xml",true);
xhr.send(null);
xhr.onreadystatechange = function(e){
if(xhr.status == 200 && xhr.readyState == 4){
//Post back info to main page
postMessage(xhr.responseXML.getElementsByTagName("value").length);
}
}
When this runs in a script tag on the main page, i get a 3.
When running thru the WebWorker, FireBug gives me
hr.responseXML is null
postMessage(xhr.responseXML.getElementsByTagName("value").length);
In the FireBug console, GET Request responded with
<?xml version="1.0" encoding="UTF-8"?>
<root>
<value>A value</value>
<value>Another Value</value>
<value>A third Value</value>
</root>
So the response is correct but i cannot figure out where it's going wrong.
If i change responseXML to responseText the worker outputs
A value Another Value A third Value
Which is correct! why won't the script open it as an XML document?
UPDATE
function readXML(){
var xhr = new XMLHttpRequest(); //Only for FF
xhr.open("GET","../db/pointer.xml",false);
xmlhttp.setRequestHeader('Content-Type', 'text/xml');
xhr.overrideMimeType('text/xml');
xhr.send(null);
xhr.onreadystatechange = function(e){
if(xhr.status == 200 && xhr.readyState == 4){
//Post back info to main page
postMessage(xhr.responseXML.getElementsByTagName("value").length);
}
}
When setRequestHeader & overrideMimeType is changed, the onreadystatechange never fires, doesn't matter if status and readyState are there or not, it won't run. If i remove the onreadystatechange completely and just run xhr.responseXML, i get the null error again.
I still get the correct XML in as response in the console, is this a webworker issue instead of a httprequest problem? Getting desperate here :)
index.html http://www.axlonline.se/worker/index.html
worker.js http://www.axlonline.se/worker/worker.js
According to the standard, Web workers can have not have access to any type of DOM manipulation.
The DOM APIs (Node objects, Document objects, etc) are not available to workers in this version of this specification.
responseXML and channel properties are always null from an ajax request as the parsing of the XML is a DOM API. No matter the request and response headers there will be no way of getting requestXML unless you manually parse it.
Had the same Problem. Apparently XML parsing is not possible in webworkers.
I used sax.js to parse a XML on web worker.
https://github.com/isaacs/sax-js
this is basicly my parser.
function xmlParser(strict){
this.parser = sax.parser(strict, {lowercase:true});
}
xmlParser.prototype.parseFile = function(file, callback){
var _this = this;
$.ajax.get({
cache: false,
url: file,
dataType: "xml",
success: function(data){
var dom = _this.parseText(data.text);
callback( dom );
},
error: function(data){
}
});
}
xmlParser.prototype.parseText = function(xlmText){
var dom = undefined;
var activeNode = dom;
this.parser.onerror = function (e) { };
this.parser.onend = function () {};
this.parser.ontext = function (t) {
if(activeNode != undefined)
activeNode.Text = t;
};
this.parser.onopentag = function (node) {
var node = new xmlNode(node.name, activeNode, node.attributes, dom);
if(dom === undefined){
dom = node;
activeNode = node;
}else{
activeNode.Children.push(node);
activeNode = node;
}
};
this.parser.onclosetag = function (node) {
activeNode = activeNode.Parent;
};
this.parser.write(xlmText).close();
return dom;
}
xmlNode enables a jquery like handling of the tree.
function xmlFilterResult(){
this.length = 0;
}
xmlFilterResult.prototype.push = function(element){
this[this.length++] = element;
}
xmlFilterResult.prototype.attr = function(atribute){
if(this.length == 0)
return '';
return this[0].Attributes[atribute];
}
xmlFilterResult.prototype.text = function(atribute){
if(this.length == 0)
return '';
return this[0].Text;
}
xmlFilterResult.prototype.children = function(search, result){
if(result == undefined)
result = new xmlFilterResult();
if(search == undefined){
for(var i = 0; i < this.length; i++){
this[i].children(search, result);
}
}else{
this.find(search, true, result);
}
return result;
}
xmlFilterResult.prototype.find = function(search, nonrecursive, result){
if(result == undefined)
result = new xmlFilterResult();
if(search.charAt(0) == '.')
return this.findAttr('class', search.substring(1), nonrecursive, result);
else if(search.charAt(0) == '#')
return this.findAttr('id', search.substring(1), nonrecursive, result);
else
return this.findName(search, nonrecursive, result);
}
xmlFilterResult.prototype.findAttr = function(attr, value, nonrecursive, result){
if(result == undefined)
result = new xmlFilterResult();
var child;
for(var i = 0; i < this.length; i++){
child = this[i];
child.findAttr(attr, value, nonrecursive, result);
}
return result
}
xmlFilterResult.prototype.findName = function(name, nonrecursive, result){
if(result == undefined)
result = new xmlFilterResult();
var child;
for(var i = 0; i < this.length; i++){
child = this[i];
child.findName(name, nonrecursive, result);
}
return result
}
// xmlFilterResult.prototype.findID = function(id, nonrecursive){
// var child, result = new xmlFilterResult();
// for(var i = 0; i < this.length; i++){
// child = this[i];
// child.findID(id, nonrecursive, result);
// }
// return result
// }
function xmlNode(name, parent, atributes, root){
this.Name = name;
this.Children = [];
this.Parent = parent;
this.Attributes = atributes;
this.Document = root;
this.Text = '';
}
xmlNode.prototype.attr = function(atribute){
return this.Attributes[atribute];
}
xmlNode.prototype.text = function(atribute){
return this.Text;
}
xmlNode.prototype.children = function(search, result){
if(result == undefined)
result = new xmlFilterResult();
if(search == undefined){
for(i in this.Children)
result.push(this.Children[i]);
}else{
return this.find(search, true, result);
}
return result;
}
xmlNode.prototype.find = function(search, nonrecursive, result){
if(search.charAt(0) == '.')
return this.findAttr('class', search.substring(1), nonrecursive, result);
else if(search.charAt(0) == '#')
return this.findAttr('id', search.substring(1), nonrecursive, result);
else
return this.findName(search, nonrecursive, result);
}
xmlNode.prototype.findAttr = function(attr, value, nonrecursive, result){
var child, i;
if(result == undefined)
result = new xmlFilterResult();
for(i in this.Children){
child = this.Children[i];
if(child.Attributes[attr] == value)
result.push(child);
if(!nonrecursive)
child.findAttr(attr, value, nonrecursive, result);
}
return result
}
xmlNode.prototype.findName = function(name, nonrecursive, result){
var child, i;
if(result == undefined)
result = new xmlFilterResult();
for(i in this.Children){
child = this.Children[i];
if(child.Name == name){
result.push(child);
}
if(!nonrecursive)
child.findName(name, nonrecursive, result);
}
return result
}
Its nothing special but you get the idea of that to do.
You just need to set the content-type header to text/xml on the server side.
responseXML is null if the document that you're requesting isn't XML. Specifically, the content-type should be one of text/html, text/xml, application/xml, or something that ends in +xml. See the spec.
Also, see: responseXML is null and responseXML always null.
And a side note: since web workers are inherently asynchronous, you don't need to set the async flag to true in your open call.

document is undefined in Google URL harvester

I have code taken from Google to harvest URLs from Google. I saved it as filename.js. When I run the file it showed "'document' is undefined". The part of the code which is showing problem is
// ==UserScript==
// #name Google URL Harvester
// #namespace http://userscripts.org/scripts/show/42998
// #description Harvests URLs from a Google Search
// #include http://www.google.co.uk/
// #include http://www.google.com/
// ==/UserScript==
var btn_container;
var inputs = document.getElementsByTagName("input");
for (var i = 0; i < inputs.length; i++) {
if (inputs[i].name == "btnG")
btn_container = inputs[i].parentNode;
}
function find_next_link(html) {
var url = html.match(/(<a href="[^"]+">)\s*<span[^>]+style="[^"]*background-position:\s?-76px\s/);
if (url == null)
return false;
var div = document.createElement("div");
div.innerHTML = url[1];
return div.firstChild.href;
}
function harvest(query_url, callback) {
ajax(query_url, function(e){
var als = e.match(/<a[^>]+class=l[^>]*>/g);
for (var i = 0; i < als.length; i++) {
urls.push(als[i].match(/href="([^"]+)"/)[1]);
}
var next_url = find_next_link(e);
if (next_url)
harvest(next_url, callback);
else
callback();
});
}
function ajax(url, callback) {
var req = new XMLHttpRequest();
req.onreadystatechange = function() {
if (req.readyState == 4 && req.status == 200) {
callback(req.responseText);
}
}
req.open("GET", url, true);
req.send("");
}
var new_button = document.createElement("input");
new_button.type = "button";
new_button.value = "Harvest URLs";
new_button.setAttribute("onsubmit", "return false;");
btn_container.appendChild(new_button);
var urls = [];
new_button.addEventListener("click", function(){
var query_url = unsafeWindow.document.forms[0].action + "?num=100&q="+escape(unsafeWindow.document.forms[0].q.value);
document.body.innerHTML = "<img src='http://oneworldwebsites.com/images/wheel%20throbber.gif' />";
harvest(query_url, function() {
document.body.innerHTML = urls.join("<br/>");
});
},false);
Here I have not defined document(if it is necessary). Can anybody please rectify the error in this code. Operating system is Windows 7.
I am saving this document to my desktop as harv.js and running it. Am I doing anything wrong?
Google like most websites updates their structure over time.
for (var i = 0; i < inputs.length; i++) {
if (inputs[i].name == "btnG")
btn_container = inputs[i].parentNode;
}
Needs to become
for (var i = 0; i < inputs.length; i++) {
if (inputs[i].name == "btnK") //<<------G to K
btn_container = inputs[i].parentNode;
}
I can just guess that the error is thrown because you try to access the "input" element, before creating it later. Moreover you shouldn't use a html element as a identifier in JavaScript. You could also use Firebug to pinpoint the error.

Categories

Resources