How to get all images from document and store to local - javascript

My goal is to get all images from document, then download all images bigger than 150x150px to local.
I'm stuck on retrieving the files from the URLs I got in the previous step. Here is the buggy line (the full code is at the end):
...
var copyResult = fs.copy(imagesURLs[i], destFile);
...
When i run from console it just hangs up on fs.copy(), without any errors.
As far as I understand, fs.copy() doesn't work with remote URLs, even if you set all the proper args (--load-images=yes, --local-to-remote-url-access=yes). Am I right, or is there something I did wrong with copy()? And are there any methods to get files directly from WebKit's cache?
Got latest phantomjs version and ubuntu server.
I would appreciate any kind of help.
Full script code:
// PhantomJS script: opens the URL given as the first command-line argument,
// collects the src of every image that is at least 150x150 px, and tries to
// copy each one into <working directory>/www/images/.
// One or two arguments are accepted, but only the first one is read.
if (phantom.args.length < 1 || phantom.args.length > 2)
{
console.log('Usage: phantomjs ' + phantom.scriptName + ' <URL>');
phantom.exit();
}
else
{
var page = new WebPage(),
address = phantom.args[0];
page.viewportSize = { width: 1200, height: 4000 };
page.open(address, function (status)
{
if (status === 'success')
{
// Runs inside the loaded page and returns an array of image URL strings.
var imagesURLs = page.evaluate(function ()
{
var documentImages = [], imagesCount = document.images.length, index = 0;
while (index < imagesCount)
{
// Keep only images whose width AND height are both >= 150.
if ((document.images[index].width >= 150) && (document.images[index].height >= 150))
{
documentImages.push(document.images[index].src);
}
index++;
}
return documentImages;
});
var fs = require('fs');
// NOTE(review): for...in walks the enumerable keys of the returned array,
// not its values; imagesURLs[i] is used below to get each URL.
for (var i in imagesURLs)
{
// Strip everything up to and including the last "/" or "\" to get the file name.
var fileName = imagesURLs[i].replace(/^.*[\\\/]/, '');
var destFile = '' + fs.workingDirectory + '/www/images/' + fileName;
console.log(destFile);
// NOTE(review): the source argument here is a remote URL, not a local path.
// The surrounding question reports that this call hangs; presumably fs.copy
// only handles local files — confirm against the PhantomJS fs documentation.
var copyResult = fs.copy(imagesURLs[i], destFile);
console.log(copyResult);
}
}
else
{
console.log('status: ' + status);
}
// Exit is called whether or not the page loaded successfully.
phantom.exit();
});
}

Try this:
/**
 * Opens the image in a tiny pop-up window, waits for its document to finish
 * loading, triggers the "SaveAs" command on it and closes the pop-up again.
 *
 * The wait is a synchronous spin on document.readyState, so the calling
 * script is blocked until the pop-up reports "complete".
 *
 * @param imgURL URL of the image to open and save.
 */
function SaveAs(imgURL)
{
    var popup = window.open(imgURL, "", "width=1, height=1, top=5000, left=5000");

    // Spin until the pop-up document is fully loaded.
    while (popup.document.readyState != "complete")
    {
        // intentionally empty: busy-wait
    }

    popup.document.execCommand("SaveAs");
    popup.close();
}

Related

Can't send data over serialPort

I would like to control my Arduino robot with Node.js and a joystick, but serialport.write doesn't send any data to Arduino. I have tried to use code without a joystick and it works but only with one serial.write.
Is there a bug in my code?
Arduino code:
// Read whatever text is available on the serial port into a String
// and echo it back to the sender.
String data = Serial.readString();
Serial.println(data);
// React only when the received text is exactly "2".
// NOTE(review): any extra characters in the message (e.g. a line ending)
// would make this comparison fail — confirm what the sender transmits.
if(data=="2") {
//motor1
}
Node.js
var hid = require('node-hid');
var SerialPort = require("serialport").SerialPort
// Serial link on COM3 at 9600 baud; everything else starts once it is open.
var serialPort = new SerialPort('COM3', {
    baudrate: 9600
});

serialPort.on("open", function() {
    console.log('open');

    // Logs the joystick position and, two seconds later, writes '1' when the
    // position is 0 or '2' when it is above 999. Other positions send nothing.
    function sentData(data) {
        console.log(data);

        var command;
        if (data == 0) {
            command = '1';
        } else if (data > 999) {
            command = '2';
        } else {
            return;
        }

        setTimeout(function() {
            serialPort.write(command)
        }, 2000);
    }

    // HID device identified by vendor id 1133 and product id 49685.
    var device = new hid.HID(1133, 49685);

    device.on('data', function(buf) {
        // Split the hex dump of the report into one number per byte.
        var hexPairs = buf.toString('hex').match(/.{1,2}/g);
        var bytes = hexPairs.map(function(pair) {
            return parseInt(pair, 16);
        });

        // Low nibble of byte 2 supplies the upper bits, the top six bits of
        // byte 1 supply the lower bits of the position value.
        var upperBits = (bytes[2] & 0x0f) << 6;
        var lowerBits = (bytes[1] & 0xfc) >> 2;
        var position = parseInt(upperBits + lowerBits);

        sentData(position);
    });
});
The arduino code should look like this:
// Collect every byte currently waiting in the serial receive buffer into a
// String, echo it back, and act when the text is exactly "2".
String data = "";  // must be a string literal; '' is not a valid char literal
while(Serial.available() > 0) {
  // Serial.read() returns an int. Cast it to char so the received character
  // itself is appended instead of the decimal digits of its byte value.
  data += (char)Serial.read();
}
Serial.println(data);
if(data == "2") {
  //code
}
But, sorry, I can't tell whether there is a problem in your Node.js code.

XMLHttpRequest.responseXML is NULL even when .readystate == 4

I am using javascript to load in data from a XML file. The file is not being loaded in after an if statement that checks the ready state and the status. The ready state brings back 4 and the status brings back 200, so the last condition (the responseXML) should not be null, but for some reason, it remains null and the XML file is not loaded.
/**
 * Starts an asynchronous GET request for 'Catalog.xml'.
 *
 * The request object is stored in the shared `asyncRequest` variable so that
 * processResponse() can inspect it on every readystatechange event. Any
 * exception raised while setting up or sending the request is reported with
 * an alert and a console message.
 */
function load() {
    try {
        console.log("in load");

        var request = new XMLHttpRequest();
        // Publish the request before wiring the listener; the handler reads it.
        asyncRequest = request;

        var onStateChange = function() {
            processResponse();
        };
        request.addEventListener("readystatechange", onStateChange, false);

        request.open('GET', 'Catalog.xml', true);
        request.send(null);
    } catch (err) {
        alert("Request Failed");
        console.log("failed");
    }
}
/**
 * readystatechange handler for the request stored in `asyncRequest`.
 *
 * Once the request is complete (readyState 4, status 200) and an XML
 * document is available, it looks for the <planet> whose <name> equals the
 * value of the #planetinfo input and copies that planet's fields into the
 * #name, #discovered, #distance, #contact and #image elements.
 *
 * NOTE(review): responseXML stays null when the response could not be parsed
 * as XML; presumably worth checking the Content-Type the server sends for
 * Catalog.xml — confirm.
 */
function processResponse() {
    console.log(asyncRequest.readyState + " response" + asyncRequest.status + asyncRequest.responseXML);

    if (asyncRequest.readyState != 4 || asyncRequest.status != 200 || !asyncRequest.responseXML) {
        return;
    }
    console.log("found");

    var planets = asyncRequest.responseXML.getElementsByTagName("planet");
    var name = document.getElementById("planetinfo").value;
    console.log(name);

    // Text content of the first <tag> element inside a <planet> node.
    function fieldText(planet, tag) {
        return planet.getElementsByTagName(tag).item(0).firstChild.nodeValue;
    }

    for (var i = 0; i < planets.length; ++i) {
        var planet = planets.item(i);
        var planetName = fieldText(planet, "name");
        if (name === planetName) {
            document.getElementById("name").innerHTML = planetName;
            document.getElementById("discovered").innerHTML = fieldText(planet, "discovered");
            document.getElementById("distance").innerHTML = fieldText(planet, "distance");
            document.getElementById("contact").innerHTML = fieldText(planet, "contact");
            // The original string leaked a stray "' + '/" into the markup;
            // emit a well-formed <img> tag instead.
            document.getElementById("image").innerHTML =
                "<img src='../images/" + fieldText(planet, "image") + "' width='250' height='250'>";
        }
    }
}
This is the code from the javascript file that pertains to the loading of the XML. Opening up the console shows logs that tells me the code does not get past the if statement checking the asyncRequest.

How can I show with PhantomJS the url of the processed page in the generated PDF?

My goal was to generate a PDF from every page included in the sitemap of a website created with Rails. I'm using PhantomJS to do it. I'm quite new to this field, but I managed to do it. When I was finished, though, I realized that it would also be useful to see, at the beginning of every PDF, the URL of the page from which the PDF was generated, so I can browse to the page more quickly (the site has over a hundred pages).
Here is the Javascript:
// Render Sitemap to file
var RenderUrlsToFile, arrayOfUrls, system;
system = require("system");
/*
Render given urls
#param array of URLs to render
#param callbackPerUrl Function called after finishing each URL, including the last URL
#param callbackFinal Function called after finishing everything
*/
/**
 * Formats a URL index as a string, left-padded with zeros to at least three
 * characters (7 -> "007", 42 -> "042", 100 -> "100").
 *
 * The result is always a string; previously indexes of 100 and above were
 * returned as the raw number while smaller ones came back as strings.
 *
 * @param urlIndex Counter of the URL being rendered.
 * @returns The zero-padded index as a string.
 */
var getFileNumber = function(urlIndex) {
    var digits = String(urlIndex);
    while (digits.length < 3) {
        digits = "0" + digits;
    }
    return digits;
};
// Renders each URL in `urls` to a PDF, one page at a time.
//   urls            array of URLs; it is consumed with shift(), so the
//                   caller's array is emptied as rendering progresses
//   callbackPerUrl  called as (status, url, file) after each URL is finished
//   callbackFinal   called with no arguments once `urls` is empty
RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
var getFilename, next, page, retrieve, urlIndex, webpage;
urlIndex = 0;
webpage = require("webpage");
page = null;
// Output name for the URL currently being processed, based on urlIndex.
getFilename = function() {
return "rendermulti-" + getFileNumber(urlIndex) + ".pdf";
};
// Closes the current page, reports the result, then moves on to the next URL.
next = function(status, url, file) {
page.close();
callbackPerUrl(status, url, file);
return retrieve();
};
// Takes the next URL off the front of the list and renders it, or finishes.
retrieve = function() {
var url;
if (urls.length > 0) {
url = urls.shift();
// Incremented before the file name is built, so numbering starts at 1.
urlIndex++;
page = webpage.create();
page.viewportSize = {
width: 1920,
height: 1880
};
page.settings.userAgent = "Phantom.js bot";
return page.open(url, function(status) {
var file;
file = getFilename();
if (status === "success") {
// Wait 200 ms after the load reports success before rendering.
return window.setTimeout((function() {
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Runs inside the page: tries to insert the page URL as the first child
// of #logoAndNavigation.
// NOTE(review): `textnode` is a plain string, yet it is passed to
// appendChild below as if it were a DOM node.
page.evaluate(function() {
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("P")
var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
newP.appendChild(textnode)
x.insertBefore(newP, x.childNodes[0]);
});
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
page.render("tempPdfs/" + file);
return next(status, url, file);
}), 200);
} else {
// Load failed: skip rendering but still report and continue.
return next(status, url, file);
}
});
} else {
return callbackFinal();
}
};
return retrieve();
};
// This makes an array with all the urls inside the sitemap
// The array starts with one empty-string entry; sitemap URLs are appended
// later, from inside the page.open callback below.
var arrayOfUrls = [''];
var page = require('webpage').create();
page.open('http://localhost:3000/sitemap.xml', function() {
var content = page.content;
// `parser` and `xmlDoc` are assigned without `var`, so they are not local
// to this callback.
parser = new DOMParser();
xmlDoc = parser.parseFromString(content,'text/xml');
// Every <loc> element's text is taken as one URL to render.
var loc = xmlDoc.getElementsByTagName('loc');
for(var i=0; i < loc.length; i++)
{
var url=loc[i].textContent;
arrayOfUrls.push(url);
}
});
// NOTE(review): this call is made right after page.open() is issued, not
// from inside its callback, so rendering begins with whatever arrayOfUrls
// holds at that moment (the initial '' entry).
RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
// Per-URL report: one console line saying whether the render succeeded.
if (status !== "success") {
return console.log("Unable to render '" + url + "'");
} else {
return console.log("Rendered '" + url + "' at '" + file + "'");
}
}), function() {
// All URLs processed: quit PhantomJS.
return phantom.exit();
});
I tried to solve the issue with the urls, with the code framed with the comment
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I wanted to show the url inside an element of the page, that has the id #logoAndNavigation, but I get this error:
NOT_FOUND_ERR: DOM Exception 8: An attempt was made to reference a Node in a context where it does not exist.
If I use only a string like "hello" inside the variable textnode, it works, but not if I try to use the url of the page.
Can anyone please help me?
Thank you in advance!
appendChild expects a node not a string. You probably mean to use
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("p"); // small p
var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
newP.innerHTML = textnode; // this
x.insertBefore(newP, x.childNodes[0]);
You can also use the example of printheaderfooter.js to add the URL directly to the header or footer.

How to write to CSV file in Javascript

I have a script (using PhantomJS) that tests how long it takes to load a webpage. What I am trying to figure out is how to write the result of time taken to load the page to a .csv file. Then if I were to re-run the test again for it to add another result to the .csv file.
code:
// Measures how long a page takes to load and prints the result.
// Usage: loadspeed.js <some URL>
var page = require('webpage').create(),
    system = require('system'),
    t, address;
var pageLoadArray = [];
var csvContents = "";
fs = require('fs');

if (system.args.length !== 1) {
    // Start the clock just before the page is requested.
    t = Date.now();
    address = system.args[1];

    page.open(address, function (status) {
        if (status === 'success') {
            // Elapsed milliseconds between the request and this callback.
            t = Date.now() - t;

            var title = page.evaluate(function () {
                return document.title;
            });
            console.log('Page title is ' + title);

            // Loads slower than 7 seconds are reported as too long.
            var tooSlow = t > 7000;
            if (tooSlow) {
                console.log('Loading time was too long... ' + t + "msec");
            } else {
                console.log('Loading time ' + t + ' msec');
            }

            pageLoadArray.push(t);
            console.log(pageLoadArray.length);
            console.log(pageLoadArray[0]);
            //store the time value to the .csv file

            if (tooSlow) {
                phantom.exit(1);
            }
        } else {
            console.log('FAIL to load the address');
        }
        phantom.exit();
    });
} else {
    // No URL was supplied on the command line.
    console.log('Usage: loadspeed.js <some URL>');
    phantom.exit(1);
}
You can use the fs module with the write(path, content, mode) method in append mode.
var fs = require('fs');
fs.write(filepath, content, 'a');
where filepath is the file path as a string and content is a string containing your CSV line.
Something like:
address+";"+(new Date()).getTime()+";"+t
If you have control over the Jenkins environment, you can use one of the browser specific methods of triggering a download like suggested in This Question
/**
 * Triggers a browser download of in-memory text.
 *
 * Three strategies are tried in order:
 *   1. IE10: MSBlobBuilder + navigator.msSaveBlob.
 *   2. Browsers supporting <a download>: a temporary data: link that is
 *      clicked programmatically and then removed.
 *   3. Fallback: a temporary iframe pointed at a data: URL.
 *
 * @param strData     Text to download.
 * @param strFileName Suggested file name.
 * @param strMimeType Optional MIME type; defaults to "text/plain" for the
 *                    link and "application/octet-stream" for the iframe.
 * @returns true, or the result of navigator.msSaveBlob on IE10.
 */
function download(strData, strFileName, strMimeType) {
    var doc = document,
        anchor = doc.createElement("a"),
        mimeType = strMimeType || "text/plain";

    // Build the download link. Use the defaulted MIME type (the raw argument
    // may be undefined) and separate it from the charset parameter with ";".
    // encodeURIComponent yields valid UTF-8 percent-encoding for the URL.
    anchor.href = "data:" + mimeType + ";charset=utf-8," + encodeURIComponent(strData);

    if (window.MSBlobBuilder) { // IE10
        var bb = new MSBlobBuilder();
        bb.append(strData);
        // NOTE(review): the builder itself is passed here, as in the original
        // code — confirm whether msSaveBlob needs bb.getBlob() instead.
        return navigator.msSaveBlob(bb, strFileName);
    }

    if ('download' in anchor) { // FF20, CH19
        anchor.setAttribute("download", strFileName);
        anchor.innerHTML = "downloading...";
        doc.body.appendChild(anchor);
        // Click on a later tick so the link is attached before the event fires.
        setTimeout(function() {
            var click = doc.createEvent("MouseEvents");
            click.initMouseEvent("click", true, false, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
            anchor.dispatchEvent(click);
            doc.body.removeChild(anchor);
        }, 66);
        return true;
    }

    // iframe data-URL download (older browsers): base64 when btoa exists,
    // otherwise an escaped payload.
    var frame = doc.createElement("iframe");
    doc.body.appendChild(frame);
    frame.src = "data:" + (strMimeType ? strMimeType : "application/octet-stream") +
        (window.btoa ? ";base64" : "") + "," +
        (window.btoa ? window.btoa(strData) : escape(strData));
    setTimeout(function() {
        doc.body.removeChild(frame);
    }, 333);
    return true;
}
Maybe you can use this URL SCM Plugin to grab the download.
Or use Selenium to automate some things and grab the download file

how to scrape links with phantomjs

Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all the links in term. In Python, I know how to do this (with BeautifulSoup) but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search "hello kitty" on Etsy and return all of the products
<a class="listing-thumb" href=...></a> and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
// PhantomJS script: loads an Etsy search page and tries to print the href of
// every a.listing-thumb link.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
// The load status is not inspected; the callback proceeds either way.
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
// NOTE(review): the function passed to evaluate returns the NodeList from
// querySelectorAll directly, and the loop below reads .href from each entry
// of whatever evaluate hands back.
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
PhantomJS evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable things before:
// PhantomJS script: prints the href attribute of every a.listing-thumb link
// on an Etsy search page, one per line.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
    // Bail out with a non-zero exit code when the page could not be loaded,
    // instead of evaluating against a failed page.
    if (status !== 'success') {
        console.log('Unable to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    // list all the a.href links in the hello kitty etsy page
    // Only plain strings are returned from the page context.
    var links = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
            return link.getAttribute('href');
        });
    });

    // Guard against a null result so the script always reaches phantom.exit().
    console.log((links || []).join('\n'));
    phantom.exit();
});
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:
// PhantomJS script: logs response details for the main URL, then prints the
// href of every a.listing-thumb link once the page has finished loading.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if page returns status code 200)
page.onResourceReceived = function(response) {
    // Only report on the main document, not on every sub-resource.
    if (response.url === url) {
        console.log('Resorce: "' + response.url + '" status: ' + response.status);
        if (response.status === 200) {
            console.log(response.url);
            for (var i = 0; i < response.headers.length; i++) {
                console.log(response.headers[i].name + ': ' + response.headers[i].value);
            }
        }
    }
};

page.onLoadFinished = function(status){
    console.log('Status: ' + status);
    console.log('Starting evaluate...');

    // Runs in the page context; returns plain href strings only.
    var links = page.evaluate(function() {
        var nodes = [],
            matches = document.querySelectorAll("a.listing-thumb");
        for(var i = 0; i < matches.length; ++i) {
            nodes.push(matches[i].href);
        }
        return nodes;
    });

    // Compute the count through the null guard: reading links.length first
    // would throw on a null result and the script would never exit.
    var count = links ? links.length : 0;
    console.log('Done evaluate... count: ' + count);

    if (count > 0) {
        for(var j = 0; j < count; ++j) {
            console.log('(' + j + ') ' + links[j]);
        }
    } else {
        console.log("No match found!");
    }
    phantom.exit(0);
};

page.open(url);
Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all URLs on the page; if you supply an argument of class|id followed by a "class/id name", it will display only the URLs within that class/id.
//////////////////////////////////////////////////////////
///// PhantomJS URL Scraper v.1.3 /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remain intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URL's from the class
// or id are displayed.
//
///////////////////////////////////
var page = require('webpage').create(),
    system = require('system'),
    address, querytype, queryclass;

/**
 * Runs inside the loaded page and collects the href attribute of anchors.
 *
 * The query values are passed in as arguments rather than spliced into
 * function source text, so quotes in a class/id name cannot alter the code.
 *
 * @param kind "class", "id", or anything else for "all anchors in the page".
 * @param name Class name or element id to restrict the search to.
 * @returns Array of href attribute values (null for anchors without href).
 */
function collectHrefs(kind, name) {
    var roots, byId, anchors, i, j, hrefs = [];

    if (kind === 'class') {
        roots = document.getElementsByClassName(name);
    } else if (kind === 'id') {
        byId = document.getElementById(name);
        // An unknown id yields no links instead of throwing inside the page.
        roots = byId ? [byId] : [];
    } else {
        roots = [document];
    }

    for (i = 0; i < roots.length; i++) {
        anchors = roots[i].querySelectorAll('a');
        for (j = 0; j < anchors.length; j++) {
            hrefs.push(anchors[j].getAttribute('href'));
        }
    }
    return hrefs;
}

if (system.args.length === 1) {
    console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
    phantom.exit();
} else {
    // Only continue when a URL was supplied; phantom.exit() alone does not
    // stop the remainder of the script from running.
    address = system.args[1];
    querytype = system.args[2];
    queryclass = system.args[3];

    // Forward console output produced inside the page.
    page.onConsoleMessage = function(msg) {
        console.log(msg);
    };

    page.open(address, function(status) {
        if (status !== 'success') {
            console.log('Error loading address: '+address);
            phantom.exit(1);
            return;
        }

        // Extra arguments to evaluate are handed to the page-side function.
        var links = page.evaluate(collectHrefs, querytype, queryclass);

        // Join with newlines directly, so commas inside a URL stay intact.
        console.log((links || []).join('\n'));
        phantom.exit();
    });
}

Categories

Resources