Scrape image src URLs using PhantomJS

Scrape image src URLs using PhantomJS - javascript

I'm trying to get a list of all image src url's in a given webpage using PhantomJS. My understanding is that this should be extremely easy, but for whatever reason, I can't seem to make it work. Here is the code I currently have:
var page = require('webpage').create();
page.open('http://www.walmart.com');
page.onLoadFinished = function(){
var images = page.evaluate(function(){
return document.getElementsByTagName("img");
});
for(thing in a){
console.log(thing.src);
}
phantom.exit();
}
I've also tried this:
var a = page.evaluate(function(){
returnStuff = new Array;
for(stuff in document.images){
returnStuff.push(stuff);
}
return returnStuff;
});
And this:
var page = require('webpage').create();
page.open('http://www.walmart.com', function(status){
var images = page.evaluate(function() {
return document.images;
});
for(image in images){
console.log(image.src);
}
phantom.exit();
});
I've also tried iterating through the images in the evaluate function and getting the .src property that way.
None of them return anything meaningful. If I return the length of document.images, there are 54 images on the page, but trying to iterate through them provides nothing useful.
Also, I've looked at the following other questions and wasn't able to use the information they provided: How to scrape javascript injected image src and alt with phantom.js and How to download images from a site with phantomjs
Again, I just want the source url. I don't need the actual file itself. Thanks for any help.
UPDATE
I tried using
var a = page.evaluate(function(){
returnStuff = new Array;
for(stuff in document.images){
returnStuff.push(stuff.getAttribute('src'));
}
return returnStuff;
});
It threw an error saying that stuff.getAttribute('src') returns undefined. Any idea why that would be?

#MayorMonty was almost there. Indeed you cannot return HTMLCollection.
As the docs say:
Note: The arguments and the return value to the evaluate function must be a simple primitive object. The rule of thumb: if it can be serialized via JSON, then it is fine.
Closures, functions, DOM nodes, etc. will not work!
Thus the working script is like this:
var page = require('webpage').create();
page.onLoadFinished = function(){
var urls = page.evaluate(function(){
var image_urls = new Array;
var images = document.getElementsByTagName("img");
for(q = 0; q < images.length; q++){
image_urls.push(images[q].src);
}
return image_urls;
});
console.log(urls.length);
console.log(urls[0]);
phantom.exit();
}
page.open('http://www.walmart.com');

i am not sure about direct JavaScript method but recently i used jQuery to scrape image and other data so you can write script in below style after injecting jQuery
$('.someclassORselector').each(function(){
data['src']=$(this).attr('src');
});

document.images is not an Array of the nodes, it's a HTMLCollection, which is built off of an Object. You can see this if you for..in it:
for (a in document.images) {
console.log(a)
}
Prints:
0
1
2
3
length
item
namedItem
Now, there are several ways to solve this:
ES6 Spread Operator: This turns array-likes and iterables into arrays. Use like so [...document.images]
Regular for loop, like an array. This takes advantage of the fact that the keys are labeled like an array:
for(var i = 0; i < document.images.length; i++) {
document.images[i].src
}
And probably more, as well
Using solution 1 allows you to use Array functions on it, like map or reduce, but has less support (idk if the current version of javascript in phantom supports it).

I used the following code to get all images on the page loaded, the images loaded on the browser changed dimensions on the basis of the view port, Since i wanted the max dimensions i used the the maximum view port to get the actual image sizes.
Get All Images on Page USING Phantom JS
Download All Images URL on Page USING Phantom JS
No Matter even if the image is not in a img tag below code you can retrieve the URL
Even Images from such scripts will be retrieved
#media screen and (max-width:642px) {
.masthead--M4.masthead--textshadow.masthead--gradient.color-reverse {
background-image: url(assets/images/bg_studentcc-750x879-sm.jpg);
}
}
#media screen and (min-width:643px) {
.masthead--M4.masthead--textshadow.masthead--gradient.color-reverse {
background-image: url(assets/images/bg_studentcc-1920x490.jpg);
}
}
var page = require('webpage').create();
var url = "https://......";
page.settings.clearMemoryCaches = true;
page.clearMemoryCache();
page.viewportSize = {width: 1280, height: 1024};
page.open(url, function (status) {
if(status=='success'){
console.log('The entire page is loaded.............################');
}
});
page.onResourceReceived = function(response) {
if(response.stage == "start"){
var respType = response.contentType;
if(respType.indexOf("image")==0){
console.log('Content-Type : ' + response.contentType)
console.log('Status : ' + response.status)
console.log('Image Size in byte : ' + response.bodySize)
console.log('Image Url : ' + response.url)
console.log('\n');
}
}
};

Related

ReferenceError: Image is not defined

I am trying to make a website, in which I include images via links. If an image is non-existent, or just 1 pixel wide, the website should display an alternative image. I am using Jade/Pug and JS.
I try to make a list of links beforehand, before rendering them on the website. That way I can just iterate threw my link-list in the .pug file afterwards.
So what I am trying to do is, to check, if an image has a certain size, using JS only. If it does, I add the link to my list, if not, then I add an alternative link.
This is the important part of my code in the app.js-file:
app.get("/", function (req, res) {
//get all the books
var isbns = [];
var links = [];
dbClient.query("SELECT isbn FROM book WHERE id < 20", function (dbError, dbItemsResponse){
isbns = dbItemsResponse.rows;
var linkk = 0;
for(i=0;i<20;i++){
linkk = "http://covers.openlibrary.org/b/isbn/" + Object.values(isbns[i]) + "-M.jpg";
var wid = getMeta(linkk);
if(wid < 2){
links[i]="https://i.ibb.co/7JnVtcB/NoCover.png";
} else {
links[i]=linkk;
}
}
});
});
function getMeta(url){
var img = new Image(); //or document.createElement("img");
img.onload = function(){
alert(this.width);
};
img.src = url;
}
This gives me a ReferenceError: Image() is not defined. If I try to use document.createElement("img"); it says "document is not defined".
How can i check on the server-side if an Image is existent? Or how can I use the Image-Constructor in my app.js file? Without using my Jade/Pug/HTML file in any way.
Sorry If it's a dumb question, but I am trying to figure this out since 20 hours non-stop, and I can't get it to work.

You are mixing up nodejs and javascript. Your code is nodejs and therefore on the sererside. window and Image are only available in the browser, resp. on the client side.
For checking if a file exists, (Only on the serverside!=) you can use fs => fs.access.
var fs = require("fs");
// Check if the file exists in the current directory.
fs.access(file, fs.constants.F_OK, (err) => {
console.log(`${file} ${err ? 'does not exist' : 'exists'}`);
});
Note
There isn't something like a "dumb question" :=)

PDFJS stopAtErrors does not stop execution when encountering PDF parsing errors

I am using PDFJS to get textual data from PDF files, but occasionally encountering the following error:
Error: Invalid XRef table: unexpected first object.
I would prefer that my code just skip over problem files and continue on to the next file in the list. According to PDFJS documentation, setting stopAtErrors to true for the DocumentInitParameters in PDFJS should result in rejection of getTextContent when the associated PDF data cannot be successfully parsed. I am not finding such to be the case: even after setting stopAtErrors to true, I continue to get the above error and the code seems to be "spinning" on the problem file rather than just moving on to the next in the list. It is possible that I haven't properly set stopAtErrors to true as I think I have. A snippet of my code is below to illustrate what I think I've done (code based on this example):
// set up the variables to pass to getDocument, including the pdf file's url:
var obj = {};
obj.url = http://www.whatever.com/thefile.pdf; // the specific url linked to desired pdf file goes here
obj.stopAtErrors = true;
// now have PDF JS read in the file:
PDFJS.getDocument(obj).then(function(pdf) {
var pdfDocument = pdf;
var pagesPromises = [];
for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
(function (pageNumber) {
pagesPromises.push(getPageText(pageNumber, pdfDocument));
}) (i+1);
}
Promise.all(pagesPromises).then(function(pagesText) {
// display text of all the pages in the console
console.log(pagesText);
});
}, function (reason) {
console.log('Error! '+reason);
});
function getPageText(pageNum, PDFDocumentInstance) {
return new Promise(function (resolve, reject) {
PDFDocumentInstance.getPage(pageNum).then(function(pdfPage) {
pdfPage.getTextContent().then(function(textContent) { // should stopAtErrors somehow be passed here to getTextContent instead of to getDocument??
var textItems = textContent.items;
var finalString = '';
for (var i = 0; i < textItems.length; i++) {
var item = textItems[i];
finalString += item.str + " ";
}
resolve(finalString);
});
});
}).catch(function(err) {
console.log('Error! '+err);
});
}
One thing I am wondering is if the stopAtErrors parameter should somehow instead be passed to getTextContent? I have not found any examples illustrating the use of stopAtErrors and the PDFJS documentation does not show a working example, either. Given that I am still at the stage of needing examples to get PDFJS to function, I am at a loss as to how to make PDFJS stop trying to parse a problem PDF file and just move on to the next one.

Appending and Preloading Images with JavaScript

I'm in the process of creating a site that preloads several large gifs. Due to the size of the images. I need them all to be loaded before displayed to the user. In the past I have done this numerous times using something basic like this:
var image = new Image();
image.onload = function () { document.appendChild(image); }
image.src = '/myimage.jpg';
However, i'm loading a group of images from an array, which contains the image source url. It should show a loading message and once they have all loaded it show perform a callback and hide the loading message etc.
The code I've been using is below:
var images = ['image1.gif', 'image2.gif', 'image3.gif'];
function preload_images (target, callback) {
// get feedback container
var feedback = document.getElementById('feedback');
// show feedback (loading message)
feedback.style.display = 'block';
// set target
var target = document.getElementById(target);
// clear html of target incase they refresh (tmp fix)
target.innerHTML = '';
// internal counter var
var counter = 0;
// image containers attach to window
var img = new Array();
// loop images
if (images.length > 0) {
for (var i in images) {
// new image object
img[i] = new Image();
// when ready peform certain actions.
img[i].onload = (function (value) {
// append to container
target.appendChild(img[value]);
// hide all images apart from the first image
if (value > 0) {
hide(img[value]);
}
// increment counter
++counter;
// on counter at correct value use callback!
if (counter == images.length) {
// hide feedback (loading message)
feedback.style.display = 'none';
if (callback) {
callback(); // when ready do callback!
}
}
})(i);
// give image alt name
img[i].alt = 'My Image ' + i;
// give image id
img[i].id = 'my_image_' + i
// preload src
img[i].src = images[i];
}//end loop
}//endif length
}//end preload image
It's really weird, I'm sure it should just work, but it doesn't even show my loading message. It just goes straight to the callback.. I'm sure it must be something simple, I've been busy and looking at it for ages and finding it a tad hard to narrow down.
I've been looking over stackoverflow and people have had similar problems and I've tried the solutions without much luck.
Any help would be greatly appreciated! I'll post more code if needed.
Cheers!

If I'm not totally wrong the problem is with you assignment to
// when ready peform certain actions.
img[i].onload = (function (value) {...})(i);
here you instantly call and execute the function and return undefined to the onload attribute, what can not be called when the image is loaded.
What you can do to have access to the value 'i' when the image is loaded you can try something like the following:
onload = (function(val){
var temp = val;
return function(){
i = temp;
//your code here
}
})(i);
this should store the value in temp and will return a callable function which should be able to access this value.
I did not test that if it is working and there maybe a better solution, but this one came to my mind :)

Try this for your onload callback:
img[i].onload = function(event) {
target.appendChild(this);
if (img.indexOf(this) > 0) {
hide(this);
}
// ...
};
Hope you can get it working! It's bed time for me though.
Edit: You'll probably have to do something about img.indexOf(this)... just realized you are using associative array for img. In your original code, I don't think comparing value to 0 is logical in that case, since value is a string. Perhaps you shouldn't use an associative array?

PhantomJS page fetching with nested loop to get new pages

I want to fetch a list online from a certain URL that is in JSON format and then use the DATA_ID from each item in that list to call a new URL. I'm just new with PhantomJS and I can't figure out why nest loops inside the page.open() acts all weird. Also the way to use phantom.exit() seems to be really weird doing what I want to achieve.
Here's my code:
console.log('Loading recipes');
console.log('===============================================================');
var page = require('webpage').create();
var url = 'http://www.hiddenurl.com/recipes/all';
page.open(url, function (status) {
//Page is loaded!
var js = page.evaluate(function () {
return document.getElementsByTagName('pre')[0];
});
var recipes = JSON.parse(js.innerHTML).results;
//console.log(recipes[0].name.replace('[s]', ''));
for (i = 0; i < recipes.length; i++) {
console.log(recipes[i].name.replace('[s]', ''));
var craft_page = require('webpage').create();
var craft_url = 'http://www.hiddenurl.com/recipe/' + recipes[i].data_id;
craft_page.open(craft_url, function (craft_status) {
//Page is loaded!
var craft_js = craft_page.evaluate(function () {
return document.getElementsByTagName('body')[0];
});
var craftp = craft_js.innerHTML;
console.log('test');
});
if (i == 5) {
console.log('===============================================================');
phantom.exit();
//break;
}
}
});
The thing that happens here is that this line:
console.log(recipes[i].name.replace('[s]', ''));
..prints the following:
===============================================================
Item from DATA_ID 1
Item from DATA_ID 2
Item from DATA_ID 3
Item from DATA_ID 4
Item from DATA_ID 5
..then it just prints the next:
===============================================================
..followed by:
'test'
'test'
'test'
'test'
'test'
Why is this not happening serial? The data from the innerly called page() request gets heaped up and dumped at the end, even after phantom.exit() should actually already be called.
Also when I free-loop a normal data-set I get this error:
QEventDispatcherUNIXPrivate(): Unable to create thread pipe: Too many open files
2013-01-31T15:35:18 [FATAL] QEventDispatcherUNIXPrivate(): Can not continue without a thread pipe
Abort trap: 6
Is there any way I can set GLOBAL_PARAMETERS or direct the process in some way so I can just handle 100's of page requests?
Thanks in advance!

I've made a workaround with Python by calling PhantomJS separately through the shell, like this:
import os
import json
cmd = "./phantomjs fetch.js"
fin,fout = os.popen4(cmd)
result = fout.read()
recipes = json.loads(result)
print recipes['count']
Not the actual solution for the PhantomJS issue, but it's a working solution and has less problems with memory and code-structure.

How to using javascript to read files into an array in order

I have a bunch of text files on server side with file names 0.txt, 1.txt, 2.txt, 3.txt and so forth. I want to read the content of all files and store them in an array A, such that A[0] has 0.txt's content, A[1] has 1.txt's, ...
How can I do it in Javascript / jquery?
Originally, I used $.ajax({}) in jQuery to load those text files. But it didn't work, because of the asynchronous nature of ajax. I tried to set $.ajax({...async=false...}), but it was very slow -- I have ~1000 10KB files to read in total.

from your question, you want to load txt file from server to local:
var done = 0, resultArr = [], numberOfFiles = 1000;
function getHandler(idx) {
return function(data) {
resultArr[idx] = data;
done++;
if (done === numberOfFiles) {
// tell your other part all files are loaded
}
}
}
for (var i = 0; i < numberOfFiles; i++) {
$.ajax(i + ".txt").done(getHandler(i));
}
jsFiddle: http://jsfiddle.net/LtQYF/1/

What you're looking for is File API introduced in HTML5 (working draft).
The examples in this article will point you in the right direction. Remember that the end user will have to initiate the action and manually select the files - otherwise it would have been a terrible idea privacy- and security-wise.
Update:
I found (yet again) the mozilla docos to be more readable! Quick html mockup:
<input type="file" id="files" name="files[]" onchange="loadTextFile();" multiple/>
<button id="test"onclick="test();">What have we read?</button>
...and the JavaScript:
var testArray = []; //your array
function loadTextFile() {
//this would be tidier with jQuery, but whatever
var _filesContainer = document.getElementById("files");
//check how many files have been selected and iterate over them
var _filesCount = _filesContainer.files.length;
for (var i = 0; i < _filesCount; i++) {
//create new FileReader instance; I have to read more into it
//but I was unable to just recycle one
var oFReader = new FileReader();
//when the file has been "read" by the FileReader locally
//log its contents and push them into an array
oFReader.onload = function(oFREvent) {
console.log(oFREvent.target.result);
testArray.push(oFREvent.target.result);
};
//actually initiate the read
oFReader.readAsText(_filesContainer.files[i]);
}
}
//sanity check
function test() {
for (var i = 0; i < testArray.length; i++) {
console.warn(testArray[i]);
}
}
Fiddled

You don't give much information to give a specific answer. However, it is my opinion that "it doesn't work because of the asynchronous nature of ajax" is not correct. You should be able to allocate an array of the correct size and use a callback for each file. You might try other options such as bundling the files on the server and unbundling them on the client, etc. The designs, that address the problem well, depend on specifics that you have not provided.

Develop Reference

JavaScript is the programming language of the Web.

Scrape image src URLs using PhantomJS - javascript

i am not sure about direct JavaScript method but recently i used jQuery to scrape image and other data so you can write script in below style after injecting jQuery $('.someclassORselector').each(function(){ data['src']=$(this).attr('src'); });

Related

ReferenceError: Image is not defined

PDFJS stopAtErrors does not stop execution when encountering PDF parsing errors

Appending and Preloading Images with JavaScript

PhantomJS page fetching with nested loop to get new pages

How to using javascript to read files into an array in order

Categories

Resources