Node Jsdom Scrape Google's Reverse Image Search - javascript

I want to programmatically find a list of URLs for similar images, given an image URL. I can't find any free image search APIs, so I'm trying to do this by scraping Google's Search by Image.
If I have an image URL, say http://i.imgur.com/oLmwq.png, then navigating to https://www.google.com/searchbyimage?&image_url=http://i.imgur.com/oLmwq.png gives related images and info.
How do I get jsdom.env to produce the HTML your browser gets from the above URL?
Here's what I've tried (CoffeeScript):
jsdom = require 'jsdom'
url = 'https://www.google.com/searchbyimage?&image_url=http://i.imgur.com/oLmwq.png'

jsdom.env
  html: url
  scripts: [ "http://code.jquery.com/jquery.js" ]
  features:
    FetchExternalResources: ['script']
    ProcessExternalResources: ['script']
  done: (errors, window) ->
    console.log window.$('body').html()
You can see the HTML doesn't match what we want. Is this an issue with Jsdom's HTTP headers?

I find request + cheerio to be easier than jsdom for tasks like this. I see that you've found an answer already, but thought I'd mention it as an alternative solution.
Example:
var request = require('request'),
    cheerio = require('cheerio');

var google = 'https://www.google.com/searchbyimage';
var image = 'http://i.imgur.com/oLmwq.png';

var options = {
  url: google,
  qs: { image_url: image },
  headers: {
    'user-agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
  }
};

request(options, function (err, res, body) {
  var $ = cheerio.load(body);
  …
});
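Filling in the "…", the extraction step could borrow the selector from the accepted answer below. A sketch, with the caveat that the '#iur' results grid may be built by Google's own scripts, in which case cheerio (which runs no JavaScript) would match nothing and a script-capable approach like jsdom is needed:

request(options, function (err, res, body) {
  var $ = cheerio.load(body);
  // Assumption: the '#iur' similar-images markup is present in the raw HTML.
  $('#iur img').parent().each(function (index, elem) {
    var href = $(elem).attr('href');
    // Each result link wraps the target image URL in its query string.
    console.log(href.split('?')[1].split('&')[0].split('=')[1]);
  });
});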

The issue is jsdom's User-Agent HTTP header. Once that is set, everything (almost) works:
jsdom = require 'jsdom'
url = 'https://www.google.com/searchbyimage?&image_url=http://i.imgur.com/oLmwq.png'

jsdom.env
  html: url
  headers:
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
  scripts: [ "http://code.jquery.com/jquery.js" ]
  features:
    FetchExternalResources: ['script']
    ProcessExternalResources: ['script']
  done: (errors, window) ->
    $ = window.$
    $('#iur img').parent().each (index, elem) ->
      href = $(elem).attr 'href'
      url = href.split('?')[1].split('&')[0].split('=')[1]
      console.log url
This gives us a nice list of visually similar images. The only problem now is that jsdom throws an error after returning the result:
timers.js:103
if (!process.listeners('uncaughtException').length) throw e;
^
TypeError: Cannot call method 'call' of undefined
at new <anonymous> (/project-root/node_modules/jsdom/lib/jsdom/browser/index.js:54:13)
at _.Zl (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1238:93)
at _.jm (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1239:399)
at _.km (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1241:146)
at Object._onTimeout (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1248:727)
at Timer.list.ontimeout (timers.js:101:19)
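One possible mitigation, purely an assumption on my part rather than something confirmed in this thread: the trace suggests one of Google's scripts scheduled a timer that fires after done() returns. jsdom's window.close() clears pending timers, so calling it once the URLs have been extracted should stop stray callbacks from running against the torn-down document. A sketch in plain JavaScript:

var jsdom = require('jsdom');

jsdom.env({
  html: 'https://www.google.com/searchbyimage?&image_url=http://i.imgur.com/oLmwq.png',
  headers: {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
  },
  scripts: ['http://code.jquery.com/jquery.js'],
  features: {
    FetchExternalResources: ['script'],
    ProcessExternalResources: ['script']
  },
  done: function (errors, window) {
    var $ = window.$;
    $('#iur img').parent().each(function (index, elem) {
      console.log($(elem).attr('href').split('?')[1].split('&')[0].split('=')[1]);
    });
    window.close(); // assumption: disposing of the window clears the pending timers
  }
});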

Related

User Agent missing error in ReactJS fetch api

I am using News API in ReactJS, fetching news with the fetch API.
CODE FRAGMENT
const url = `https://newsapi.org/v2/top-headlines?country=in&category=${this.props.category}&pageSize=${this.state.pageSize}&page=${this.state.page}`;
let data = await fetch(url);
But when I run it on localhost, it gives a 400 error saying:
{status: 'error', code: 'userAgentMissing', message: 'Please set your User-Agent header to identify your application. Anonymous requests are not allowed.'}
When I searched for the same on Stack Overflow, I found a link to a similar problem here:
User Agent Missing error when making a get request to an API
Then I added the User-Agent header in the fetch API's argument:
CODE FRAGMENT
headers = new Headers({
  "X-Api-Key": this.apiKey,
  "User-Agent": navigator.userAgent
});

const url = `https://newsapi.org/v2/top-headlines?country=in&category=${this.props.category}&pageSize=${this.state.pageSize}&page=${this.state.page}`;
let data = await fetch(url, {
  headers: this.headers
});
But still it shows the same error:
{status: 'error', code: 'userAgentMissing', message: 'Please set your User-Agent header to identify your application. Anonymous requests are not allowed.'}
The error asks for a "User-Agent" in the request header, but it is already there: I can see it when I look in the Network section of the Chrome browser.
So what exactly is the problem, and how do I solve it?
I am having a similar problem. So far I have been able to pass in one of the web browser User-Agents from the Mozilla Developer websites.
function webCall(url, method, headers, postBody, callBack) {
  /* build options; headers must be an object keyed by header name */
  var options = {
    method: method,
    uri: url,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59'
    },
    body: postBody
  }
  ...
};
This is still in development; however, I was able to return results from my newsapi.org pull.
Edit: For reference, I am using the old 'request' npm library (a legacy requirement). So I make my request like so:
request(options, function handleResponse(err, response, body) {
  var errMessage = null;
  var headers = {};
  if (err) {
    errMessage = 'HTTP ERROR: ' + err;
  }
  else {
    headers = response.headers;
    if (response.statusCode != 200) {
      errMessage = 'HTTP ' + response.statusCode + ' ERROR: ' + err;
    }
  }
});
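Separately, there looks to be a slip in the question's fetch fragment: headers is created as a local variable, but this.headers (which was never assigned) is what gets passed to fetch, so the custom headers are likely never sent. A corrected sketch reusing the question's names; note also that some browsers treat User-Agent as a forbidden header and may silently drop it, which is why a server-side request like the one above can behave differently:

const headers = new Headers({
  "X-Api-Key": this.apiKey,
  "User-Agent": navigator.userAgent  // browsers may ignore this even when set
});
const url = `https://newsapi.org/v2/top-headlines?country=in&category=${this.props.category}&pageSize=${this.state.pageSize}&page=${this.state.page}`;
// Pass the object that was just built, not the unset this.headers.
let data = await fetch(url, { headers: headers });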

How to find all the JavaScript requests made from my browser when I'm accessing a site

I want to scrape the contents of LinkedIn using requests and bs4, but I'm facing a problem with the JavaScript that loads the page after I sign in (I don't get the home page directly). I don't want to use Selenium.
here is my code
import requests
from bs4 import BeautifulSoup

class Linkedin():
    def __init__(self, url):
        self.url = url
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}

    def saveRsulteToHtmlFile(self, nameOfFile=None):
        if nameOfFile == None:
            nameOfFile = "Linkedin_page"
        with open(nameOfFile + ".html", "wb") as file:
            file.write(self.response.content)

    def getSingInPage(self):
        self.sess = requests.Session()
        self.response = self.sess.get(self.url, headers=self.header)
        soup = BeautifulSoup(self.response.content, "html.parser")
        self.csrf = soup.find(attrs={"name": "loginCsrfParam"})["value"]

    def connecteToMyLinkdin(self):
        self.form_data = {"session_key": "myemail#mail.com",
                          "loginCsrfParam": self.csrf,
                          "session_password": "mypassword"}
        self.url = "https://www.linkedin.com/uas/login-submit"
        self.response = self.sess.post(self.url, headers=self.header, data=self.form_data)

    def getAnyPage(self, url):
        self.response = self.sess.get(url, headers=self.header)

url = "https://www.linkedin.com/"
likedin_page = Linkedin(url)
likedin_page.getSingInPage()
likedin_page.connecteToMyLinkdin()  # I'm connected but JavaScript is still loading
likedin_page.getAnyPage("https://www.linkedin.com/jobs/")
likedin_page.saveRsulteToHtmlFile()
I want help getting past the JavaScript loading without using Selenium...
Although it's technically possible to simulate all the calls from Python, for a dynamic page like LinkedIn I think it would be quite tedious and brittle.
Anyway, you'd open "developer tools" in your browser before you open LinkedIn and see what the traffic looks like. You can filter for the requests made from JavaScript (in Firefox, the filter is called XHR).
You would then simulate the necessary/interesting requests in your code. The benefit is that servers usually return structured data, such as JSON, to JavaScript, so you won't need to do as much HTML parsing.
If you find you're not progressing very much this way (it really depends on the particular site), then you will probably have to use Selenium or some alternative such as:
https://robotframework.org/
https://miyakogi.github.io/pyppeteer/ (port of Puppeteer to Python)
You should send all the XHR and JS requests manually (in the same session you created during login). Also, pass all the fields in the request headers (copy them from the network tools).
self.header_static = {
    'authority': 'static-exp2.licdn.com',
    'method': 'GET',
    'path': '/sc/h/c356usw7zystbud7v7l42pz0s',
    'scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,hi;q=0.7,la;q=0.6',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'referer': 'https://www.linkedin.com/jobs/',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Mobile Safari/537.36'
}

def postConnectionRequests(self):
    urls = [
        "https://static-exp2.licdn.com/sc/h/62mb7ab7wm02esbh500ajmfuz",
        "https://static-exp2.licdn.com/sc/h/mpxhij2j03tw91bpplja3u9b",
        "https://static-exp2.licdn.com/sc/h/3nq91cp2wacq39jch2hz5p64y",
        "https://static-exp2.licdn.com/sc/h/emyc3b18e3q2ntnbncaha2qtp",
        "https://static-exp2.licdn.com/sc/h/9b0v30pbbvyf3rt7sbtiasuto",
        "https://static-exp2.licdn.com/sc/h/4ntg5zu4sqpdyaz1he02c441c",
        "https://static-exp2.licdn.com/sc/h/94cc69wyd1gxdiytujk4d5zm6",
        "https://static-exp2.licdn.com/sc/h/ck48xrmh3ctwna0w2y1hos0ln",
        "https://static-exp2.licdn.com/sc/h/c356usw7zystbud7v7l42pz0s",
    ]
    for url in urls:
        self.sess.get(url, headers=self.header_static)
        print("REQUEST SENT TO " + url)
I called the postConnectionRequests() function before saving the HTML content, and received the complete page.
Hope this helps.
XHR is sent by JavaScript, and Python will not run JavaScript code when it fetches a page using requests and BeautifulSoup. Tools like Selenium load the page and run the JavaScript. You can also use headless browsers.

A Node.js function gives different results while working on my machine and on AWS-Lambda

I have a Node function that takes a link to a YouTube video and sends a request to https://www.convertmp3.io/. It's a website that allows you to download MP3s from YouTube.
It then parses the response HTML to get a direct download link from the document (using a library called "cheerio", but if you're not familiar with it, it's just for scraping the link from the HTML), and afterwards opens the link to download the MP3.
My code:
const request = require("request")
const cheerio = require("cheerio")

let link = "https://www.convertmp3.io/download/?video=https://www.youtube.com/watch?v=RRSDTE5nWnc"

const options = {
  url: link,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
  }
};

request(options, (error, response, html) => {
  if (!error && response.statusCode == 200)
  {
    const $ = cheerio.load(html)
    const document = $('.infoBox')
    let href = "https://www.convertmp3.io" + document.find("#download").attr('href')
    console.log(href)
  }
})
While working on my machine, this code works correctly. The console.log gives a link which I can open in my browser and see that the MP3 is actually downloading.
But when I run it on Lambda, the link I get has the correct format, but it simply doesn't work. It redirects to some non-existent domain.
I'm not entirely sure how this is even possible. The only thing I can think of is that the website may detect that the program is a bot (logically) and give a wrong link (which sounds pretty bizarre). So I decided to also send some user-agent headers. That didn't work either.
I'm really confused about how this can be possible and don't even know what else to try. Any thoughts?
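No accepted answer is recorded here, but a first debugging step (my suggestion, not from the thread) would be to log what the site actually returns in each environment; if convertmp3.io serves different HTML, or a redirect, to Lambda's IP range, the difference will show up immediately:

request(options, (error, response, html) => {
  if (error) return console.error('request failed:', error);
  // Compare these between a local run and a Lambda run: a different status
  // code, final URL, or body suggests the site treats the two clients differently.
  console.log('status:', response.statusCode);
  console.log('final url:', response.request.uri.href);
  console.log('body starts with:', html.slice(0, 500));
});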

Receiving JSON between local React App and local Springboot service issues

I am running a local Spring Boot server that, when I access it locally in the browser, gives me a valid, properly formatted JSON object (I verified this via a JSON formatter).
I am also locally running a React application using Node. I am attempting to use fetch() to get back that JSON object and running into issues. I finally got around the CORS header issues, but now cannot figure out why the JSON object isn't coming back. Here's my code:
var headers = new Headers();
headers.append("Content-type", "application/json;charset=UTF-8");

var myInit = {
  method: 'GET',
  headers: headers,
  mode: 'no-cors',
  cache: 'default',
};

fetch(`http://localhost:3010/getJSON`, myInit)
  .then(function (response) {
    console.log(response.data);
    console.log(response);
    console.log(JSON.parse(JSON.stringify(response)));
  }, function (error) {
    console.log(error);
  });
So when I run this in Chrome with the debugger, the responses to the 3 log statements are:
1st logger
undefined
2nd logger
Response {type: "opaque", url: "", redirected: false, status: 0, ok: false, …}
  body: (...)
  bodyUsed: false
  headers: Headers {}
  ok: false
  redirected: false
  status: 0
  statusText: ""
  type: "opaque"
  url: ""
  __proto__: Response
3rd logger
{}
I have tried many different JSON parsing and stringifying approaches, to no avail.
The next confusing part is that if, within the Chrome debugger, I go to the "Network" tab and click on /getJSON, it shows me the entire JSON object just fine in both the "Preview" and "Response" tabs. So clearly Chrome is connecting to it correctly. Here's Chrome's "Headers" tab within "Network":
Request URL:http://localhost:3010/getJSON
Request Method:GET
Status Code:200
Remote Address:[::1]:3010
Referrer Policy:no-referrer-when-downgrade
Response Headers
view source
Content-Type:application/json;charset=UTF-8
Date:Thu, 12 Oct 2017 16:05:05 GMT
Transfer-Encoding:chunked
Request Headers
view source
Accept:*/*
Accept-Encoding:gzip, deflate, br
Accept-Language:en-US,en;q=0.8
Connection:keep-alive
Host:localhost:3010
Referer:http://localhost:3000/
User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36
I have tried to mimic this header in my request, but I'm not sure how mine differs. Any help would be greatly appreciated, as I am currently banging my head against the wall with this!
You're getting an opaque response, which tells me that maybe you haven't completely resolved the CORS headers situation. If you're fetching from the client, I would suggest proxying through your Node.js server, so that instead of calling your Spring Boot service you call Node, thus getting rid of the CORS issues.
EDIT
You could create something like this:
import express from 'express';
import request from 'request';

const router = express.Router();

router.get('/proxyname', (req, res) => {
  const requestUrl = [your service's endpoint];
  request(requestUrl, (err, apiResponse, body) => {
    // Forward the upstream status, and pass the body through as JSON when it parses.
    res.status(apiResponse.statusCode);
    try {
      res.json(JSON.parse(body));
    } catch (e) {
      res.send(body);
    }
  });
});

export default router;
and then in your Node.js server file, add it like this:
import proxy from '[path to proxy file above]';
app.use('/path-to-endpoint', proxy);
and then call that from the client instead of your SpringBoot service.
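From the React app you would then call the proxy instead of the Spring Boot service; since the request is now same-origin, mode: 'no-cors' can be dropped and the response is no longer opaque (endpoint name reused from the hypothetical route above):

fetch('/path-to-endpoint')              // same-origin request to the Node proxy
  .then(response => response.json())    // readable body: the response is not opaque
  .then(data => console.log(data))
  .catch(error => console.log(error));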

Node.js Login to blog site and Create new post

I want to log in to blogfa.com (a Persian blog service) and create a new post with Node.js.
To do that, I use request.js to POST to the login page, then go to the URL "/Desktop/Post.aspx?action=newpost" and post new content.
Here is the code I have so far:
var request = require('request');
var cheerio = require('cheerio');

var j = request.jar();
request = request.defaults({ jar: j }); // it will make the session default for every request
//...

var headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36 OPR/28.0.1750.48',
  'Content-Type': 'application/x-www-form-urlencoded'
};

request({
  url: "http://blogfa.com/Desktop/login.aspx",
  method: "POST",
  form: { uid: "demoblog1", pwd: "test" },
  headers: headers,
  followRedirect: true
},
function (error, response, body) {
  console.log(body);
});
The problem I have is that I can't log in to the site.
I don't know why it isn't working! I used a jar for the cookies, posted the user and password, and also set the headers.
Here is runnable code you can test:
http://code.runnable.com/Vc7EnmyVlgRa1Hx-/blog-login-for-node-js
Update
Request Cookies
.ASPXBLOG BC045A0CD184FEA10D91561EB67A302F1E036D88E50CE4264E4ABD003
__utma 36873331.1996897518.1435135939.1437159460.1439563539.7
__utmz 36873331.1437159460.6.6.utmcsr=chat.delgarm.com|utmccn=(referral)
pubset ar=1&z=12600&ds=0&cmt=0&cats=0&tag=0&nu=1&bt=dGVzdCB
ten 67145377
By default request does not follow redirects for non-GET requests, which is what is happening in this case. So set followAllRedirects: true in your request() options and it should work fine.
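Applied to the code above, that is a one-line change to the options (everything else stays the same):

request({
  url: "http://blogfa.com/Desktop/login.aspx",
  method: "POST",
  form: { uid: "demoblog1", pwd: "test" },
  headers: headers,
  followAllRedirects: true // follow redirects even after this non-GET (POST) request
},
function (error, response, body) {
  console.log(body); // should now be the logged-in page
});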
