How to get backgroundImage url in cheerio - javascript

I was trying to get banner of my Anime-Planet account for my scraper system.
I tried everything I know with cheerio, but I couldn't get the background-image URL of the profileBackground element.
Properties
I tried
async function Al() {
const cheerio = require("cheerio");
const url = "https://www.anime-planet.com/users/kyoyacchi";
const {data} = await client.axios.get(url, {
headers: {
"User-Agent":
"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36",
},
});
const $ = cheerio.load(data);
return $(".pull-beta.pull-alpha.pullup.editableBanner")
.find(".wrapper")
.find("profileBackground");
}
Al();
Here is the result
This one only returns the avatar path.

I learned that it's $('div[id=profileBackground]')

You can retrieve the image path with:
// Fetch the page and print the URL found in the inline
// `background-image: url(...)` style of the #profileBackground element.
axios.get(url)
  .then(({data: html}) => {
    const $ = cheerio.load(html);
    // .attr() yields undefined when the element or the attribute is missing;
    // fall back to "" so .match() is never called on undefined.
    const style = $("#profileBackground").attr("style") || "";
    // Group 1 is an optional quote character, so url(x), url('x') and url("x")
    // all produce x in group 2.
    const match = style.match(/background-image: *url *\((['"]?)(.+?)\1\)/);
    if (!match) {
      throw new Error("#profileBackground has no inline background-image");
    }
    const path = match[2];
    console.log(path); // => /images/users/backgrounds/3966485.jpg?t=1660418964
  })
  .catch((err) => {
    // Network failures and the "not found" error above both end up here.
    console.error(err);
  });
Use #some-id to select by the id attribute. In CSS, a bare word refers to a tag name, so p is the selector for <p></p>. Ids are supposed to be unique in the document, so you don't usually need to specify parent selectors.
The regex above extracts the contents from the parentheses after background-image: url.

Related

Node JS with Axios. How to get extension of the image from url

I am trying to download an image from a URL and save it on my server. For example, I make a POST request with the URL of the image, download the image, and save it on my server. The problem comes when I need to figure out the extension of the image. Right now it works statically only for jpg files, but it should work for png as well. How can I find out the extension of the file before saving it?
One way would be to get the extension from the url itself, but not all urls will have the extension , for example: https://media.istockphoto.com/photos/winter-in-the-sequoias-picture-id1292624259
This is the code that I have right now. It works; however, as I said, it's static and only works for jpg:
var config = {
responseType: 'stream'
};
async function getImage(url) {
let time = Math.floor(Date.now() / 1000)
let resp = await axios.get(url, config)
resp.data.pipe(fs.createWriteStream(time+'.jpg')) // here I need to get the image extension isntead of static '.jpg'
}
You can use response headers for that. The Content-Type header should tell you the type of the file and with Content-Disposition you can get the filename with extension.
In your code you can access these headers like this
resp.headers['content-type'];
resp.headers['content-disposition'];
I'd suggest using a module such as mime to get the extension from the content-type.
Complete example:
const axios = require('axios');
const fs = require('fs');
const mime = require('mime');
var config = {
responseType: 'stream'
};
/**
 * Download `url` and save it as `<unix-seconds>.<extension>`, where the
 * extension is looked up from the response's Content-Type header.
 *
 * @param {string} url - Address of the image to download.
 * @returns {Promise<string>} Resolves with the file name once the file has
 *   been fully written; rejects if the request or either stream fails.
 */
async function getImage(url) {
  const time = Math.floor(Date.now() / 1000)
  const resp = await axios.get(url, config)
  const contentLength = resp.headers['content-length'];
  const contentType = resp.headers['content-type'];
  // The header can be absent and the lookup can come back empty; fall back to
  // 'bin' instead of producing a name like "<time>.false" or "<time>.undefined".
  const extension = (contentType && mime.extension(contentType)) || 'bin';
  console.log(`Content type: ${contentType}`);
  console.log(`Extension: ${extension}`);
  const fileName = time + "." + extension;
  console.log(`Writing ${contentLength} bytes to file ${fileName}`);
  // pipe() returns immediately; wait for 'finish' so callers can await the
  // completed download, and surface errors from both ends of the pipe.
  await new Promise((resolve, reject) => {
    const out = fs.createWriteStream(fileName);
    out.on('finish', resolve);
    out.on('error', reject);
    resp.data.on('error', reject);
    resp.data.pipe(out);
  });
  return fileName;
}
const url = 'https://media.istockphoto.com/photos/winter-in-the-sequoias-picture-id1292624259';
getImage(url)
This will give an output somewhat like:
Content type: image/jpeg
Extension: jpeg
Writing 544353 bytes to file 1638867349.jpeg

Downloading Image locally from GitHub Raw link using fs.writeFileSync() JS

Currently trying to download image from GitHub locally. Everything seems to work, the fetch goes through with a 200 OK response, however, I don't understand how to store image itself:
const rawGitLink = "https://raw.githubusercontent.com/cardano-foundation/CIPs/master/CIP-0001/CIP_Flow.png"
const folder = "/Folder"
const imageName = "/Test"
const imageResponse = await axios.get(rawGitLink)
fs.writeFileSync(___dirname + folder + imageName, imageResponse, (err) => {
//Error handling
}
)
Four problems had to be fixed:
Image name must include png format for this case
The response must be in the correct format as a buffer for an image
You must write the response data and not the object itself
__dirname only needs two underscores
const rawGitLink = "https://raw.githubusercontent.com/cardano-foundation/CIPs/master/CIP-0001/CIP_Flow.png"
const folder = "/Folder"
const imageName = "/Test.png"
const imageResponse = await axios.get(rawGitLink, { responseType: 'arraybuffer' });
fs.writeFileSync(__dirname + folder + imageName, imageResponse.data)
Axios returns a special object: https://github.com/axios/axios#response-schema
let {data} = await axios.get(...)
await fs.writeFile(filename, data) // you can use fs.promises instead of sync
As #Leau said you should include the extension on the filename
Another suggestion is to use the path module to create the filename:
filename = path.join(__dirname, "/Folder", "Test.png")

Javascript - Open PDF blob in browser with a nice looking url

I am using Node to grab a PDF from the server and send it to my React frontend. Then I want to display that PDF in the browser in a new tab. It's working fairly well, except the URL of the new tab with the PDF is not ideal. The URL of the new tab looks like: blob:http://localhost:3000/71659 but I would like it to look like http://localhost:3000/71659.pdf. No 'blob' and with a pdf extension like when I would click on a pdf on the web just like the examples here: https://file-examples.com/index.php/sample-documents-download/sample-pdf-download/
My current code that handles the saving of the blob and opening it is this:
.then((res) => {
console.log(res);
const file = new Blob([res.data], {
type: 'application/pdf'
});
//Build a URL from the file
const fileURL = URL.createObjectURL(file);
window.open(fileURL, '_blank');
});
And this is my Node route that sends the stream:
router.get('/openPDFFile', async (req, res) => {
console.log('we got to the server!! with: ', req.query);
const pdfFilename = req.query.pdf;
const pdfFilepath = `./path/to/pdf/${pdfFilename}`;
router.get();
const src = fs.createReadStream(pdfFilepath);
res.writeHead(200, {
'Content-Type': 'application/pdf',
'Content-Disposition': 'inline; filename=sample.pdf'
});
src.pipe(res);
});
Now I'm wondering if instead of sending the stream over the wire and converting it to a blob, if I can just simply create a route to that PDF from Node. Something like /PDF/${pdfFilename}. And then my React will just open that URL in a new tab?
Update - Here is my latest Node route based on x00's answer:
router.get('/openPDFFile', async (req, res) => {
console.log('we got to the server!! with: ', req.query);
const pretty_PDF_name = req.query.pdf;
const pdfFilename = (await SDS.getPDFFileName({ pretty_PDF_name }))
.dataValues.sheet_file_name;
console.log('pdfFilename: ', pdfFilename);
const cleanPDFName =
pretty_PDF_name
.substring(0, pretty_PDF_name.length - 4)
.replace(/[ ,.]/g, '') + '.pdf';
const pdfFilepath = '\\path\\to\\file\\' + pdfFilename;
const fullFilePath = path.join(__dirname + '/../' + pdfFilepath);
console.log(cleanPDFName, fullFilePath);
router.get('/pdf/' + cleanPDFName, async (req, res) => {
res.sendFile(fullFilePath);
});
// router.get('/pdf/' + cleanPDFName, express.static(fullFilePath));
// const src = fs.createReadStream(pdfFilepath);
//
// res.writeHead(200, {
// 'Content-Type': 'application/pdf',
// 'Content-Disposition': 'inline; filename=sample.pdf'
// });
//
// src.pipe(res);
// return res.json({ fileuploaded: cleanPDFName });
});
I had seen the express.static way as well and was trying that too.
As I understood from the comments you don't have any special requirements (at least you didn't mention any when answering my comment). So you can just do this:
client
window.open(`/pdf/${pdfFilepath}`, '_blank');
// no need for
// fetch('/openPDFFile', ... pdf: pdfFilepath ... })
// .then(res => ... Blob ... )
// or whatever you where doing
server
router.get('/pdf/:pdfFilename', async (req, res) => {
...
res.sendFile(__dirname + `/path/to/pdf/${req.params.pdfFilename}`)
})
As a result you'll get url in the form of http://localhost:3000/pdf/71659.pdf. Also you can get the url without /pdf/ part, but I don't see any reason for that.
Update
About the colon: see "Route parameters" section here https://expressjs.com/en/guide/routing.html
Full working example:
<!DOCTYPE html>
<html lang="en">
<head></head>
<body>
<div id="get_pdf">Get PDF</div>
</body>
<script>
// here can be your business logic
// for example the name of pdf can be entered by the user through <input>
const pdfFile = "1"
// click will open new window with url = `http://localhost:3000/pdf/1.pdf`
document
.getElementById("get_pdf")
.addEventListener("click", () => {
window.open(`http://localhost:3000/pdf/${pdfFile}.pdf`, '_blank')
// if you want the ".pdf" extension on the url - you must add it yourself
})
</script>
</html>
const express = require('express')
const app = express()
// Serve a PDF by name: GET /pdf/1.pdf -> <this dir>/path/to/pdf/1.pdf
app.get('/pdf/:pdf', async (req, res) => {
const requested_pdf = req.params.pdf // === "1.pdf"
console.log(requested_pdf)
// here can be your business logic for mapping
// requested_pdf from request to filepath of pdf
// or maybe even to generated pdf with no file underneath
// but I'll simply map to some static path
const map_to_pdf_path = name => __dirname + `/path/to/pdf/${name}`
res.sendFile(map_to_pdf_path(requested_pdf))
})
// The demo page opens http://localhost:3000/pdf/..., so fall back to port 3000
// when PORT is not set (the example never defined a `constants` object).
const listener = app.listen(process.env.PORT || 3000, err => {
if (err) return console.error(err)
console.log(`Find the server at: http://localhost:${listener.address().port}`)
})
You can get a pretty filename if you hijack a bit of DOM for your purposes as indicated in this older solution, but you'll hit a number of issues in different browsers. The FileSaver.js project is probably your best bet for a near-universal support for what you're trying to accomplish. It handles blob downloads with names in a cross-browser way, and even offers some fallback options if you need IE <10 support.
This is an EASY way to do it... By no means am I saying that this is a good way to do it, however.
You can change the name of the URL after it has loaded using:
window.history.pushState("","","/71659.pdf");
Assuming that you can load the pdf by already going to that url, this is all you would have to do. (you wouldn't want people who are sharing that url to be sharing a broken url) Otherwise, you would need to make a new route that will accept your desired url.
If you want to you could add some error checking to see if the loaded url is the one that you want to change using:
window.location.href

Discord.js - 'Not a well formed URL' in embed

In Node.js, I'm trying to send an image as the thumbnail in the top right of an embed. But if I just put the url as it is into the embed.setThumbnail() function, the image loads forever or cannot be loaded. The url would be this one:
https://static-cdn.jtvnw.net/ttv-boxart/./Oddworld:%20Abe%27s%20Oddysee-140x180.jpg
I noticed that the special characters are making problems because urls without those work perfectly fine. So I tried to encode the url with
const querystring = require('querystring');
var boxart_url = 'https://static-cdn.jtvnw.net/ttv-boxart/./Oddworld:%20Abe%27s%20Oddysee-140x180.jpg';
const embed = new Discord.MessageEmbed();
embed.setThumbnail(querystring.stringify(boxart_url));
But this still gives me the same error. The same goes when I try to encode the filename in the url only or when I try to use querystring.escape(boxart_url). So do you know, how to encode the url? :)
Edit: As mentioned by Karizma, I tried to send the embed with just the url as the following:
const boxart_url = "https://static-cdn.jtvnw.net/ttv-boxart/./Oddworld:%20Abe%27s%20Oddysee-140x180.jpg";
const embeded = new Discord.MessageEmbed();
embeded.setThumbnail(boxart_url);
message.channel.send({embed: embeded});
The problem remains the same (https://imgur.com/a/swoSweH)
Edit: After some experimenting, I at least found that the spaces are not the problem. It's the colon and the apostrophe. I tried several things, like replacing the apostrophe with
var boxart_url = "https://static-cdn.jtvnw.net/ttv-boxart/./" + encodeURIComponent("Oddworld: Abe's Oddysee-140x180.jpg").replace(/'/g, '%23');
or different version of encodeURI() / encodeURIComponent() and I also tried to use a simple object as embed like
const embeded = {
thumbnail: {
url: boxart_url
}
}
but nothing worked so far.
Edit: I came across my own solution. I cannot retrieve the image from the url with the embeds so I download the image locally before and then use this image as attachment for the embed. This seems to work :) i'll share the code here in case somebody else has the same problem.
const request = require('request'); //depricated !
const fs = require('fs');
const directory = "../data/images/boxart/";
// Downloads `url` into `filename`, then invokes `callback` exactly once:
// with no arguments on success, or with the error if the request or the
// file write fails.
var download = function(url, filename, callback){
  var finished = false;
  // Errors are usually followed by a 'close' event; make sure only the first
  // outcome reaches the caller.
  var done = function(err){
    if (finished) return;
    finished = true;
    if (err) callback(err); else callback();
  };
  request.head(url, function(err, res, body){
    // Bail out before creating the file when the HEAD request already failed.
    if (err) return done(err);
    request(url)
      .on('error', done)
      .pipe(fs.createWriteStream(filename))
      .on('error', done)
      .on('close', function(){ done(); });
  });
};
download('https://static-cdn.jtvnw.net/ttv-boxart/./Oddworld:%20Abe%27s%20Oddysee-70x99.jpg', directory + 'oddysee.jpg', function(){
console.log('image downloaded');
});
let boxart_url = "data/images/boxart/oddysee.jpg"; // locally
const filename = boxart_url.substring( boxart_url.lastIndexOf('/') + 1 );
const file = new Discord.MessageAttachment(boxart_url);
const embeded = new Discord.MessageEmbed();
embeded.setThumbnail('attachment://'+filename);
message.channel.send({files: [file], embed: embeded});

Node.js get file extension

Im creating a file upload function in node.js with express 3.
I would like to grab the file extension of the image. so i can rename the file and then append the file extension to it.
app.post('/upload', function(req, res, next) {
var is = fs.createReadStream(req.files.upload.path),
fileExt = '', // I want to get the extension of the image here
os = fs.createWriteStream('public/images/users/' + req.session.adress + '.' + fileExt);
});
How can i get the extension of the image in node.js?
I believe you can do the following to get the extension of a file name.
var path = require('path')
path.extname('index.html')
// returns
'.html'
If you would like to get all extensions in a file name (e.g. filename.css.gz => css.gz), try this:
const ext = 'filename.css.gz'
.split('.')
.filter(Boolean) // removes empty extensions (e.g. `filename...txt`)
.slice(1)
.join('.')
console.log(ext) // prints 'css.gz'
Update
Since the original answer, extname() has been added to the path module, see Snowfish answer
Original answer:
I'm using this function to get a file extension, because I didn't find a way to do it in an easier way (but I think there is) :
/**
 * Return the extension of `filename` without the leading dot.
 * Falsy input is treated as an empty name and yields ''.
 */
function getExtension(filename) {
  const dotted = path.extname(filename || '');
  // Everything after the (only) dot in extname's result; '' when there is none.
  return dotted.slice(dotted.lastIndexOf('.') + 1);
}
you must require 'path' to use it.
another method which does not use the path module :
/**
 * Return the extension of `filename` including the leading dot,
 * or '' when the name contains no dot at all.
 */
function getExtension(filename) {
  const dotAt = filename.lastIndexOf('.');
  if (dotAt === -1) {
    return '';
  }
  return filename.slice(dotAt);
}
// you can send full url here
/**
 * Return the extension (without the dot) of a file name, path or full URL.
 *
 * A query string or fragment is ignored, only the last path segment is
 * inspected, and '' is returned when that segment has no dot, instead of
 * echoing the input back.
 *
 * @param {string} filename - File name, path or URL.
 * @returns {string} Extension without the leading dot, or ''.
 */
function getExtension(filename) {
  // Cut "?query" / "#hash" first, then keep what follows the last slash so
  // dots in the host or in directory names are not mistaken for an extension.
  const lastSegment = filename.split(/[?#]/)[0].split('/').pop();
  const dotAt = lastSegment.lastIndexOf('.');
  return dotAt === -1 ? '' : lastSegment.slice(dotAt + 1);
}
If you are using express please add the following line when configuring middleware (bodyParser)
app.use(express.bodyParser({ keepExtensions: true}));
It's a lot more efficient to use the substr() method instead of split() & pop()
Have a look at the performance differences here: http://jsperf.com/remove-first-character-from-string
// returns: 'html'
var path = require('path');
path.extname('index.html').substr(1);
Update August 2019
As pointed out by #xentek in the comments; substr() is now considered a legacy function (MDN documentation). You can use substring() instead. The difference between substr() and substring() is that the second argument of substr() is the maximum length to return while the second argument of substring() is the index to stop at (without including that character). Also, substr() accepts negative start positions to be used as an offset from the end of the string while substring() does not.
This solution supports querystrings!
var Url = require('url');
var Path = require('path');
var url = 'http://i.imgur.com/Mvv4bx8.jpg?querystring=true';
// The WHATWG URL class supersedes the legacy url.parse(); its .pathname never
// contains the query string, so extname only sees "/Mvv4bx8.jpg".
var result = Path.extname(new URL(url).pathname); // '.jpg'
You can use path.parse(path), for example
const path = require('path');
const { ext } = path.parse('/home/user/dir/file.txt');
A simple solution without need for require which solves the multiple period extension problem:
var filename = 'file.with.long.extension';
var ext = filename.substring(filename.indexOf('.'));
//ext = '.with.long.extension'
Or if you don't want the leading dot:
var filename = 'file.with.long.extension';
var ext = filename.substring(filename.indexOf('.')+1);
//ext = 'with.long.extension'
Make sure to test that the file has an extension too.
import extname in order to return the extension the file:
import { extname } from 'path';
extname(file.originalname);
where file is the file 'name' of form
I do think mapping the Content-Type header in the request will also work. This will work even for cases when you upload a file with no extension.
(when filename does not have an extension in the request)
Assume you are sending your data using HTTP POST:
POST /upload2 HTTP/1.1
Host: localhost:7098
Connection: keep-alive
Content-Length: 1047799
Accept: */*
Origin: http://localhost:63342
User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36
Content-Type: multipart/form-data; boundary=---- WebKitFormBoundaryPDULZN8DYK3VppPp
Referer: http://localhost:63342/Admin/index.html? _ijt=3a6a054pasorvrljf8t8ea0j4h
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.8,az;q=0.6,tr;q=0.4
Request Payload
------WebKitFormBoundaryPDULZN8DYK3VppPp
Content-Disposition: form-data; name="image"; filename="blob"
Content-Type: image/png
------WebKitFormBoundaryPDULZN8DYK3VppPp--
Here, the Content-Type header contains the MIME type of the data.
Mapping this mime type to an extension will get you the file extension :).
Restify BodyParser converts this header in to a property with name type
File {
domain:
Domain {
domain: null,
_events: { .... },
_eventsCount: 1,
_maxListeners: undefined,
members: [ ... ] },
_events: {},
_eventsCount: 0,
_maxListeners: undefined,
size: 1047621,
path: '/tmp/upload_2a4ac9ef22f7156180d369162ef08cb8',
name: 'blob',
**type: 'image/png'**,
hash: null,
lastModifiedDate: Wed Jul 20 2016 16:12:21 GMT+0300 (EEST),
_writeStream:
WriteStream {
... },
writable: true,
domain:
Domain {
...
},
_events: {},
_eventsCount: 0,
_maxListeners: undefined,
path: '/tmp/upload_2a4ac9ef22f7156180d369162ef08cb8',
fd: null,
flags: 'w',
mode: 438,
start: undefined,
pos: undefined,
bytesWritten: 1047621,
closed: true }
}
You can use this header and do the extension mapping (substring etc ...) manually, but there are also ready made libraries for this. Below two were the top results when i did a google search
mime
mime-types
and their usage is simple as well:
// Log the extension that corresponds to the uploaded part's MIME type.
app.post('/upload2', function (req, res) {
  // `type` holds the part's Content-Type, e.g. 'image/png'
  console.log(mime.extension(req.files.image.type));
});
above snippet will print png to console.
A one liner which extends String.prototype:
Object.defineProperty(String.prototype, "ext", {get: function(x) {return this.split('.').pop()}})
str = 'fox.fbx';
str.ext
Result:
path.extname will do the trick in most cases. However, it will include everything after the last ., including the query string and hash fragment of an http request:
var path = require('path')
var extname = path.extname('index.html?username=asdf')
// extname contains '.html?username=asdf'
In such instances, you'll want to try something like this:
var regex = /[#\\?]/g; // regex of illegal extension characters
var extname = path.extname('index.html?username=asdf');
var endOfExt = extname.search(regex);
if (endOfExt > -1) {
extname = extname.substring(0, endOfExt);
}
// extname contains '.html'
Note that extensions with multiple periods (such as .tar.gz), will not work at all with path.extname.
// Take the text after the last dot of the uploaded file's name.
var fileName = req.files.upload.name;
var arr = fileName.split('.');
// Index with arr.length (a bare `length` is not defined here); a name without
// any dot splits into a single element and therefore has no extension.
var extension = arr.length > 1 ? arr[arr.length - 1] : '';
const path = require('path');
/**
 * Return the full (possibly multi-part) extension of a path, e.g. ".tar.gz".
 * Leading dots of hidden files are ignored and anything following the
 * alphanumeric run of the final extension (such as "~" or "?a=b") is dropped.
 */
function getExt(str) {
  // Strip the dots that mark a hidden file so they are not read as separators.
  const visibleName = path.basename(str).replace(/^(\.+)/i, '');
  const tail = path.extname(visibleName).replace(/(\.[a-z0-9]+).*/i, '$1');
  const start = visibleName.indexOf('.');
  const end = visibleName.lastIndexOf('.');
  // With a single dot (or none) there are no inner extension parts to prepend.
  const middle = start === end ? '' : visibleName.slice(start, end);
  return middle + tail;
}
const files = [
'/home/charlike/bar/.hidden.tar.gz~', // ".tar.gz"
'/home/charlike/bar/file.tar.gz~', // ".tar.gz"
'/home/charlike/bar/file.tar.gz+cdf2', // ".tar.gz"
'/home/charlike/bar/file.tar.gz?quz=zaz', // ".tar.gz"
];
const fileAndExt = files.map((file) => [ file, getExt(file) ]);
console.log(JSON.stringify(fileAndExt, null, 2));
The following function splits the string and returns the name and extension no matter how many dots there are in the extension. It returns an empty string for the extension if there is none. Names that start with dots and/or white space work also.
/**
 * Split a file name into [base, extension].
 *
 * Surrounding whitespace is trimmed, leading dots stay part of the base, and
 * the extension is everything after the first remaining dot ('' when there is
 * none), so multi-part extensions such as "tar.gz" are returned whole.
 *
 * @param {string} name - File name to split.
 * @returns {[string, string]} The base name and the extension without its dot.
 */
function basext(name) {
  const trimmed = name.trim()
  // Leading dots (hidden files, "..name") belong to the base, not the extension
  const prefix = (trimmed.match(/^\.+/) || [''])[0]
  const rest = trimmed.slice(prefix.length)
  const index = rest.indexOf('.')
  // Decide on the dot position itself: comparing base with ext would wrongly
  // report no extension for names such as "txt.txt"
  if (index === -1) return [prefix + rest, '']
  return [prefix + rest.substring(0, index), rest.substring(index + 1)]
}
const [base, ext] = basext('hello.txt')
Try this one
const path = require('path');
/**
 * Return the full (possibly multi-part) extension of a path, e.g. ".gz" or
 * ".tar.gz". Trailing noise after the final extension's alphanumeric run
 * ("~", "+cdf2", "?quz=zaz") is removed.
 *
 * @param {string} str - File path.
 * @returns {string} Extension including the leading dot, or ''.
 */
function getExt(str) {
  // Drop the leading dots of hidden files; otherwise ".hidden.tar.gz" would
  // report ".hidden.tar.gz" as its extension.
  const basename = path.basename(str).replace(/^\.+/, '');
  const firstDot = basename.indexOf('.');
  const lastDot = basename.lastIndexOf('.');
  const extname = path.extname(basename).replace(/(\.[a-z0-9]+).*/i, '$1');
  if (firstDot === lastDot) {
    return extname;
  }
  // Prepend the inner parts (".tar" in "file.tar.gz") to the final extension.
  return basename.slice(firstDot, lastDot) + extname;
}
// all are `.gz`
console.log(getExt('/home/charlike/bar/file.gz'));
console.log(getExt('/home/charlike/bar/file.gz~'));
console.log(getExt('/home/charlike/bar/file.gz+cdf2'));
console.log(getExt('/home/charlike/bar/file.gz?quz=zaz'));
// all are `.tar.gz`
console.log(getExt('/home/charlike/bar/file.tar.gz'));
console.log(getExt('/home/charlike/bar/file.tar.gz~'));
console.log(getExt('/home/charlike/bar/file.tar.gz+cdf2'));
console.log(getExt('/home/charlike/bar/file.tar.gz?quz=zaz'));

Categories

Resources