Extract hostname name from string - javascript

I would like to match just the root of a URL and not the whole URL from a text string. Given:
http://www.youtube.com/watch?v=ClkQA2Lb_iE
http://youtu.be/ClkQA2Lb_iE
http://www.example.com/12xy45
http://example.com/random
I want to get the 2 last instances resolving to the www.example.com or example.com domain.
I heard regex is slow and this would be my second regex expression on the page so If there is anyway to do it without regex let me know.
I'm seeking a JS/jQuery version of this solution.

A neat trick without using regular expressions:
var tmp = document.createElement ('a');
; tmp.href = "http://www.example.com/12xy45";
// tmp.hostname will now contain 'www.example.com'
// tmp.host will now contain hostname and port 'www.example.com:80'
Wrap the above in a function such as the below and you have yourself a superb way of snatching the domain part out of an URI.
function url_domain(data) {
var a = document.createElement('a');
a.href = data;
return a.hostname;
}

I give you 3 possible solutions:
Using an npm package psl that extract anything you throw at it.
Using my custom implementation extractRootDomain which works with most cases.
URL(url).hostname works, but not for every edge case. Click "Run Snippet" to see how it run against them.
1. Using npm package psl (Public Suffix List)
The "Public Suffix List" is a list of all valid domain suffixes and rules, not just Country Code Top-Level domains, but unicode characters as well that would be considered the root domain (i.e. www.食狮.公司.cn, b.c.kobe.jp, etc.). Read more about it here.
Try:
npm install --save psl
Then with my "extractHostname" implementation run:
let psl = require('psl');
let url = 'http://www.youtube.com/watch?v=ClkQA2Lb_iE';
psl.get(extractHostname(url)); // returns youtube.com
2. My Custom Implementation of extractRootDomain
Below is my implementation and it also runs against a variety of possible URL inputs.
function extractHostname(url) {
var hostname;
//find & remove protocol (http, ftp, etc.) and get hostname
if (url.indexOf("//") > -1) {
hostname = url.split('/')[2];
} else {
hostname = url.split('/')[0];
}
//find & remove port number
hostname = hostname.split(':')[0];
//find & remove "?"
hostname = hostname.split('?')[0];
return hostname;
}
// Warning: you can use this function to extract the "root" domain, but it will not be as accurate as using the psl package.
function extractRootDomain(url) {
var domain = extractHostname(url),
splitArr = domain.split('.'),
arrLen = splitArr.length;
//extracting the root domain here
//if there is a subdomain
if (arrLen > 2) {
domain = splitArr[arrLen - 2] + '.' + splitArr[arrLen - 1];
//check to see if it's using a Country Code Top Level Domain (ccTLD) (i.e. ".me.uk")
if (splitArr[arrLen - 2].length == 2 && splitArr[arrLen - 1].length == 2) {
//this is using a ccTLD
domain = splitArr[arrLen - 3] + '.' + domain;
}
}
return domain;
}
const urlHostname = url => {
try {
return new URL(url).hostname;
}
catch(e) { return e; }
};
const urls = [
"http://www.blog.classroom.me.uk/index.php",
"http://www.youtube.com/watch?v=ClkQA2Lb_iE",
"https://www.youtube.com/watch?v=ClkQA2Lb_iE",
"www.youtube.com/watch?v=ClkQA2Lb_iE",
"ftps://ftp.websitename.com/dir/file.txt",
"websitename.com:1234/dir/file.txt",
"ftps://websitename.com:1234/dir/file.txt",
"example.com?param=value",
"https://facebook.github.io/jest/",
"//youtube.com/watch?v=ClkQA2Lb_iE",
"www.食狮.公司.cn",
"b.c.kobe.jp",
"a.d.kyoto.or.jp",
"http://localhost:4200/watch?v=ClkQA2Lb_iE"
];
const test = (method, arr) => console.log(
`=== Testing "${method.name}" ===\n${arr.map(url => method(url)).join("\n")}\n`);
test(extractHostname, urls);
test(extractRootDomain, urls);
test(urlHostname, urls);
Regardless having the protocol or even port number, you can extract the domain. This is a very simplified, non-regex solution, so I think this will do given the data set we were provided in the question.
3. URL(url).hostname
URL(url).hostname is a valid solution but it doesn't work well with some edge cases that I have addressed. As you can see in my last test, it doesn't like some of the URLs. You can definitely use a combination of my solutions to make it all work though.
*Thank you #Timmerz, #renoirb, #rineez, #BigDong, #ra00l, #ILikeBeansTacos, #CharlesRobertson for your suggestions! #ross-allen, thank you for reporting the bug!

There is no need to parse the string, just pass your URL as an argument to URL constructor:
const url = 'http://www.youtube.com/watch?v=ClkQA2Lb_iE';
const { hostname } = new URL(url);
console.assert(hostname === 'www.youtube.com');

Try this:
var matches = url.match(/^https?\:\/\/([^\/?#]+)(?:[\/?#]|$)/i);
var domain = matches && matches[1]; // domain will be null if no match is found
If you want to exclude the port from your result, use this expression instead:
/^https?\:\/\/([^\/:?#]+)(?:[\/:?#]|$)/i
Edit: To prevent specific domains from matching, use a negative lookahead. (?!youtube.com)
/^https?\:\/\/(?!(?:www\.)?(?:youtube\.com|youtu\.be))([^\/:?#]+)(?:[\/:?#]|$)/i

There are two good solutions for this, depending on whether you need to optimize for performance or not (and without external dependencies!):
1. Use URL.hostname for readability
The cleanest and easiest solution is to use URL.hostname.
const getHostname = (url) => {
// use URL constructor and return hostname
return new URL(url).hostname;
}
// tests
console.log(getHostname("https://stackoverflow.com/questions/8498592/extract-hostname-name-from-string/"));
console.log(getHostname("https://developer.mozilla.org/en-US/docs/Web/API/URL/hostname"));
URL.hostname is part of the URL API, supported by all major browsers except IE (caniuse). Use a URL polyfill if you need to support legacy browsers.
Bonus: using the URL constructor will also give you access to other URL properties and methods!
2. Use RegEx for performance
URL.hostname should be your choice for most use cases. However, it's still much slower than this regex (test it yourself on jsPerf):
const getHostnameFromRegex = (url) => {
// run against regex
const matches = url.match(/^https?\:\/\/([^\/?#]+)(?:[\/?#]|$)/i);
// extract hostname (will be null if no match is found)
return matches && matches[1];
}
// tests
console.log(getHostnameFromRegex("https://stackoverflow.com/questions/8498592/extract-hostname-name-from-string/"));
console.log(getHostnameFromRegex("https://developer.mozilla.org/en-US/docs/Web/API/URL/hostname"));
TL;DR
You should probably use URL.hostname. If you need to process an incredibly large number of URLs (where performance would be a factor), consider RegEx.

Parsing a URL can be tricky because you can have port numbers and special chars. As such, I recommend using something like parseUri to do this for you. I doubt performance is going to be a issue unless you are parsing hundreds of URLs.

I tried to use the Given solutions, the Chosen one was an overkill for my purpose and "Creating a element" one messes up for me.
It's not ready for Port in URL yet. I hope someone finds it useful
function parseURL(url){
parsed_url = {}
if ( url == null || url.length == 0 )
return parsed_url;
protocol_i = url.indexOf('://');
parsed_url.protocol = url.substr(0,protocol_i);
remaining_url = url.substr(protocol_i + 3, url.length);
domain_i = remaining_url.indexOf('/');
domain_i = domain_i == -1 ? remaining_url.length - 1 : domain_i;
parsed_url.domain = remaining_url.substr(0, domain_i);
parsed_url.path = domain_i == -1 || domain_i + 1 == remaining_url.length ? null : remaining_url.substr(domain_i + 1, remaining_url.length);
domain_parts = parsed_url.domain.split('.');
switch ( domain_parts.length ){
case 2:
parsed_url.subdomain = null;
parsed_url.host = domain_parts[0];
parsed_url.tld = domain_parts[1];
break;
case 3:
parsed_url.subdomain = domain_parts[0];
parsed_url.host = domain_parts[1];
parsed_url.tld = domain_parts[2];
break;
case 4:
parsed_url.subdomain = domain_parts[0];
parsed_url.host = domain_parts[1];
parsed_url.tld = domain_parts[2] + '.' + domain_parts[3];
break;
}
parsed_url.parent_domain = parsed_url.host + '.' + parsed_url.tld;
return parsed_url;
}
Running this:
parseURL('https://www.facebook.com/100003379429021_356001651189146');
Result:
Object {
domain : "www.facebook.com",
host : "facebook",
path : "100003379429021_356001651189146",
protocol : "https",
subdomain : "www",
tld : "com"
}

If you end up on this page and you are looking for the best REGEX of URLS try this one:
^(?:https?:)?(?:\/\/)?([^\/\?]+)
https://regex101.com/r/pX5dL9/1
You can use it like below and also with case insensitive manner to match with HTTPS and HTTP as well.:
const match = str.match(/^(?:https?:)?(?:\/\/)?([^\/\?]+)/i);
const hostname = match && match[1];
It works for urls without http:// , with http, with https, with just // and dont grab the path and query path as well.
Good Luck

All url properties, no dependencies, no JQuery, easy to understand
This solution gives your answer plus additional properties. No JQuery or other dependencies required, paste and go.
Usage
getUrlParts("https://news.google.com/news/headlines/technology.html?ned=us&hl=en")
Output
{
"origin": "https://news.google.com",
"domain": "news.google.com",
"subdomain": "news",
"domainroot": "google.com",
"domainpath": "news.google.com/news/headlines",
"tld": ".com",
"path": "news/headlines/technology.html",
"query": "ned=us&hl=en",
"protocol": "https",
"port": 443,
"parts": [
"news",
"google",
"com"
],
"segments": [
"news",
"headlines",
"technology.html"
],
"params": [
{
"key": "ned",
"val": "us"
},
{
"key": "hl",
"val": "en"
}
]
}
Code
The code is designed to be easy to understand rather than super fast. It can be called easily 100 times per second, so it's great for front end or a few server usages, but not for high volume throughput.
function getUrlParts(fullyQualifiedUrl) {
var url = {},
tempProtocol
var a = document.createElement('a')
// if doesn't start with something like https:// it's not a url, but try to work around that
if (fullyQualifiedUrl.indexOf('://') == -1) {
tempProtocol = 'https://'
a.href = tempProtocol + fullyQualifiedUrl
} else
a.href = fullyQualifiedUrl
var parts = a.hostname.split('.')
url.origin = tempProtocol ? "" : a.origin
url.domain = a.hostname
url.subdomain = parts[0]
url.domainroot = ''
url.domainpath = ''
url.tld = '.' + parts[parts.length - 1]
url.path = a.pathname.substring(1)
url.query = a.search.substr(1)
url.protocol = tempProtocol ? "" : a.protocol.substr(0, a.protocol.length - 1)
url.port = tempProtocol ? "" : a.port ? a.port : a.protocol === 'http:' ? 80 : a.protocol === 'https:' ? 443 : a.port
url.parts = parts
url.segments = a.pathname === '/' ? [] : a.pathname.split('/').slice(1)
url.params = url.query === '' ? [] : url.query.split('&')
for (var j = 0; j < url.params.length; j++) {
var param = url.params[j];
var keyval = param.split('=')
url.params[j] = {
'key': keyval[0],
'val': keyval[1]
}
}
// domainroot
if (parts.length > 2) {
url.domainroot = parts[parts.length - 2] + '.' + parts[parts.length - 1];
// check for country code top level domain
if (parts[parts.length - 1].length == 2 && parts[parts.length - 1].length == 2)
url.domainroot = parts[parts.length - 3] + '.' + url.domainroot;
}
// domainpath (domain+path without filenames)
if (url.segments.length > 0) {
var lastSegment = url.segments[url.segments.length - 1]
var endsWithFile = lastSegment.indexOf('.') != -1
if (endsWithFile) {
var fileSegment = url.path.indexOf(lastSegment)
var pathNoFile = url.path.substr(0, fileSegment - 1)
url.domainpath = url.domain
if (pathNoFile)
url.domainpath = url.domainpath + '/' + pathNoFile
} else
url.domainpath = url.domain + '/' + url.path
} else
url.domainpath = url.domain
return url
}

Just use the URL() constructor:
new URL(url).host

function hostname(url) {
var match = url.match(/:\/\/(www[0-9]?\.)?(.[^/:]+)/i);
if ( match != null && match.length > 2 && typeof match[2] === 'string' && match[2].length > 0 ) return match[2];
}
The above code will successfully parse the hostnames for the following example urls:
http://WWW.first.com/folder/page.html
first.com
http://mail.google.com/folder/page.html
mail.google.com
https://mail.google.com/folder/page.html
mail.google.com
http://www2.somewhere.com/folder/page.html?q=1
somewhere.com
https://www.another.eu/folder/page.html?q=1
another.eu
Original credit goes to: http://www.primaryobjects.com/CMS/Article145

Was looking for a solution to this problem today. None of the above answers seemed to satisfy. I wanted a solution that could be a one liner, no conditional logic and nothing that had to be wrapped in a function.
Here's what I came up with, seems to work really well:
hostname="http://www.example.com:1234"
hostname.split("//").slice(-1)[0].split(":")[0].split('.').slice(-2).join('.') // gives "example.com"
May look complicated at first glance, but it works pretty simply; the key is using 'slice(-n)' in a couple of places where the good part has to be pulled from the end of the split array (and [0] to get from the front of the split array).
Each of these tests return "example.com":
"http://example.com".split("//").slice(-1)[0].split(":")[0].split('.').slice(-2).join('.')
"http://example.com:1234".split("//").slice(-1)[0].split(":")[0].split('.').slice(-2).join('.')
"http://www.example.com:1234".split("//").slice(-1)[0].split(":")[0].split('.').slice(-2).join('.')
"http://foo.www.example.com:1234".split("//").slice(-1)[0].split(":")[0].split('.').slice(-2).join('.')

Here's the jQuery one-liner:
$('<a>').attr('href', url).prop('hostname');

This is not a full answer, but the below code should help you:
function myFunction() {
var str = "https://www.123rf.com/photo_10965738_lots-oop.html";
matches = str.split('/');
return matches[2];
}
I would like some one to create code faster than mine. It help to improve my-self also.

Okay, I know this is an old question, but I made a super-efficient url parser so I thought I'd share it.
As you can see, the structure of the function is very odd, but it's for efficiency. No prototype functions are used, the string doesn't get iterated more than once, and no character is processed more than necessary.
function getDomain(url) {
var dom = "", v, step = 0;
for(var i=0,l=url.length; i<l; i++) {
v = url[i]; if(step == 0) {
//First, skip 0 to 5 characters ending in ':' (ex: 'https://')
if(i > 5) { i=-1; step=1; } else if(v == ':') { i+=2; step=1; }
} else if(step == 1) {
//Skip 0 or 4 characters 'www.'
//(Note: Doesn't work with www.com, but that domain isn't claimed anyway.)
if(v == 'w' && url[i+1] == 'w' && url[i+2] == 'w' && url[i+3] == '.') i+=4;
dom+=url[i]; step=2;
} else if(step == 2) {
//Stop at subpages, queries, and hashes.
if(v == '/' || v == '?' || v == '#') break; dom += v;
}
}
return dom;
}

oneline with jquery
$('<a>').attr('href', document.location.href).prop('hostname');

// use this if you know you have a subdomain
// www.domain.com -> domain.com
function getDomain() {
return window.location.hostname.replace(/([a-zA-Z0-9]+.)/,"");
}

String.prototype.trim = function(){return his.replace(/^\s+|\s+$/g,"");}
function getHost(url){
if("undefined"==typeof(url)||null==url) return "";
url = url.trim(); if(""==url) return "";
var _host,_arr;
if(-1<url.indexOf("://")){
_arr = url.split('://');
if(-1<_arr[0].indexOf("/")||-1<_arr[0].indexOf(".")||-1<_arr[0].indexOf("\?")||-1<_arr[0].indexOf("\&")){
_arr[0] = _arr[0].trim();
if(0==_arr[0].indexOf("//")) _host = _arr[0].split("//")[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
else return "";
}
else{
_arr[1] = _arr[1].trim();
_host = _arr[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
}
}
else{
if(0==url.indexOf("//")) _host = url.split("//")[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
else return "";
}
return _host;
}
function getHostname(url){
if("undefined"==typeof(url)||null==url) return "";
url = url.trim(); if(""==url) return "";
return getHost(url).split(':')[0];
}
function getDomain(url){
if("undefined"==typeof(url)||null==url) return "";
url = url.trim(); if(""==url) return "";
return getHostname(url).replace(/([a-zA-Z0-9]+.)/,"");
}

I personally researched a lot for this solution, and the best one I could find is actually from CloudFlare's "browser check":
function getHostname(){
secretDiv = document.createElement('div');
secretDiv.innerHTML = "<a href='/'>x</a>";
secretDiv = secretDiv.firstChild.href;
var HasHTTPS = secretDiv.match(/https?:\/\//)[0];
secretDiv = secretDiv.substr(HasHTTPS.length);
secretDiv = secretDiv.substr(0, secretDiv.length - 1);
return(secretDiv);
}
getHostname();
I rewritten variables so it is more "human" readable, but it does the job better than expected.

Well, doing using an regular expression will be a lot easier:
mainUrl = "http://www.mywebsite.com/mypath/to/folder";
urlParts = /^(?:\w+\:\/\/)?([^\/]+)(.*)$/.exec(mainUrl);
host = Fragment[1]; // www.mywebsite.com

in short way you can do like this
var url = "http://www.someurl.com/support/feature"
function getDomain(url){
domain=url.split("//")[1];
return domain.split("/")[0];
}
eg:
getDomain("http://www.example.com/page/1")
output:
"www.example.com"
Use above function to get domain name

This solution works well and you can also use if URL contains a lot of invalid characters.
install psl package
npm install --save psl
implementation
const psl = require('psl');
const url= new URL('http://www.youtube.com/watch?v=ClkQA2Lb_iE').hostname;
const parsed = psl.parse(url);
console.log(parsed)
output:
{
input: 'www.youtube.com',
tld: 'com',
sld: 'youtube',
domain: 'youtube.com',
subdomain: 'www',
listed: true
}

import URL from 'url';
const pathname = URL.parse(url).path;
console.log(url.replace(pathname, ''));
this takes care of both the protocol.

Code:
var regex = /\w+.(com|co\.kr|be)/ig;
var urls = ['http://www.youtube.com/watch?v=ClkQA2Lb_iE',
'http://youtu.be/ClkQA2Lb_iE',
'http://www.example.com/12xy45',
'http://example.com/random'];
$.each(urls, function(index, url) {
var convertedUrl = url.match(regex);
console.log(convertedUrl);
});
Result:
youtube.com
youtu.be
example.com
example.com

Parse-Urls appears to be the JavaScript library with the most robust patterns
Here is a rundown of the features:
Chapter 1. Normalize or parse one URL
Chapter 2. Extract all URLs
Chapter 3. Extract URIs with certain names
Chapter 4. Extract all fuzzy URLs
Chapter 5. Highlight all URLs in texts
Chapter 6. Extract all URLs in raw HTML or XML

parse-domain - a very solid lightweight library
npm install parse-domain
const { fromUrl, parseDomain } = require("parse-domain");
Example 1
parseDomain(fromUrl("http://www.example.com/12xy45"))
{ type: 'LISTED',
hostname: 'www.example.com',
labels: [ 'www', 'example', 'com' ],
icann:
{ subDomains: [ 'www' ],
domain: 'example',
topLevelDomains: [ 'com' ] },
subDomains: [ 'www' ],
domain: 'example',
topLevelDomains: [ 'com' ] }
Example 2
parseDomain(fromUrl("http://subsub.sub.test.ExAmPlE.coM/12xy45"))
{ type: 'LISTED',
hostname: 'subsub.sub.test.example.com',
labels: [ 'subsub', 'sub', 'test', 'example', 'com' ],
icann:
{ subDomains: [ 'subsub', 'sub', 'test' ],
domain: 'example',
topLevelDomains: [ 'com' ] },
subDomains: [ 'subsub', 'sub', 'test' ],
domain: 'example',
topLevelDomains: [ 'com' ] }
Why?
Depending on the use case and volume I strongly recommend against solving this problem yourself using regex or other string manipulation means. The core of this problem is that you need to know all the gtld and cctld suffixes to properly parse url strings into domain and subdomains, these suffixes are regularly updated. This is a solved problem and not one you want to solve yourself (unless you are google or something). Unless you need the hostname or domain name in a pinch don't try and parse your way out of this one.

A URL is schema://domain/path/to/resource?key=value#fragment so you could split on /:
/**
* Get root of URL
* #param {string} url - string to parse
* #returns {string} url root or empty string
*/
function getUrlRoot(url) {
return String(url || '').split('/').slice(0, 3).join('/');
}
Example:
getUrlRoot('http://www.youtube.com/watch?v=ClkQA2Lb_iE');
// returns http://www.youtube.com
getUrlRoot('http://youtu.be/ClkQA2Lb_iE');
// returns http://youtu.be
getUrlRoot('http://www.example.com/12xy45');
// returns http://www.example.com
getUrlRoot('http://example.com/random');
// returns http://example.com

My code looks like this.
Regular expressions can come in many forms, and here are my test cases
I think it's more scalable.
function extractUrlInfo(url){
let reg = /^((?<protocol>http[s]?):\/\/)?(?<host>((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])|[-a-zA-Z0-9#:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()#:%_\+.~#?&//=]*)))(\:(?<port>[0-9]|[1-9]\d|[1-9]\d{2}|[1-9]\d{3}|[1-5]\d{4}|6[0-4]\d{3}|65[0-4]\d{2}|655[0-2]\d|6553[0-5]))?$/
return reg.exec(url).groups
}
var url = "https://192.168.1.1:1234"
console.log(extractUrlInfo(url))
var url = "https://stackoverflow.com/questions/8498592/extract-hostname-name-from-string"
console.log(extractUrlInfo(url))

Try below code for exact domain name using regex,
String line = "http://www.youtube.com/watch?v=ClkQA2Lb_iE";
String pattern3="([\\w\\W]\\.)+(.*)?(\\.[\\w]+)";
Pattern r = Pattern.compile(pattern3);
Matcher m = r.matcher(line);
if (m.find( )) {
System.out.println("Found value: " + m.group(2) );
} else {
System.out.println("NO MATCH");
}

Related

regex detect url and prepend http:// [duplicate]

This question already has answers here:
Adding http:// to all links without a protocol
(4 answers)
Closed 8 years ago.
I would like to detect url's that are entered in a text input. I have the following code which prepends http:// to the beginning of what has been entered:
var input = $(this);
var val = input.val();
if (val && !val.match(/^http([s]?):\/\/.*/)) {
input.val('http://' + val);
}
How would I go about adapting this to only append the http:// if it contains a string followed by a tld? At the moment if I enter a string for example:
Hello. This is a test
the http:// will get appended to hello, even though it's not a url. Any help would be greatly appreciated.
This simple function works for me. We don't care about the real existence of a TLD domain to gain speed, rather we check the syntax like example.com.
Sorry, I've forgotten that VBA trim() is not intrinsic function in js, so:
// Removes leading whitespaces
function LTrim(value)
{
var re = /\s*((\S+\s*)*)/;
return value.replace(re, "$1");
}
// Removes ending whitespaces
function RTrim(value)
{
var re = /((\s*\S+)*)\s*/;
return value.replace(re, "$1");
}
// Removes leading and ending whitespaces
function trim(value)
{
return LTrim(RTrim(value));
}
function hasDomainTld(strAddress)
{
var strUrlNow = trim(strAddress);
if(strUrlNow.match(/[,\s]/))
{
return false;
}
var i, regex = new RegExp();
regex.compile("[A-Za-z0-9\-_]+\\.[A-Za-z0-9\-_]+$");
i = regex.test(strUrlNow);
regex = null;
return i;
}
So your code, $(this) is window object, so I pass the objInput through an argument, using classical js instead of jQuery:
function checkIt(objInput)
{
var val = objInput.value;
if(val.match(/http:/i)) {
return false;
}
else if (hasDomainTld(val)) {
objInput.value = 'http://' + val;
}
}
Please test yourself: http://jsfiddle.net/SDUkZ/8/
The best solution i have found is to use the following regex:
/\.[a-zA-Z]{2,3}/
This detects the . after the url, and characters for the extension with a limit of 2/3 characters.
Does this seem ok for basic validation? Please let me know if you see any problems that could arise.
I know that it will detect email address's but this wont matter in this instance.
You need to narrow down your requirements first as URL detection with regular expressions can be very tricky. These are just a few situations where your parser can fail:
IDNs (госуслуги.рф)
Punycode cases (xn--blah)
New TLD being registered (.amazon)
SEO-friendly URLs (domain.com/Everything you need to know about RegEx.aspx)
We recently faced a similar problem and what we ended up doing was a simple check whether the URL starts with either http://, https://, or ftp:// and prepending with http:// if it doesn't start with any of the mentioned schemes. Here's the implementation in TypeScript:
public static EnsureAbsoluteUri(uri: string): string {
var ret = uri || '', m = null, i = -1;
var validSchemes = ko.utils.arrayMap(['http', 'https', 'ftp'], (i) => { return i + '://' });
if (ret && ret.length) {
m = ret.match(/[a-z]+:\/\//gi);
/* Checking against a list of valid schemes and prepending with "http://" if check fails. */
if (m == null || !m.length || (i = $.inArray(m[0].toLowerCase(), validSchemes)) < 0 ||
(i >= 0 && ret.toLowerCase().indexOf(validSchemes[i]) != 0)) {
ret = 'http://' + ret;
}
}
return ret;
}
As you can see, we're not trying to be smart here as we can't predict every possible URL form. Furthermore, this method is usually executed against field values we know are meant to be URLs so the change of misdetection is minimal.
Hope this helps.

Check if a string starts with http using Javascript

I've been searching all over for an answer to this and all of the answers I've found haven't been in JavaScript.
I need a way, in javascript, to check if a string starts with http, https, or ftp. If it doesn't start with one of those I need to prepend the string with http://. indexOf won't work for me I don't think as I need either http, https or ftp. Also I don't want something like google.com/?q=http://google.com to trigger that as being valid as it doesn't start with an http whereas indexOf would trigger that as being true (if I'm not entirely mistaken).
The closest PHP regex I've found is this:
function addhttp($url) {
if (!preg_match("~^(?:f|ht)tps?://~i", $url)) {
$url = "http://" . $url;
}
return $url;
}
Source: How to add http if its not exists in the url
I just don't know how to convert that to javascript. Any help would be greatly appreciated.
export const getValidUrl = (url = "") => {
let newUrl = window.decodeURIComponent(url);
newUrl = newUrl.trim().replace(/\s/g, "");
if(/^(:\/\/)/.test(newUrl)){
return `http${newUrl}`;
}
if(!/^(f|ht)tps?:\/\//i.test(newUrl)){
return `http://${newUrl}`;
}
return newUrl;
};
Tests:
expect(getValidUrl('https://www.test.com')).toBe('https://www.test.com');
expect(getValidUrl('http://www.test.com')).toBe('http://www.test.com');
expect(getValidUrl(' http : / / www.test.com')).toBe('http://www.test.com');
expect(getValidUrl('ftp://www.test.com')).toBe('ftp://www.test.com');
expect(getValidUrl('www.test.com')).toBe('http://www.test.com');
expect(getValidUrl('://www.test.com')).toBe('http://www.test.com');
expect(getValidUrl('http%3A%2F%2Fwww.test.com')).toBe('http://www.test.com');
expect(getValidUrl('www . test.com')).toBe('http://www.test.com');
This should work:
var pattern = /^((http|https|ftp):\/\/)/;
if(!pattern.test(url)) {
url = "http://" + url;
}
jsFiddle
var url = "http://something.com"
if( url.indexOf("http") == 0 ) {
alert("yeah!");
} else {
alert("No no no!");
}
Non-Regex declarative way:
const hasValidUrlProtocol = (url = '') =>
['http://', 'https://', 'ftp://'].some(protocol => url.startsWith(protocol))
This should work:
var re = /^(http|https|ftp)/
Refining previous answers a bit more, I used new RegExp(...) to avoid messy escapes, and also added an optional s.
var pattern = new RegExp('^(https?|ftp)://');
if(!pattern.test(url)) {
url = "http://" + url;
}
var pattern = new RegExp('^(https?|ftp)://');
console.log(pattern.test('http://foo'));
console.log(pattern.test('https://foo'));
console.log(pattern.test('ftp://foo'));
console.log(pattern.test('bar'));
Best readability I'd say is to use .startsWith('theString').
The code below checks for both http://, https:// and ftp:// and sets okUrl to a boolean true if any of them comply, by using the || which means or.
When you know if the url contains none of those strings, then just just add http:// to the start of the string either with literals (${})
let url = 'google.com'
const urlOK = url.startsWith('http://') || url.startsWith('https://') || url.startsWith('ftp://')
if (!urlOk) url = `http://${url}`
// => 'http://google.com'
or with simple string concat, which can be done in various ways, for example:
url = 'http://' + url
Read more about it, and test it, here:
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith

Improving regex for parsing YouTube / Vimeo URLs

I've made a function (in JavaScript) that takes an URL from either YouTube or Vimeo. It figures out the provider and ID for that particular video (demo: http://jsfiddle.net/csjwf/).
function parseVideoURL(url) {
var provider = url.match(/http:\/\/(:?www.)?(\w*)/)[2],
id;
if(provider == "youtube") {
id = url.match(/http:\/\/(?:www.)?(\w*).com\/.*v=(\w*)/)[2];
} else if (provider == "vimeo") {
id = url.match(/http:\/\/(?:www.)?(\w*).com\/(\d*)/)[2];
} else {
throw new Error("parseVideoURL() takes a YouTube or Vimeo URL");
}
return {
provider : provider,
id : id
}
}
It works, however as a regex Novice, I'm looking for ways to improve it. The input I'm dealing with, typically looks like this:
http://vimeo.com/(id)
http://youtube.com/watch?v=(id)&blahblahblah.....
1) Right now I'm doing three separate matches, would it make sense to try and do everything in one single expression? If so, how?
2) Could the existing matches be more concise? Are they unnecessarily complex? or perhaps insufficient?
3) Are there any YouTube or Vimeo URL's that would fail being parsed? I've tried quite a few and so far it seems to work pretty well.
To summarize: I'm simply looking for ways improve the above function. Any advice is greatly appreciated.
Here's my attempt at the regex, which covers most updated cases:
function parseVideo(url) {
// - Supported YouTube URL formats:
// - http://www.youtube.com/watch?v=My2FRPA3Gf8
// - http://youtu.be/My2FRPA3Gf8
// - https://youtube.googleapis.com/v/My2FRPA3Gf8
// - Supported Vimeo URL formats:
// - http://vimeo.com/25451551
// - http://player.vimeo.com/video/25451551
// - Also supports relative URLs:
// - //player.vimeo.com/video/25451551
url.match(/(https?\/\/)(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com))\/(video\/|embed\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/);
var type = null;
if (RegExp.$3.indexOf('youtu') > -1) {
type = 'youtube';
} else if (RegExp.$3.indexOf('vimeo') > -1) {
type = 'vimeo';
}
return {
type: type,
id: RegExp.$6
};
}
Regex is wonderfully terse but can quickly get complicated.
http://jsfiddle.net/8nagx2sk/
function parseYouTube(str) {
// link : //youtube.com/watch?v=Bo_deCOd1HU
// share : //youtu.be/Bo_deCOd1HU
// embed : //youtube.com/embed/Bo_deCOd1HU
var re = /\/\/(?:www\.)?youtu(?:\.be|be\.com)\/(?:watch\?v=|embed\/)?([a-z0-9_\-]+)/i;
var matches = re.exec(str);
return matches && matches[1];
}
function parseVimeo(str) {
// embed & link: http://vimeo.com/86164897
var re = /\/\/(?:www\.)?vimeo.com\/([0-9a-z\-_]+)/i;
var matches = re.exec(str);
return matches && matches[1];
}
Sometimes simple code is nicer to your fellow developers.
https://jsfiddle.net/vkg02mhp/1/
// protocol and www nuetral
function getVideoId(str, prefixes) {
const cleaned = str.replace(/^(https?:)?\/\/(www\.)?/, '');
for(const prefix of prefixes) {
if (cleaned.startsWith(prefix))
return cleaned.substr(prefix.length)
}
return undefined;
}
function getYouTubeId(url) {
return getVideoId(url, [
'youtube.com/watch?v=',
'youtu.be/',
'youtube.com/embed/'
]);
}
function getVimeoId(url) {
return getVideoId(url, [
'vimeo.com/'
]);
}
Which do you prefer to update?
I am not sure about your question 3), but provided that your induction on the url forms is correct, the regexes can be combined into one as follows:
/http:\/\/(?:www.)?(?:(vimeo).com\/(.*)|(youtube).com\/watch\?v=(.*?)&)/
You will get the match under different positions (1st and 2nd matches if vimeo, 3rd and 4th matches if youtube), so you just need to handle that.
Or, if you are quite sure that vimeo's id only includes numbers, then you can do:
/http:\/\/(?:www.)?(vimeo|youtube).com\/(?:watch\?v=)?(.*?)(?:\z|&)/
and the provider and the id will apprear under 1st and 2nd match, respcetively.
Here is my regex
http://jsfiddle.net/csjwf/1/
For Vimeo, Don't rely on Regex as Vimeo tends to change/update their URL pattern every now and then. As of October 2nd, 2017, there are in total of six URL schemes Vimeo supports.
https://vimeo.com/*
https://vimeo.com/*/*/video/*
https://vimeo.com/album/*/video/*
https://vimeo.com/channels/*/*
https://vimeo.com/groups/*/videos/*
https://vimeo.com/ondemand/*/*
Instead, use their API to validate vimeo URLs. Here is this oEmbed (doc) API which takes an URL, checks its validity and return a object with bunch of video information(check out the dev page). Although not intended but we can easily use this to validate whether a given URL is from Vimeo or not.
So, with ajax it would look like this,
var VIMEO_BASE_URL = "https://vimeo.com/api/oembed.json?url=";
var yourTestUrl = "https://vimeo.com/23374724";
$.ajax({
url: VIMEO_BASE_URL + yourTestUrl,
type: 'GET',
success: function(data) {
if (data != null && data.video_id > 0)
// Valid Vimeo url
else
// not a valid Vimeo url
},
error: function(data) {
// not a valid Vimeo url
}
});
about sawa's answer :
a little update on the second regex :
/http:\/\/(?:www\.)?(vimeo|youtube)\.com\/(?:watch\?v=)?(.*?)(?:\z|$|&)/
(escaping the dots prevents from matching url of type www_vimeo_com/… and $ added…)
here is the same idea for matching the embed urls :
/http:\/\/(?:www\.|player\.)?(vimeo|youtube)\.com\/(?:embed\/|video\/)?(.*?)(?:\z|$|\?)/
FWIW, I just used the following to validate and parse both YouTube and Vimeo URLs in an app. I'm sure you could add parentheses to parse out the specific things you're looking for...
/^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$|^(https?:\/\/)?(www.)?(player.)?vimeo.com\/([a-z]*\/)*([0-9]{6,11})[?]?.*$/
^^ This is just a combination of 2 separate expressions using | (or) to join them. Here are the original 2 expressions separately:
/^(?:https?:\/\/)?(?:www\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))((\w|-){11})(?:\S+)?$/
/^(https?:\/\/)?(www.)?(player.)?vimeo.com\/([a-z]*\/)*([0-9]{6,11})[?]?.*$/
I'm no expert, but it seems to work according to Rubular. Hopefully this helps someone out in the future.
3) Your regex does not match https url's. I haven't tested it, but I guess the "http://" part would become "http(s)?://". Note that this would change the matching positions of the provider and id.
Just in case here is a php version
/*
* parseVideo
* #param (string) $url
* mi-ca.ch 27.05.2016
* parse vimeo & youtube id
* format url for iframe embed
* https://regex101.com/r/lA0fP4/1
*/
function parseVideo($url) {
$re = "/(http:|https:|)\\/\\/(player.|www.)?(vimeo\\.com|youtu(be\\.com|\\.be|be\\.googleapis\\.com))\\/(video\\/|embed\\/|watch\\?v=|v\\/)?([A-Za-z0-9._%-]*)(\\&\\S+)?/";
preg_match($re, $url, $matches);
if(strrpos($matches[3],'youtu')>-1){
$type='youtube';
$src='https://www.youtube.com/embed/'.$matches[6];
}else if(strrpos($matches[3],'vimeo')>-1){
$type="vimeo";
$src='https://player.vimeo.com/video/'.$matches[6];
}else{
return false;
}
return array(
'type' => $type // return youtube or vimeo
,'id' => $matches[6] // return the video id
,'src' => $src // return the src for iframe embed
);
}
I had a task to enable adding a dropbox videos. So the same input should take href, check it and transform to the playable link which I can then insert in .
const getPlayableUrl = (url) => {
// Check youtube and vimeo
let firstCheck = url.match(/(http:|https:|)\/\/(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com))\/(video\/|embed\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/);
if (firstCheck) {
if (RegExp.$3.indexOf('youtu') > -1) {
return "//www.youtube.com/embed/" + RegExp.$6;
} else if (RegExp.$3.indexOf('vimeo') > -1) {
return 'https://player.vimeo.com/video/' + RegExp.$6
}
} else {
// Check dropbox
let candidate = ''
if (url.indexOf('.mp4') !== -1) {
candidate = url.slice(0, url.indexOf('.mp4') + 4)
} else if (url.indexOf('.m4v') !== -1) {
candidate = url.slice(0, url.indexOf('.m4v') + 4)
} else if (url.indexOf('.webm') !== -1) {
candidate = url.slice(0, url.indexOf('.webm') + 5)
}
let secondCheck = candidate.match(/(http:|https:|)\/\/(player.|www.)?(dropbox\.com)\/(s\/|embed\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*\/)?(.*)/);
if (secondCheck) {
return 'https://dropbox.com/' + RegExp.$4 + RegExp.$5 + RegExp.$6 + '?raw=1'
} else {
throw Error("Not supported video resource.");
}
}
}
I based myself the previous answers but I needed more out the regex.
Maybe it worked in 2011 but in 2019 the syntax has changed a bit. So this is a refresh.
The regex will allow us to detect weather the url is Youtube or Vimeo.
I've added Capture group to easily retrieve the videoID.
If ran with Case insensitive setting please remove the (?i).
(?:(?i)(?:https:|http:)?\/\/)?(?:(?i)(?:www\.youtube\.com\/(?:embed\/|watch\?v=)|youtu\.be\/|youtube\.googleapis\.com\/v\/)(?<YoutubeID>[a-z0-9-_]{11,12})|(?:vimeo\.com\/|player\.vimeo\.com\/video\/)(?<VimeoID>[0-9]+))
https://regex101.com/r/PVdjg0/2
Use this Regex devs:This works like Makhan(react js,Javascript)
^(http\:\/\/|https\:\/\/)?((www\.)?(vimeo\.com\/)([0-9]+)$)|((www\.youtube\.com|youtu\.be)\/.+$)

js function to get filename from url

I have a url like http://www.example.com/blah/th.html
I need a javascript function to give me the 'th' value from that.
All my urls have the same format (2 letter filenames, with .html extension).
I want it to be a safe function, so if someone passes in an empty url it doesn't break.
I know how to check for length, but I should be checking for null to right?
var filename = url.split('/').pop()
Why so difficult?
var filename = url.split('/').pop().split('#')[0].split('?')[0];
Use the match function.
function GetFilename(url)
{
if (url)
{
var m = url.toString().match(/.*\/(.+?)\./);
if (m && m.length > 1)
{
return m[1];
}
}
return "";
}
Similar to the others, but...I've used Tom's simple script - a single line,
then you can use the filename var anywhere:
http://www.tomhoppe.com/index.php/2008/02/grab-filename-from-window-location/
var filename = location.pathname.substr(location.pathname.lastIndexOf("/")+1);
This should work for all cases
function getFilenameFromUrl(url) {
const pathname = new URL(url).pathname;
const index = pathname.lastIndexOf('/');
return pathname.substring(index + 1) // if index === -1 then index+1 will be 0
}
In addition to the existing answers, I would recommend using URL() constructor (works both in browsers and Node.js) because you can be sure your URL is valid:
const url = 'https://test.com/path/photo123.png?param1=1&param2=2#hash';
let filename = '';
try {
filename = new URL(url).pathname.split('/').pop();
} catch (e) {
console.error(e);
}
console.log(`filename: ${filename}`);
A regex solution which accounts for URL query and hash identifier:
function fileNameFromUrl(url) {
var matches = url.match(/\/([^\/?#]+)[^\/]*$/);
if (matches.length > 1) {
return matches[1];
}
return null;
}
JSFiddle here.
Because cases tend to fail with custom code, I looked up to the JavaScript URL class. Alas, it chokes on relative URLs! Also, it doesn't have a property to get the file name. Not epic.
There has to be a good library out there which solves this common problem. Behold URI.js. All you need is a simple statement like the following:
let file = new URI(url).filename()
Then we can create a simple function that does null checks and removes the file extension:
function fileName(url) {
if (url === null || typeof url === 'undefined')
return ''
let file = new URI(url).filename() // File name with file extension
return file.substring(0, file.lastIndexOf('.')) // Remove the extension
}
Here's a snippet with test cases to play around with. All cases pass except drive paths.
test('Dots in file name without URL', 'dotted.file.name.png', 'dotted.file.name')
test('Dots in file name with URL', 'http://example.com/file.name.txt', 'file.name')
test('Lengthy URL with parameters', '/my/folder/filename.html#dssddsdsd?toto=33&dududu=podpodpo', 'filename')
test('URL with hash', '/my/folder/filename.html#dssddsdsd', 'filename')
test('URL with query strings', '/my/folder/filename.html?toto=33&dududu=podpodp', 'filename')
test('Hash after query string', 'http://www.myblog.com/filename.php?year=2019#06', 'filename')
test('Query parameter with file path character', 'http://www.example.com/filename.zip?passkey=1/2', 'filename')
test('Query parameter with file path character and hash', 'http://www.example.com/filename.html?lang=en&user=Aan9u/o8ai#top', 'filename')
test('Asian characters', 'http://example.com/文件名.html', '文件名')
test('URL without file name', 'http://www.example.com', '')
test('Null', null, '')
test('Undefined', undefined, '')
test('Empty string', '', '')
test('Drive path name', 'C:/fakepath/filename.csv', 'filename')
function fileName(url) {
if (url === null || typeof url === 'undefined')
return ''
let file = new URI(url).filename() // File name with file extension
return file.substring(0, file.lastIndexOf('.')) // Remove the extension
}
function test(description, input, expected) {
let result = fileName(input)
let pass = 'FAIL'
if (result === expected)
pass = 'PASS'
console.log(pass + ': ' + description + ': ' + input)
console.log(' => "' + fileName(input) + '"')
}
<script src="https://cdn.jsdelivr.net/gh/medialize/URI.js#master/src/URI.js"></script>
Results
PASS: Dots in file name without URL: dotted.file.name.png
=> "dotted.file.name"
PASS: Dots in file name with URL: http://example.com/file.name.txt
=> "file.name"
PASS: Lengthy URL with parameters: /my/folder/filename.html#dssddsdsd?toto=33&dududu=podpodpo
=> "filename"
PASS: URL with hash: /my/folder/filename.html#dssddsdsd
=> "filename"
PASS: URL with query strings: /my/folder/filename.html?toto=33&dududu=podpodp
=> "filename"
PASS: Hash after query string: http://www.myblog.com/filename.php?year=2019#06
=> "filename"
PASS: Query parameter with file path character: http://www.example.com/filename.zip?passkey=1/2
=> "filename"
PASS: Query parameter with file path character and hash: http://www.example.com/filename.html?lang=en&user=Aan9u/o8ai#top
=> "filename"
PASS: Asian characters: http://example.com/文件名.html
=> "文件名"
PASS: URL without file name: http://www.example.com
=> ""
PASS: Null: null
=> ""
PASS: Undefined: undefined
=> ""
PASS: Empty string:
=> ""
FAIL: Drive path name: C:/fakepath/filename.csv
=> ""
This solution is for you if you're too lazy to write custom code and don't mind using a library to do work for you. It isn't for you if you want to code golf the solution.
those will not work for lenghty url like
"/my/folder/questions.html#dssddsdsd?toto=33&dududu=podpodpo"
here I expect to get "questions.html".
So a possible (slow) solution is as below
fname=function(url)
{ return url?url.split('/').pop().split('#').shift().split('?').shift():null }
then you can test that in any case you get only the filename.
fname("/my/folder/questions.html#dssddsdsd?toto=33&dududu=podpodpo")
-->"questions.html"
fname("/my/folder/questions.html#dssddsdsd")
-->"questions.html"
fname("/my/folder/questions.html?toto=33&dududu=podpodpo")
"-->questions.html"
(and it works for null)
(I would love to see a faster or smarter solution)
This answer only works in browser environment. Not suitable for node.
function getFilename(url) {
const filename = decodeURIComponent(new URL(url).pathname.split('/').pop());
if (!filename) return 'index.html'; // some default filename
return filename;
}
function filenameWithoutExtension(filename) {
return filename.replace(/^(.+?)(?:\.[^.]*)?$/, '$1');
}
Here are two functions:
first one get filename from url
second one get filename without extension from a full filename
For parsing URL, new an URL object should be the best choice. Also notice that URL do not always contain a filename.
Notice: This function try to resolve filename from an URL. But it do NOT guarantee that the filename is valid and suitable for use:
Some OS disallow certain character in filename (e.g. : in windows, \0 in most OS, ...);
Some filename may reserved by OS (e.g. CON in windows);
Some filename may make user unhappy to handle it (e.g. a file named "--help" in Linux)
Test it out:
function getFilename(url) {
const filename = decodeURIComponent(new URL(url).pathname.split('/').pop());
if (!filename) return 'index.html'; // some default filename
return filename;
}
function test(url) {
console.log('Filename: %o\nUrl: %o', getFilename(url), url);
}
test('http://www.example.com');
test('http://www.example.com/');
test('http://www.example.com/name.txt');
test('http://www.example.com/path/name.txt');
test('http://www.example.com/path/name.txt/realname.txt');
test('http://www.example.com/page.html#!/home');
test('http://www.example.com/page.html?lang=en&user=Aan9u/o8ai#top');
test('http://www.example.com/%E6%96%87%E4%BB%B6%E5%90%8D.txt')
I'd use the substring function combined with lastIndexOf. This will allow for filenames with periods in them e.g. given http://example.com/file.name.txt this gives file.name unlike the accepted answer that would give file.
function GetFilename(url)
{
if (url)
{
return url.substring(url.lastIndexOf("/") + 1, url.lastIndexOf("."));
}
return "";
}
Using jQuery with the URL plugin:
var file = jQuery.url.attr("file");
var fileNoExt = file.replace(/\.(html|htm)$/, "");
// file == "th.html", fileNoExt = "th"
For node and browsers, based on #pauls answer but solving issues with hash and more defensive:
export function getFileNameFromUrl(url) {
const hashIndex = url.indexOf('#')
url = hashIndex !== -1 ? url.substring(0, hashIndex) : url
return (url.split('/').pop() || '').replace(/[\?].*$/g, '')
}
Few cases:
describe('getFileNameFromUrl', () => {
it('absolute, hash and no extension', () => {
expect(getFileNameFromUrl(
'https://foo.bar/qs/bar/js-function-to-get-filename-from-url#comment95124061_53560218'))
.toBe('js-function-to-get-filename-from-url')
})
it('relative, extension and parameters', () => {
expect(getFileNameFromUrl('../foo.png?ar=8')).toBe('foo.png')
})
it('file name with multiple dots, hash with slash', () => {
expect(getFileNameFromUrl('questions/511761/js-function.min.js?bar=9.9&y=1#/src/jjj?=9.9')).toBe('js-function.min.js')
})
})
This should handle anything you throw at it (absolute URLs, relative URLs, complex AWS URLs, etc). It includes an optional default or uses a psuedorandom string if neither a filename nor a default were present.
function parseUrlFilename(url, defaultFilename = null) {
let filename = new URL(url, "https://example.com").href.split("#").shift().split("?").shift().split("/").pop(); //No need to change "https://example.com"; it's only present to allow for processing relative URLs.
if(!filename) {
if(defaultFilename) {
filename = defaultFilename;
//No default filename provided; use a pseudorandom string.
} else {
filename = Math.random().toString(36).substr(2, 10);
}
}
return filename;
}
Props to #hayatbiralem for nailing the order of the split()s.
Similarly to what #user2492653 suggested, if all you want is the name of the file like Firefox gives you, then you the split() method, which breaks the string into an array of components, then all you need to do it grab the last index.
var temp = url.split("//");
if(temp.length > 1)
return temp[temp.length-1] //length-1 since array indexes start at 0
This would basically break C:/fakepath/test.csv into {"C:", "fakepath", "test.csv"}
my 2 cents
the LastIndexOf("/") method in itself falls down if the querystrings contain "/"
We all know they "should" be encoded as %2F but it would only take one un-escaped value to cause problems.
This version correctly handles /'s in the querystrings and has no reliance on .'s in the url
function getPageName() {
//#### Grab the url
var FullUrl = window.location.href;
//#### Remove QueryStrings
var UrlSegments = FullUrl.split("?")
FullUrl = UrlSegments[0];
//#### Extract the filename
return FullUrl.substr(FullUrl.lastIndexOf("/") + 1);
}
Try this
url.substring(url.lastIndexOf('/')+1, url.length)
url? url.substring(url.lastIndexOf('/')+1, url.lastIndexOf('.')):''
Safety is as asked for. when url is null or undefined the result is ''.
Removes all of the path with ':', dots and any symbol including the last '/'.
This gives the true answer 'th' as asked and not 'th.index'. That is very important of course to have it work at all.
It allows filename to have several periods!
Not asked, but you can also have a query string without '/' and '.'
It is a corrected answer from Abhishek Sharma so I gave him an upvote. So genious and minimal one-liner - I saw it there :)
function getFileNameWithoutExtension(url) {
if (typeof url !== 'string') throw new Error('url must be a string');
// Remove the QueryString
return url.replace(/\?.*$/, '')
// Extract the filename
.split('/').pop()
// Remove the extension
.replace(/\.[^.]+$/, '');
}
This will return news from this URL http://www.myblog.com/news.php?year=2019#06.
ES6 syntax based on TypeScript
Actually, the marked answer is true but if the second if doesn't satisfy the function returns undefined, I prefer to write it like below:
const getFileNameFromUrl = (url: string): string => {
if (url) {
const tmp = url.split('/');
const tmpLength = tmp.length;
return tmpLength ? tmp[tmpLength - 1] : '';
}
return '';
};
For my problem, I need to have the file extension.
Simple Function (Using RegEx Pattern)
function pathInfo(s) {
s=s.match(/(.*?\/)?(([^/]*?)(\.[^/.]+?)?)(?:[?#].*)?$/);
return {path:s[1],file:s[2],name:s[3],ext:s[4]};
}
var sample='/folder/another/file.min.js?query=1';
var result=pathInfo(sample);
console.log(result);
/*
{
"path": "/folder/another/",
"file": "file.min.js",
"name": "file.min",
"ext": ".js"
}
*/
console.log(result.name);
Using RegEx
Let´s say you have this url:
http://127.0.0.1:3000/pages/blog/posts/1660345251.html
Using the following line of code:
var filename = location.pathname.replace(/[\D]/g, "");
Will return:
1660345251
Notice that this number is Unix Time, which returns your local time no matter where you are in the World (this should give you really unique post names for this blog example).
.replace(/[\D]/g, ""), replaces any non-digit character with an empty string. /[\D]/g says non-digit, and "" says empty string. More about it: here for numbers and here for letters.
More about RegEx, here. There are lots of RegEx tools out there that can help you out to get better results in the returning value for filename.
Extra code: Unix Time to Local Time
var humanDate = new Date(0);
var timestamp = entries[index].timestamp;
humanDate.setUTCSeconds(timestamp);
humanDate is for my local time:
Fri Aug 12 2022 20:00:51 GMT-0300 (Argentina Standard Time)
Credits for this code, here.
function getFilenameFromUrlString(url) {
const response = {
data: {
filename: ""
},
error: ""
};
try {
response.data.filename = new URL(url).pathname.split("/").pop();
} catch (e) {
response.error = e.toString();
}
return response;
}
For tests check this:
https://codesandbox.io/s/get-filename-from-url-string-wqthx1
from How to get the file name from a full path using JavaScript?
var filename = fullPath.replace(/^.*[\\\/]/, '')

How can I get file extensions with JavaScript?

See code:
var file1 = "50.xsl";
var file2 = "30.doc";
getFileExtension(file1); //returns xsl
getFileExtension(file2); //returns doc
function getFileExtension(filename) {
/*TODO*/
}
Newer Edit: Lots of things have changed since this question was initially posted - there's a lot of really good information in wallacer's revised answer as well as VisioN's excellent breakdown
Edit: Just because this is the accepted answer; wallacer's answer is indeed much better:
return filename.split('.').pop();
My old answer:
return /[^.]+$/.exec(filename);
Should do it.
Edit: In response to PhiLho's comment, use something like:
return (/[.]/.exec(filename)) ? /[^.]+$/.exec(filename) : undefined;
return filename.split('.').pop();
Edit:
This is another non-regex solution that I think is more efficient:
return filename.substring(filename.lastIndexOf('.')+1, filename.length) || filename;
There are some corner cases that are better handled by VisioN's answer below, particularly files with no extension (.htaccess etc included).
It's very performant, and handles corner cases in an arguably better way by returning "" instead of the full string when there's no dot or no string before the dot. It's a very well crafted solution, albeit tough to read. Stick it in your helpers lib and just use it.
Old Edit:
A safer implementation if you're going to run into files with no extension, or hidden files with no extension (see VisioN's comment to Tom's answer above) would be something along these lines
var a = filename.split(".");
if( a.length === 1 || ( a[0] === "" && a.length === 2 ) ) {
return "";
}
return a.pop(); // feel free to tack .toLowerCase() here if you want
If a.length is one, it's a visible file with no extension ie. file
If a[0] === "" and a.length === 2 it's a hidden file with no extension ie. .htaccess
This should clear up issues with the slightly more complex cases. In terms of performance, I think this solution is a little slower than regex in most browsers. However, for most common purposes this code should be perfectly usable.
The following solution is fast and short enough to use in bulk operations and save extra bytes:
return fname.slice((fname.lastIndexOf(".") - 1 >>> 0) + 2);
Here is another one-line non-regexp universal solution:
return fname.slice((Math.max(0, fname.lastIndexOf(".")) || Infinity) + 1);
Both work correctly with names having no extension (e.g. myfile) or starting with . dot (e.g. .htaccess):
"" --> ""
"name" --> ""
"name.txt" --> "txt"
".htpasswd" --> ""
"name.with.many.dots.myext" --> "myext"
If you care about the speed you may run the benchmark and check that the provided solutions are the fastest, while the short one is tremendously fast:
How the short one works:
String.lastIndexOf method returns the last position of the substring (i.e. ".") in the given string (i.e. fname). If the substring is not found method returns -1.
The "unacceptable" positions of dot in the filename are -1 and 0, which respectively refer to names with no extension (e.g. "name") and to names that start with dot (e.g. ".htaccess").
Zero-fill right shift operator (>>>) if used with zero affects negative numbers transforming -1 to 4294967295 and -2 to 4294967294, which is useful for remaining the filename unchanged in the edge cases (sort of a trick here).
String.prototype.slice extracts the part of the filename from the position that was calculated as described. If the position number is more than the length of the string method returns "".
If you want more clear solution which will work in the same way (plus with extra support of full path), check the following extended version. This solution will be slower than previous one-liners but is much easier to understand.
function getExtension(path) {
var basename = path.split(/[\\/]/).pop(), // extract file name from full path ...
// (supports `\\` and `/` separators)
pos = basename.lastIndexOf("."); // get last position of `.`
if (basename === "" || pos < 1) // if file name is empty or ...
return ""; // `.` not found (-1) or comes first (0)
return basename.slice(pos + 1); // extract extension ignoring `.`
}
console.log( getExtension("/path/to/file.ext") );
// >> "ext"
All three variants should work in any web browser on the client side and can be used in the server side NodeJS code as well.
function getFileExtension(filename)
{
var ext = /^.+\.([^.]+)$/.exec(filename);
return ext == null ? "" : ext[1];
}
Tested with
"a.b" (=> "b")
"a" (=> "")
".hidden" (=> "")
"" (=> "")
null (=> "")
Also
"a.b.c.d" (=> "d")
".a.b" (=> "b")
"a..b" (=> "b")
There is a standard library function for this in the path module:
import path from 'path';
console.log(path.extname('abc.txt'));
Output:
.txt
So, if you only want the format:
path.extname('abc.txt').slice(1) // 'txt'
If there is no extension, then the function will return an empty string:
path.extname('abc') // ''
If you are using Node, then path is built-in. If you are targetting the browser, then Webpack will bundle a path implementation for you. If you are targetting the browser without Webpack, then you can include path-browserify manually.
There is no reason to do string splitting or regex.
function getExt(filename)
{
var ext = filename.split('.').pop();
if(ext == filename) return "";
return ext;
}
var extension = fileName.substring(fileName.lastIndexOf('.')+1);
If you are dealing with web urls, you can use:
function getExt(filepath){
return filepath.split("?")[0].split("#")[0].split('.').pop();
}
getExt("../js/logic.v2.min.js") // js
getExt("http://example.net/site/page.php?id=16548") // php
getExt("http://example.net/site/page.html#welcome.to.me") // html
getExt("c:\\logs\\yesterday.log"); // log
Demo: https://jsfiddle.net/squadjot/q5ard4fj/
var parts = filename.split('.');
return parts[parts.length-1];
function file_get_ext(filename)
{
return typeof filename != "undefined" ? filename.substring(filename.lastIndexOf(".")+1, filename.length).toLowerCase() : false;
}
Code
/**
* Extract file extension from URL.
* #param {String} url
* #returns {String} File extension or empty string if no extension is present.
*/
var getFileExtension = function (url) {
"use strict";
if (url === null) {
return "";
}
var index = url.lastIndexOf("/");
if (index !== -1) {
url = url.substring(index + 1); // Keep path without its segments
}
index = url.indexOf("?");
if (index !== -1) {
url = url.substring(0, index); // Remove query
}
index = url.indexOf("#");
if (index !== -1) {
url = url.substring(0, index); // Remove fragment
}
index = url.lastIndexOf(".");
return index !== -1
? url.substring(index + 1) // Only keep file extension
: ""; // No extension found
};
Test
Notice that in the absence of a query, the fragment might still be present.
"https://www.example.com:8080/segment1/segment2/page.html?foo=bar#fragment" --> "html"
"https://www.example.com:8080/segment1/segment2/page.html#fragment" --> "html"
"https://www.example.com:8080/segment1/segment2/.htaccess?foo=bar#fragment" --> "htaccess"
"https://www.example.com:8080/segment1/segment2/page?foo=bar#fragment" --> ""
"https://www.example.com:8080/segment1/segment2/?foo=bar#fragment" --> ""
"" --> ""
null --> ""
"a.b.c.d" --> "d"
".a.b" --> "b"
".a.b." --> ""
"a...b" --> "b"
"..." --> ""
JSLint
0 Warnings.
Fast and works correctly with paths
(filename.match(/[^\\\/]\.([^.\\\/]+)$/) || [null]).pop()
Some edge cases
/path/.htaccess => null
/dir.with.dot/file => null
Solutions using split are slow and solutions with lastIndexOf don't handle edge cases.
// 获取文件后缀名
function getFileExtension(file) {
var regexp = /\.([0-9a-z]+)(?:[\?#]|$)/i;
var extension = file.match(regexp);
return extension && extension[1];
}
console.log(getFileExtension("https://www.example.com:8080/path/name/foo"));
console.log(getFileExtension("https://www.example.com:8080/path/name/foo.BAR"));
console.log(getFileExtension("https://www.example.com:8080/path/name/.quz/foo.bar?key=value#fragment"));
console.log(getFileExtension("https://www.example.com:8080/path/name/.quz.bar?key=value#fragment"));
i just wanted to share this.
fileName.slice(fileName.lastIndexOf('.'))
although this has a downfall that files with no extension will return last string.
but if you do so this will fix every thing :
function getExtention(fileName){
var i = fileName.lastIndexOf('.');
if(i === -1 ) return false;
return fileName.slice(i)
}
"one-liner" to get filename and extension using reduce and array destructuring :
var str = "filename.with_dot.png";
var [filename, extension] = str.split('.').reduce((acc, val, i, arr) => (i == arr.length - 1) ? [acc[0].substring(1), val] : [[acc[0], val].join('.')], [])
console.log({filename, extension});
with better indentation :
var str = "filename.with_dot.png";
var [filename, extension] = str.split('.')
.reduce((acc, val, i, arr) => (i == arr.length - 1)
? [acc[0].substring(1), val]
: [[acc[0], val].join('.')], [])
console.log({filename, extension});
// {
// "filename": "filename.with_dot",
// "extension": "png"
// }
There's also a simple approach using ES6 destructuring:
const path = 'hello.world.txt'
const [extension, ...nameParts] = path.split('.').reverse();
console.log('extension:', extension);
function extension(fname) {
var pos = fname.lastIndexOf(".");
var strlen = fname.length;
if (pos != -1 && strlen != pos + 1) {
var ext = fname.split(".");
var len = ext.length;
var extension = ext[len - 1].toLowerCase();
} else {
extension = "No extension found";
}
return extension;
}
//usage
extension('file.jpeg')
always returns the extension lower cas so you can check it on field change
works for:
file.JpEg
file (no extension)
file. (noextension)
This simple solution
function extension(filename) {
var r = /.+\.(.+)$/.exec(filename);
return r ? r[1] : null;
}
Tests
/* tests */
test('cat.gif', 'gif');
test('main.c', 'c');
test('file.with.multiple.dots.zip', 'zip');
test('.htaccess', null);
test('noextension.', null);
test('noextension', null);
test('', null);
// test utility function
function test(input, expect) {
var result = extension(input);
if (result === expect)
console.log(result, input);
else
console.error(result, input);
}
function extension(filename) {
var r = /.+\.(.+)$/.exec(filename);
return r ? r[1] : null;
}
I'm sure someone can, and will, minify and/or optimize my code in the future. But, as of right now, I am 200% confident that my code works in every unique situation (e.g. with just the file name only, with relative, root-relative, and absolute URL's, with fragment # tags, with query ? strings, and whatever else you may decide to throw at it), flawlessly, and with pin-point precision.
For proof, visit: https://projects.jamesandersonjr.com/web/js_projects/get_file_extension_test.php
Here's the JSFiddle: https://jsfiddle.net/JamesAndersonJr/ffcdd5z3/
Not to be overconfident, or blowing my own trumpet, but I haven't seen any block of code for this task (finding the 'correct' file extension, amidst a battery of different function input arguments) that works as well as this does.
Note: By design, if a file extension doesn't exist for the given input string, it simply returns a blank string "", not an error, nor an error message.
It takes two arguments:
String: fileNameOrURL (self-explanatory)
Boolean: showUnixDotFiles (Whether or Not to show files that begin with a dot ".")
Note (2): If you like my code, be sure to add it to your js library's, and/or repo's, because I worked hard on perfecting it, and it would be a shame to go to waste. So, without further ado, here it is:
function getFileExtension(fileNameOrURL, showUnixDotFiles)
{
/* First, let's declare some preliminary variables we'll need later on. */
var fileName;
var fileExt;
/* Now we'll create a hidden anchor ('a') element (Note: No need to append this element to the document). */
var hiddenLink = document.createElement('a');
/* Just for fun, we'll add a CSS attribute of [ style.display = "none" ]. Remember: You can never be too sure! */
hiddenLink.style.display = "none";
/* Set the 'href' attribute of the hidden link we just created, to the 'fileNameOrURL' argument received by this function. */
hiddenLink.setAttribute('href', fileNameOrURL);
/* Now, let's take advantage of the browser's built-in parser, to remove elements from the original 'fileNameOrURL' argument received by this function, without actually modifying our newly created hidden 'anchor' element.*/
fileNameOrURL = fileNameOrURL.replace(hiddenLink.protocol, ""); /* First, let's strip out the protocol, if there is one. */
fileNameOrURL = fileNameOrURL.replace(hiddenLink.hostname, ""); /* Now, we'll strip out the host-name (i.e. domain-name) if there is one. */
fileNameOrURL = fileNameOrURL.replace(":" + hiddenLink.port, ""); /* Now finally, we'll strip out the port number, if there is one (Kinda overkill though ;-)). */
/* Now, we're ready to finish processing the 'fileNameOrURL' variable by removing unnecessary parts, to isolate the file name. */
/* Operations for working with [relative, root-relative, and absolute] URL's ONLY [BEGIN] */
/* Break the possible URL at the [ '?' ] and take first part, to shave of the entire query string ( everything after the '?'), if it exist. */
fileNameOrURL = fileNameOrURL.split('?')[0];
/* Sometimes URL's don't have query's, but DO have a fragment [ # ](i.e 'reference anchor'), so we should also do the same for the fragment tag [ # ]. */
fileNameOrURL = fileNameOrURL.split('#')[0];
/* Now that we have just the URL 'ALONE', Let's remove everything to the last slash in URL, to isolate the file name. */
fileNameOrURL = fileNameOrURL.substr(1 + fileNameOrURL.lastIndexOf("/"));
/* Operations for working with [relative, root-relative, and absolute] URL's ONLY [END] */
/* Now, 'fileNameOrURL' should just be 'fileName' */
fileName = fileNameOrURL;
/* Now, we check if we should show UNIX dot-files, or not. This should be either 'true' or 'false'. */
if ( showUnixDotFiles == false )
{
/* If not ('false'), we should check if the filename starts with a period (indicating it's a UNIX dot-file). */
if ( fileName.startsWith(".") )
{
/* If so, we return a blank string to the function caller. Our job here, is done! */
return "";
};
};
/* Now, let's get everything after the period in the filename (i.e. the correct 'file extension'). */
fileExt = fileName.substr(1 + fileName.lastIndexOf("."));
/* Now that we've discovered the correct file extension, let's return it to the function caller. */
return fileExt;
};
Enjoy! You're Quite Welcome!:
Try this:
function getFileExtension(filename) {
var fileinput = document.getElementById(filename);
if (!fileinput)
return "";
var filename = fileinput.value;
if (filename.length == 0)
return "";
var dot = filename.lastIndexOf(".");
if (dot == -1)
return "";
var extension = filename.substr(dot, filename.length);
return extension;
}
If you are looking for a specific extension and know its length, you can use substr:
var file1 = "50.xsl";
if (file1.substr(-4) == '.xsl') {
// do something
}
JavaScript reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/substr
I just realized that it's not enough to put a comment on p4bl0's answer, though Tom's answer clearly solves the problem:
return filename.replace(/^.*?\.([a-zA-Z0-9]+)$/, "$1");
For most applications, a simple script such as
return /[^.]+$/.exec(filename);
would work just fine (as provided by Tom). However this is not fool proof. It does not work if the following file name is provided:
image.jpg?foo=bar
It may be a bit overkill but I would suggest using a url parser such as this one to avoid failure due to unpredictable filenames.
Using that particular function, you could get the file name like this:
var trueFileName = parse_url('image.jpg?foo=bar').file;
This will output "image.jpg" without the url vars. Then you are free to grab the file extension.
function func() {
var val = document.frm.filename.value;
var arr = val.split(".");
alert(arr[arr.length - 1]);
var arr1 = val.split("\\");
alert(arr1[arr1.length - 2]);
if (arr[1] == "gif" || arr[1] == "bmp" || arr[1] == "jpeg") {
alert("this is an image file ");
} else {
alert("this is not an image file");
}
}
I'm many moons late to the party but for simplicity I use something like this
var fileName = "I.Am.FileName.docx";
var nameLen = fileName.length;
var lastDotPos = fileName.lastIndexOf(".");
var fileNameSub = false;
if(lastDotPos === -1)
{
fileNameSub = false;
}
else
{
//Remove +1 if you want the "." left too
fileNameSub = fileName.substr(lastDotPos + 1, nameLen);
}
document.getElementById("showInMe").innerHTML = fileNameSub;
<div id="showInMe"></div>
A one line solution that will also account for query params and any characters in url.
string.match(/(.*)\??/i).shift().replace(/\?.*/, '').split('.').pop()
// Example
// some.url.com/with.in/&ot.s/files/file.jpg?spec=1&.ext=jpg
// jpg
return filename.replace(/\.([a-zA-Z0-9]+)$/, "$1");
edit: Strangely (or maybe it's not) the $1 in the second argument of the replace method doesn't seem to work... Sorry.
fetchFileExtention(fileName) {
return fileName.slice((fileName.lastIndexOf(".") - 1 >>> 0) + 2);
}
Wallacer's answer is nice, but one more checking is needed.
If file has no extension, it will use filename as extension which is not good.
Try this one:
return ( filename.indexOf('.') > 0 ) ? filename.split('.').pop().toLowerCase() : 'undefined';
Don't forget that some files can have no extension, so:
var parts = filename.split('.');
return (parts.length > 1) ? parts.pop() : '';

Categories

Resources