Get the domain and page name from a String URL

Get the domain and page name from a String URL - javascript

I'm currently having an issue manipulating a URL.
What I want is to get the domain name and the page name from a URL string.
For example :
www.myWebSite.com => domain : myWebSite
http://myWebSite.com => domain : myWebSite
myWebSite.com/xxx.html => domain : myWebSite page : xxx

// Hostname of the page currently loaded in the browser.
window.location.hostname; //Domain name
// Text of the document's <title> element, read through jQuery.
$("title").text(); //Page name
EDIT:
// Show the current page's hostname and the last segment of its path.
var loc = window.location;
var segments = loc.pathname.split("/");
// Index with the array's own length (the original used an undefined `pathname`).
// The result is an empty string when the path ends with "/".
var filename = segments[segments.length - 1];
alert("Domain: "+loc.hostname);
alert("Filename: "+filename);

try with url.match(/:\/\/(.[^/]+)/)[1]
example :
// Captures everything between "://" and the next "/" (at least two characters).
var r = /:\/\/(.[^/]+)/;
// [1] is the first capture group of the match.
"http://stackoverflow.com/questions/5343288/get-the-domain-and-page-name-from-a-string-url".match(r)[1]
=> stackoverflow.com

Use window.location.hostname or window.location.host. Check location reference.

I hope this helps:
/**
 * Writes the bare domain label and, when present, the page name of a URL
 * string to the document, e.g. "domain : myWebSite page : xxx<br/>".
 *
 * @param {string} url - URL with or without a scheme and "www." prefix.
 */
function breakDownURL(url) {
  // Drop the scheme ("http://" or "https://") and then a leading "www.".
  var rest = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  var segments = rest.split("/");
  // First label of the host, e.g. "myWebSite" from "myWebSite.com".
  var domain = segments[0].split(".")[0];
  // First path segment without its extension, e.g. "xxx" from "xxx.html".
  var page = segments.length > 1 ? segments[1].split(".")[0] : "";
  // The page name is emitted once (it used to be appended a second time).
  document.write("domain : " + domain +
    (page === "" ? "" : " page : " + page) + "<br/>");
}
breakDownURL("www.myWebSite.com"); // domain : myWebSite
breakDownURL("http://myWebSite.com"); // domain : myWebSite
breakDownURL("myWebSite.com/xxx.html"); // domain : myWebSite page : xxx

// Show the current host and the last path segment of the current page.
var domain = window.location.host;
alert(domain); //www.myWebSite.com
// Read the path from location.pathname instead of splitting location.href:
// the href also carries the query string and fragment, so a "/" inside the
// query or a trailing "#anchor" ended up in the page name.
var segments = window.location.pathname.split("/");
var page = segments[segments.length - 1];
alert(page); //myWebSite

You can write something like my function below.
/**
 * Extracts the hostname from a URL string.
 *
 * Accepts URLs with or without a scheme and ignores user info, port, path,
 * query string and fragment.
 *
 * @param {string} url - e.g. "https://user@example.com:8080/path?q=1#top".
 * @returns {string} The hostname, e.g. "example.com"; "" when there is none.
 */
function getHostnameFromURI(url) {
  // "://" only marks a scheme when its first "/" is the first delimiter in the
  // string; otherwise it belongs to e.g. a query value ("?next=http://...").
  const schemeEnd = url.indexOf('://');
  const firstDelimiter = url.search(/[/?#]/);
  const start = schemeEnd !== -1 && firstDelimiter === schemeEnd + 1 ? schemeEnd + 3 : 0;

  // The authority ends at the first "/", "?" or "#", or at the end of the string.
  const rest = url.slice(start);
  const end = rest.search(/[/?#]/);
  let domain = end === -1 ? rest : rest.slice(0, end);

  // Remove user info ("user:password@").
  const at = domain.lastIndexOf('@');
  if (at !== -1) {
    domain = domain.slice(at + 1);
  }

  // Bracketed IPv6 literals contain ":" themselves; keep them whole.
  if (domain.startsWith('[')) {
    const close = domain.indexOf(']');
    return close === -1 ? domain : domain.slice(0, close + 1);
  }

  // Remove port specification from the URL
  const colon = domain.indexOf(':');
  return colon === -1 ? domain : domain.slice(0, colon);
}
It correctly works with localhost, .co.uk, etc. domains.
https://stackoverflow.com/opensearch.xml --> stackoverflow.com
https://github.com/arichr --> github.com
Information: Please, check that your URLs are not wrong e.g. https://example.com?a=a will return example.com?a=a

Related

nodeJS Crawler : unable to get the tagname associated with search word

I have created a crawler in NodeJS
I have a website : "http://www.google.com" for which I have written the crawler
Technology used is nodeJS, cheerio
Sample example of what I have achieved :
For example , lets search google.com. there is a button called "google search".
Let us search for the text "google search". Today my crawler can find the word in the page and say it has found it.
Today it shows : text " google search" found on google.com
What I need the result to be :
What it needs to do is in addition to finding text, also tell me the tag name , that in this case is a button
Needed output is : text "google search" found on google.com of "TAGNAME: BUTTON"
I tried using indexOf, but it isn't working. Please suggest how to do this.
Here is the code
!!!
index.js
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
// Crawl configuration: entry page, phrase to look for, and a cap on visited pages.
var START_URL = "https://www.mytravelexp.com/";
var SEARCH_WORD ="Pack your travel essentials";
var MAX_PAGES_TO_VISIT = 20;
// URLs that have already been requested, used as a set (url -> true).
var pagesVisited = {};
var numPagesVisited = 0;
// Pending URLs; crawl() takes the next one from the end with pop().
var pagesToVisit = [];
// URL here is the url-parse constructor required above.
var url = new URL(START_URL);
// Protocol + hostname of the start page; prepended to relative links.
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
// Kick off the crawl.
crawl();
/**
 * Takes the next pending URL and visits it, skipping URLs that were already
 * visited. Stops when the page limit is reached or no URLs are left.
 */
function crawl() {
  // Loop instead of recursing so a long run of already-visited URLs cannot
  // exhaust the call stack.
  while (true) {
    if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
      console.log("Reached max limit of number of pages to visit.");
      return;
    }
    if (pagesToVisit.length === 0) {
      // pop() on an empty queue yields undefined, which used to be requested.
      console.log("No more pages to visit.");
      return;
    }
    var nextPage = pagesToVisit.pop();
    if (!(nextPage in pagesVisited)) {
      // New page we haven't visited; visitPage calls crawl again when done.
      visitPage(nextPage, crawl);
      return;
    }
  }
}
/**
 * Requests a page, reports whether SEARCH_WORD occurs in it, queues the links
 * found on it, and then invokes the callback.
 *
 * @param {string} url - Page to request.
 * @param {Function} callback - Called once the page was handled or has failed.
 */
function visitPage(url, callback) {
  // Add page to our set before the request completes, so it is not queued twice.
  pagesVisited[url] = true;
  numPagesVisited++;
  // Make the request
  request(url, function(error, response, body) {
    console.log("***************************");
    console.log(" Visiting page: " + url + '\n');
    // When the request itself fails there is no response object, so check the
    // error before reading statusCode.
    if (error || !response || response.statusCode !== 200) {
      callback();
      return;
    }
    // Parse the document body
    var $ = cheerio.load(body);
    if (searchForWord($, SEARCH_WORD)) {
      console.log(' ' + SEARCH_WORD + ' found at page ' + url);
    }
    collectInternalLinks($);
    // In this short program, our callback is just calling crawl()
    callback();
  });
}
/**
 * Case-insensitively checks whether the HTML of the page body contains `word`.
 *
 * @param {Function} $ - Selector function for the loaded page (cheerio).
 * @param {string} word - Text to look for.
 * @returns {boolean} true when the text occurs in the body HTML.
 */
function searchForWord($, word) {
  // Fall back to an empty string when html() yields nothing (presumably a
  // page without <body> — confirm against cheerio's .html() contract).
  var bodyHtml = ($('html > body').html() || '').toLowerCase();
  // includes() already returns a boolean; comparing it with -1 was always true.
  return bodyHtml.includes(word.toLowerCase());
}
/**
 * Queues every link whose href starts with "/" (prefixed with baseUrl),
 * followed by every link whose href starts with "http" (as is).
 *
 * @param {Function} $ - Selector function for the loaded page (cheerio).
 */
function collectInternalLinks($) {
  $("a[href^='/']").each(function() {
    var path = $(this).attr('href');
    pagesToVisit.push(baseUrl + path);
  });
  $("a[href^='http']").each(function() {
    var target = $(this).attr('href');
    pagesToVisit.push(target);
  });
}

// NOTE(review): '123' looks like a placeholder selector — presumably it should
// select the element that contains the search text; confirm before use.
// Compares the `name` of the first matched node with 'button'.
if ( $('123')[0].name === 'button' ){
// Logs the text of the first content node found inside the <button> elements.
console.log($('button').contents().first().text());
// Logs the value of the matched element's "name" attribute.
console.log( $('123').attr('name') );
}

Getting domain without subdomain from an url with javascript [duplicate]

How to get the domain name without subdomains?
e.g. if the url is "http://one.two.roothost.co.uk/page.html" how to get "roothost.co.uk"?

Following is a solution to extract a domain name without any subdomains. This solution doesn't make any assumptions about the URL format, so it should work for any URL. Since some domain names have one suffix (.com), and some have two or more (.co.uk), to get an accurate result in all cases, we need to parse the hostname using the Public Suffix List, which contains a list of all public domain name suffixes.
Solution
First, include the public suffix list js api in a script tag in your HTML, then in JavaScript to get the hostname you can call:
var parsed = psl.parse('one.two.roothost.co.uk');
console.log(parsed.domain);
...which will return "roothost.co.uk". To get the name from the current page, you can use location.hostname instead of a static string:
var parsed = psl.parse(location.hostname);
console.log(parsed.domain);
Finally, if you need to parse a domain name directly out of a full URL string, you can use the following:
var url = "http://one.two.roothost.co.uk/page.html";
// Let the URL parser isolate the hostname: splitting on "/" also keeps any
// credentials and port ("user@host:8080"), which are not part of a domain name.
var hostname = new URL(url).hostname; // Get the hostname
var parsed = psl.parse(hostname); // Parse the domain
document.getElementById("output").textContent = parsed.domain;
JSFiddle Example (it includes the entire minified library in the jsFiddle, so scroll down!): https://jsfiddle.net/6aqdbL71/2/

What about this?
/**
 * Returns the hostname without sub-domains, using a length heuristic: labels
 * of up to 3 characters in front of the final label are taken to be part of
 * the suffix ("com.ar"), and the last longer label is taken to be the domain.
 *
 * @param {string} hostname - e.g. "some.google.com.ar".
 * @returns {string} e.g. "google.com.ar".
 */
function getCanonicalHost(hostname) {
  const MAX_TLD_LENGTH = 3;
  const labels = hostname.split('.');
  // The final label is always the TLD, whatever its length ("info", "museum"),
  // so only the labels in front of it are candidates for the domain name.
  const domainIndex = labels
    .slice(0, -1)
    .findLastIndex((label) => label.length > MAX_TLD_LENGTH);
  // No candidate found (-1): keep the whole hostname.
  return labels.slice(Math.max(0, domainIndex)).join('.');
}
console.log(getCanonicalHost('mail.google.com'));
console.log(getCanonicalHost('some.google.com.ar'));
console.log(getCanonicalHost('some.another.google.com.ar'));
console.log(getCanonicalHost('foo.bar.google.com'));
console.log(getCanonicalHost('foo.bar.google.com.ar'));
console.log(getCanonicalHost('bar.google.ar'));
This works because https://developer.mozilla.org/en-US/docs/Learn/Common_questions/What_is_a_domain_name says:
TLDs can contain special as well as latin characters. A TLD's maximum length is 63 characters, although most are around 2–3.
https://data.iana.org/TLD/tlds-alpha-by-domain.txt lists 1481 TLDs; 466 of them are 2–3 characters long, and the most commonly used TLDs are no longer than 3 characters.
If you need a solution that works with all TLDs, here is a more complex approach:
/**
 * Resolves to the lower-cased hostname cut down so that it starts at its last
 * label that is not contained in the list provided by getCanonicalHost.tlds.
 *
 * @param {string} hostname
 * @returns {Promise<string>}
 */
function getCanonicalHost(hostname) {
  return getCanonicalHost.tlds.then(function(knownTlds) {
    const labels = hostname.toLowerCase().split('.');
    const lastNonTld = labels.findLastIndex(function(label) {
      return knownTlds.indexOf(label) === -1;
    });
    // -1 means every label is a listed TLD: keep them all.
    const from = lastNonTld < 0 ? 0 : lastNonTld;
    return labels.slice(from).join('.');
  });
}
// Loads the IANA TLD list once; resolves to an array of lower-case TLDs and
// rejects when the request fails.
getCanonicalHost.tlds = new Promise(function(res, rej) {
  const TLD_LIST_URL= 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt';
  const MAX_TLD_LENGTH = 63;
  const xhr = new XMLHttpRequest();
  xhr.addEventListener('error', rej);
  xhr.addEventListener('load', function() {
    // "load" also fires for HTTP error responses; do not parse an error page
    // as if it were the TLD list.
    if (xhr.status < 200 || xhr.status >= 300) {
      rej(new Error('Failed to load TLD list: HTTP ' + xhr.status));
      return;
    }
    const tlds = xhr.responseText
      .split('\n')
      .map(function(line) { return line.trim().toLowerCase(); })
      // Skip blank lines and "#" comment lines (the list starts with a version header).
      .filter(function(line) { return line !== '' && line.charAt(0) !== '#'; })
      // A label of exactly 63 characters is still valid, so the bound is inclusive.
      .filter(function(tld) { return tld.length <= MAX_TLD_LENGTH; });
    res(tlds);
  });
  xhr.open('GET', TLD_LIST_URL);
  xhr.send();
});
getCanonicalHost('mail.google.com').then(console.log);
getCanonicalHost('some.google.com.ar').then(console.log);
getCanonicalHost('some.another.google.com.ar').then(console.log);
getCanonicalHost('foo.bar.google.com').then(console.log);
getCanonicalHost('foo.bar.google.com.ar').then(console.log);
getCanonicalHost('bar.google.ar').then(console.log);

You can use parse-domain to do the heavy lifting for you. This package considers the public suffix list and returns an easy to work with object breaking up the domain.
Here is an example from their readme:
npm install parse-domain
import { parseDomain, ParseResultType } from 'parse-domain';
// Parse a hostname with parseDomain and, when it is listed in the public
// suffix list, log its sub-domains, domain and top-level domains.
const parseResult = parseDomain(
// should be a string with basic latin characters only. more details in the readme
'www.some.example.co.uk',
);
// check if the domain is listed in the public suffix list
if (parseResult.type === ParseResultType.Listed) {
const { subDomains, domain, topLevelDomains } = parseResult;
console.log(subDomains); // ["www", "some"]
console.log(domain); // "example"
console.log(topLevelDomains); // ["co", "uk"]
} else {
// more about other parseResult types in the readme
}

This works for me:
const firstTLDs = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|be|bf|bg|bh|bi|bj|bm|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|cl|cm|cn|co|cr|cu|cv|cw|cx|cz|de|dj|dk|dm|do|dz|ec|ee|eg|es|et|eu|fi|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jo|jp|kg|ki|km|kn|kp|kr|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|na|nc|ne|nf|ng|nl|no|nr|nu|nz|om|pa|pe|pf|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|yt".split('|');
const secondTLDs = "com|edu|gov|net|mil|org|nom|sch|caa|res|off|gob|int|tur|ip6|uri|urn|asn|act|nsw|qld|tas|vic|pro|biz|adm|adv|agr|arq|art|ato|bio|bmd|cim|cng|cnt|ecn|eco|emp|eng|esp|etc|eti|far|fnd|fot|fst|g12|ggf|imb|ind|inf|jor|jus|leg|lel|mat|med|mus|not|ntr|odo|ppg|psc|psi|qsl|rec|slg|srv|teo|tmp|trd|vet|zlg|web|ltd|sld|pol|fin|k12|lib|pri|aip|fie|eun|sci|prd|cci|pvt|mod|idv|rel|sex|gen|nic|abr|bas|cal|cam|emr|fvg|laz|lig|lom|mar|mol|pmn|pug|sar|sic|taa|tos|umb|vao|vda|ven|mie|北海道|和歌山|神奈川|鹿児島|ass|rep|tra|per|ngo|soc|grp|plc|its|air|and|bus|can|ddr|jfk|mad|nrw|nyc|ski|spy|tcm|ulm|usa|war|fhs|vgs|dep|eid|fet|fla|flå|gol|hof|hol|sel|vik|cri|iwi|ing|abo|fam|gok|gon|gop|gos|aid|atm|gsm|sos|elk|waw|est|aca|bar|cpa|jur|law|sec|plo|www|bir|cbg|jar|khv|msk|nov|nsk|ptz|rnd|spb|stv|tom|tsk|udm|vrn|cmw|kms|nkz|snz|pub|fhv|red|ens|nat|rns|rnu|bbs|tel|bel|kep|nhs|dni|fed|isa|nsn|gub|e12|tec|орг|обр|упр|alt|nis|jpn|mex|ath|iki|nid|gda|inc".split('|');
const knownSubdomains = "www|studio|mail|remote|blog|webmail|server|ns1|ns2|smtp|secure|vpn|m|shop|ftp|mail2|test|portal|ns|ww1|host|support|dev|web|bbs|ww42|squatter|mx|email|1|mail1|2|forum|owa|www2|gw|admin|store|mx1|cdn|api|exchange|app|gov|2tty|vps|govyty|hgfgdf|news|1rer|lkjkui";
/**
 * Strips sub-domains from a hostname using the firstTLDs, secondTLDs and
 * knownSubdomains tables.
 *
 * @param {string} s - Hostname, e.g. "www.example.co.uk".
 * @returns {string} Hostname without sub-domains, e.g. "example.co.uk".
 */
function removeSubdomain(s) {
  // "\\." keeps the backslash in the pattern so the dot is literal. A bare "\."
  // inside a template literal collapses to "." (any character), which turned
  // "webmaster.com" into "aster.com".
  const knownSubdomainsRegExp = new RegExp(`^(${knownSubdomains})\\.`, 'i');
  const stripped = s.replace(knownSubdomainsRegExp, '');
  // Only accept the stripped form when at least a domain and a TLD remain;
  // otherwise "mail.com" would be reduced to "com".
  const parts = (stripped.includes('.') ? stripped : s).split('.');
  while (parts.length > 3) {
    parts.shift();
  }
  if (parts.length === 3) {
    // With three labels left, the first one is a sub-domain unless the last
    // two look like a two-level suffix such as "co.uk" or "com.ar".
    const bothLong = parts[1].length > 2 && parts[2].length > 2;
    const neitherListed = secondTLDs.indexOf(parts[1]) === -1 && firstTLDs.indexOf(parts[2]) === -1;
    if (bothLong || neitherListed) {
      parts.shift();
    }
  }
  return parts.join('.');
}
var tests = {
'www.sidanmor.com': 'sidanmor.com',
'exemple.com': 'exemple.com',
'argos.co.uk': 'argos.co.uk',
'www.civilwar.museum': 'civilwar.museum',
'www.sub.civilwar.museum': 'civilwar.museum',
'www.xxx.sub.civilwar.museum': 'civilwar.museum',
'www.exemple.com': 'exemple.com',
'main.testsite.com': 'testsite.com',
'www.ex-emple.com.ar': 'ex-emple.com.ar',
'main.test-site.co.uk': 'test-site.co.uk',
'en.tour.mysite.nl': 'tour.mysite.nl',
'www.one.lv': 'one.lv',
'www.onfdsadfsafde.lv': 'onfdsadfsafde.lv',
'aaa.onfdsadfsafde.aa': 'onfdsadfsafde.aa',
};
const firstTLDs = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|be|bf|bg|bh|bi|bj|bm|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|cl|cm|cn|co|cr|cu|cv|cw|cx|cz|de|dj|dk|dm|do|dz|ec|ee|eg|es|et|eu|fi|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jo|jp|kg|ki|km|kn|kp|kr|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|na|nc|ne|nf|ng|nl|no|nr|nu|nz|om|pa|pe|pf|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|yt".split('|');
const secondTLDs = "com|edu|gov|net|mil|org|nom|sch|caa|res|off|gob|int|tur|ip6|uri|urn|asn|act|nsw|qld|tas|vic|pro|biz|adm|adv|agr|arq|art|ato|bio|bmd|cim|cng|cnt|ecn|eco|emp|eng|esp|etc|eti|far|fnd|fot|fst|g12|ggf|imb|ind|inf|jor|jus|leg|lel|mat|med|mus|not|ntr|odo|ppg|psc|psi|qsl|rec|slg|srv|teo|tmp|trd|vet|zlg|web|ltd|sld|pol|fin|k12|lib|pri|aip|fie|eun|sci|prd|cci|pvt|mod|idv|rel|sex|gen|nic|abr|bas|cal|cam|emr|fvg|laz|lig|lom|mar|mol|pmn|pug|sar|sic|taa|tos|umb|vao|vda|ven|mie|北海道|和歌山|神奈川|鹿児島|ass|rep|tra|per|ngo|soc|grp|plc|its|air|and|bus|can|ddr|jfk|mad|nrw|nyc|ski|spy|tcm|ulm|usa|war|fhs|vgs|dep|eid|fet|fla|flå|gol|hof|hol|sel|vik|cri|iwi|ing|abo|fam|gok|gon|gop|gos|aid|atm|gsm|sos|elk|waw|est|aca|bar|cpa|jur|law|sec|plo|www|bir|cbg|jar|khv|msk|nov|nsk|ptz|rnd|spb|stv|tom|tsk|udm|vrn|cmw|kms|nkz|snz|pub|fhv|red|ens|nat|rns|rnu|bbs|tel|bel|kep|nhs|dni|fed|isa|nsn|gub|e12|tec|орг|обр|упр|alt|nis|jpn|mex|ath|iki|nid|gda|inc".split('|');
const knownSubdomains = "www|studio|mail|remote|blog|webmail|server|ns1|ns2|smtp|secure|vpn|m|shop|ftp|mail2|test|portal|ns|ww1|host|support|dev|web|bbs|ww42|squatter|mx|email|1|mail1|2|forum|owa|www2|gw|admin|store|mx1|cdn|api|exchange|app|gov|2tty|vps|govyty|hgfgdf|news|1rer|lkjkui";
/**
 * Strips sub-domains from a hostname using the firstTLDs, secondTLDs and
 * knownSubdomains tables.
 *
 * @param {string} s - Hostname, e.g. "www.example.co.uk".
 * @returns {string} Hostname without sub-domains, e.g. "example.co.uk".
 */
function removeSubdomain(s) {
  // "\\." keeps the backslash in the pattern so the dot is literal. A bare "\."
  // inside a template literal collapses to "." (any character), which turned
  // "webmaster.com" into "aster.com".
  const knownSubdomainsRegExp = new RegExp(`^(${knownSubdomains})\\.`, 'i');
  const stripped = s.replace(knownSubdomainsRegExp, '');
  // Only accept the stripped form when at least a domain and a TLD remain;
  // otherwise "mail.com" would be reduced to "com".
  const parts = (stripped.includes('.') ? stripped : s).split('.');
  while (parts.length > 3) {
    parts.shift();
  }
  if (parts.length === 3) {
    // With three labels left, the first one is a sub-domain unless the last
    // two look like a two-level suffix such as "co.uk" or "com.ar".
    const bothLong = parts[1].length > 2 && parts[2].length > 2;
    const neitherListed = secondTLDs.indexOf(parts[1]) === -1 && firstTLDs.indexOf(parts[2]) === -1;
    if (bothLong || neitherListed) {
      parts.shift();
    }
  }
  return parts.join('.');
}
// Run removeSubdomain over every own entry of `tests` (input -> expected) and
// log one OK/Fail line per entry.
for (var test in tests) {
  if (!tests.hasOwnProperty(test)) {
    continue;
  }
  var t = test;
  var e = tests[test];
  var r = removeSubdomain(test);
  var s = e === r;
  console.log(s
    ? 'OK: "' + t + '" should be "' + e + '" and it is really "' + r + '"'
    : 'Fail: "' + t + '" should be "' + e + '" but it is NOT "' + r + '"');
}
Reference:
psl.min.js file
Maximillian Laumeister Answer to this question
The most popular subdomains on the internet

Simplest solution:
// Build "https://" followed by the last two labels of the current hostname.
var labels = window.location.hostname.split('.');
// slice(-2) also copes with single-label hosts such as "localhost", where
// indexing with length-2 produced "https://undefined.localhost".
var domain = 'https://' + labels.slice(-2).join('.');
alert(domain);

I created this function which uses URL to parse. It cheats by assuming all hostnames will have either 4 or less parts.
/**
 * Returns the last three labels of the URL's hostname when the hostname has
 * exactly four labels, and the last two labels otherwise.
 *
 * @param {string} url - Absolute URL.
 * @returns {string}
 */
const getDomainWithoutSubdomain = url => {
  const labels = new URL(url).hostname.split('.');
  const keep = labels.length === 4 ? 3 : 2;
  return labels.slice(-keep).join('.');
};
[
'https://www.google.com',
'https://www.google.co.uk',
'https://mail.google.com',
'https://www.bbc.co.uk/news',
'https://github.com',
].forEach(url => {
console.log(getDomainWithoutSubdomain(url))
})

Here is a working JSFiddle
My solution works with the assumption that the root hostname you are looking for is of the type "abc.xyz.pp".
extractDomain() returns the hostname with all the subdomains.
getRootHostName() splits the hostname by . and then based on the assumption mentioned above, it uses the shift() to remove each subdomain name.
Finally, whatever remains in parts[], it joins them by . to form the root hostname.
Javascript
var urlInput = "http://one.two.roothost.co.uk/page.html";
/**
 * Returns the host part of a URL string, with the port number removed.
 *
 * @param {string} url - URL with or without a protocol.
 * @returns {string}
 */
function extractDomain(url) {
  var pieces = url.split('/');
  // With a protocol the pieces are "scheme:", "", host, ...; without one the
  // host is the first piece.
  var hostWithPort = url.indexOf("://") > -1 ? pieces[2] : pieces[0];
  // Everything before the first ":" is the host.
  return hostWithPort.split(':')[0];
}
/**
 * Returns the root hostname of a URL, assuming it has the three-label form
 * "abc.xyz.pp" (e.g. "roothost.co.uk").
 *
 * @param {string} url
 * @returns {string} The last three labels of the host, or the whole host when
 *   it has three labels or fewer.
 */
function getRootHostName(url) {
  var parts = extractDomain(url).split('.');
  // Keep the last three labels. This replaces a shift() loop whose counter `i`
  // was never declared: it leaked a global and throws a ReferenceError in
  // strict mode and ES modules.
  return parts.slice(Math.max(0, parts.length - 3)).join('.');
}
document.getElementById("result").innerHTML = getRootHostName(urlInput);
HTML
<div id="result"></div>
EDIT 1: Updated the JSFiddle link. It was reflecting the incorrect code.

What about...
/**
 * Returns the last two labels of document.domain (after removing a leading
 * "www."), or an empty string when document.domain is empty.
 *
 * @returns {string}
 */
function getDomain(){
  if (!document.domain.length) {
    return '';
  }
  var labels = document.domain.replace(/^(www\.)/,"").split('.');
  // Anything in front of the last two labels is a sub-domain.
  var lastTwo = labels.slice(-2).join('.');
  // Trim stray leading/trailing dots.
  return lastTwo.replace(/(^\.*)|(\.*$)/g, "");
}

My solution worked for me: Get "gocustom.com" from "shop.gocustom.com"
// Show a hostname, then its last two labels ("gocustom.com").
var site_domain_name = 'shop.gocustom.com';
alert(site_domain_name);
var strsArray = site_domain_name.split('.');
var strsArrayLen = strsArray.length;
// Plain arithmetic is enough for the indices; wrapping them in eval() added
// nothing and fails on pages whose Content Security Policy forbids eval.
alert(strsArray[strsArrayLen - 2]+'.'+strsArray[strsArrayLen - 1])

You can try this in JavaScript:
alert(window.location.hostname);
It will return the hostname.

Adding a Parameter to Url with javascript

I am trying to add the parameter "referer=" to my url corresponding to the trafic referer of a new session.
I used some of the code from this topic... but it keeps reloading the page in a loop... then the url is like :
https://example.com?refere=facebookreferer=facebookreferer=facebook
Note:
I have been using this solution 1 :
/**
 * Sets the query parameter `name` to `value` on the current URL and, when the
 * visitor arrived from another site, also records that site in "referer".
 *
 * Navigates only when the resulting URL differs from the current one, so
 * calling it on every page load cannot cause a reload loop or stack up
 * duplicate parameters.
 *
 * @param {string} name - Query parameter to add or update.
 * @param {string|number} value - New value of the parameter.
 */
function addOrUpdateUrlParam(name, value)
{
  var url = new URL(window.location.href);
  // set() replaces an existing value instead of appending a second one.
  url.searchParams.set(name, value);

  // Second dot-separated piece of the referrer, e.g. "facebook" for
  // "https://www.facebook.com/"; undefined when there is no referrer.
  var refsplit = document.referrer.split(".")[1];
  if (refsplit !== undefined && refsplit != "example") {
    url.searchParams.set("referer", refsplit);
  }

  // Assigning location.href reloads the page, so only do it on a real change.
  if (url.href !== window.location.href) {
    window.location.href = url.href;
  }
}
And this solution 2:
function () {
var ref = document.referrer;
var refsplit = ref.split(".")[1];
if (refsplit != "example") {
return location.search += "referer=" + refsplit;
}
}
Edit 1:
Thanks to Prasanth I improved the code to :
function () {
var ref = document.referrer;
var refsplit = ref.split(".")[1];
var currentUrl = location.href;
var url1 = currentUrl += "?referer="+refsplit;
var url2 = currentUrl += "&referer="+refsplit;
if(currentUrl.indexOf("?") < 0) {
return window.location = url1;
} else {
return window.location = url2;
}
}
However, it is returning both conditions :
https://example.com/?referer=facebook&referer=facebook
Edit 2:
So after many attempts, I achieved it by working with the parameters of the url (location.search) instead of the full url (location.href) :
/**
 * Builds the query string of the current page with "utm_source=<referrer
 * host>" added.
 *
 * @returns {string} location.search unchanged when there is no usable
 *   referrer, the referrer is the own domain, or utm_source is already set;
 *   otherwise location.search with utm_source appended.
 */
function addRefererParam () {
  var params = location.search; //Get Url parameters
  var refDomain;
  try {
    // Take the host from the parsed referrer. The former character-class regex
    // dropped leading letters such as "t" and "w" ("twitter.com" -> "itter.com")
    // and threw on an empty referrer.
    refDomain = new URL(document.referrer).hostname.replace(/^www\./i, "");
  } catch (err) {
    // new URL() throws for an empty or malformed referrer: nothing to add.
    return params;
  }
  if (refDomain === "" || refDomain.match(/mydomain|null|undefined/i)) { //check if domain not null or own domain.
    return params;
  }
  if (new URLSearchParams(params).has("utm_source")) {
    return params;
  }
  // Start the query with "?" or continue it with "&".
  var separator = params === "" ? "?" : "&";
  return params + separator + "utm_source=" + encodeURIComponent(refDomain);
}
However, the query string does not persist while browsing the site... how can I make the new parameters persistent?

Obtain the url of the current window and after the domain name just concat your url with &referer=value.
// Add "referer=<value>" to the current URL once, then navigate to it.
// NOTE(review): `value` is not defined in this snippet; it has to be supplied
// by the surrounding code.
var currentUrl = new URL(location.href);
// searchParams also sees a parameter that directly follows "?"; splitting the
// URL on "&" missed it.
var flag = !currentUrl.searchParams.has('referer');
if(flag)
{
  // set() picks "?" or "&" as needed instead of always appending "&referer=".
  currentUrl.searchParams.set('referer', value);
  window.location = currentUrl.href;
}

For what it's worth (because the more generic question of how to do this in general is what led me to this post), I've made a 178-byte helper function that takes an object of the query parameters you want to add to a URL for a GET request (in a format similar to how you might add headers to a request) and published it as an npm package here: https://www.npmjs.com/package/add-query-params-to-url
Hopefully this is helpful to some.

How can I use javascript to attach url parameters to all outgoing requests?

Background
I have customers arriving on my landing page and I would like to attach url parameters such as aff_id=X to all outgoing traffic.
Currently, I have something like this:
// Mark every anchor whose href property is not '#' as outgoing, then append
// the s4/s5 parameters to the href attribute of each marked anchor.
$('a').each(function(index, anchor) {
  if (anchor.href == '#') {
    return;
  }
  $(anchor).addClass('outgoing');
});
$('.outgoing').attr('href', function(index, currentHref) {
  if (!currentHref) {
    return;
  }
  var arg_str = 's4=' + aff_id + '&s5=' + uuid;
  // Continue an existing query string with "&", otherwise start one with "?".
  var joiner = currentHref.indexOf('?') != -1 ? "&" : "?";
  return currentHref + joiner + arg_str;
});
The problem with this is that it doesn't work on all outgoing requests such as form submit.
Is it possible to attach a listener via javascript to attach url parameters to all outgoing traffic?
Are there cleaner ways to attach url parameters to outgoing traffic on my landing page?

Assuming you're using jQuery:
// Append the s4/s5 tracking parameters to the target of every link and form.
var uuid = '',
aff_id = '';
$('a, form').each(function() {
  var self = $(this);
  var qstr = 's4=' + aff_id + '&s5=' + uuid;
  // Links carry their target in "href", forms in "action".
  var attrName = 'a' === self.get(0).tagName.toLowerCase() ? 'href' : 'action';
  var target = self.attr(attrName) || '';
  // Leave pure in-page anchors ("#...") alone.
  if (target.match(/^#/)) {
    return;
  }
  // The query string belongs in front of any "#fragment".
  var hashAt = target.indexOf('#');
  var base = hashAt === -1 ? target : target.slice(0, hashAt);
  var fragment = hashAt === -1 ? '' : target.slice(hashAt);
  // "&" continues an existing query string, "?" starts one (the two were swapped).
  var separator = base.indexOf('?') > -1 ? '&' : '?';
  self.attr(attrName, base + separator + qstr + fragment);
});
Could work as a simple quick-fix.

Get domain name without subdomains using JavaScript?

How to get the domain name without subdomains?
e.g. if the url is "http://one.two.roothost.co.uk/page.html" how to get "roothost.co.uk"?

Following is a solution to extract a domain name without any subdomains. This solution doesn't make any assumptions about the URL format, so it should work for any URL. Since some domain names have one suffix (.com), and some have two or more (.co.uk), to get an accurate result in all cases, we need to parse the hostname using the Public Suffix List, which contains a list of all public domain name suffixes.
Solution
First, include the public suffix list js api in a script tag in your HTML, then in JavaScript to get the hostname you can call:
var parsed = psl.parse('one.two.roothost.co.uk');
console.log(parsed.domain);
...which will return "roothost.co.uk". To get the name from the current page, you can use location.hostname instead of a static string:
var parsed = psl.parse(location.hostname);
console.log(parsed.domain);
Finally, if you need to parse a domain name directly out of a full URL string, you can use the following:
var url = "http://one.two.roothost.co.uk/page.html";
// Let the URL parser isolate the hostname: splitting on "/" also keeps any
// credentials and port ("user@host:8080"), which are not part of a domain name.
var hostname = new URL(url).hostname; // Get the hostname
var parsed = psl.parse(hostname); // Parse the domain
document.getElementById("output").textContent = parsed.domain;
JSFiddle Example (it includes the entire minified library in the jsFiddle, so scroll down!): https://jsfiddle.net/6aqdbL71/2/

What about this?
/**
 * Returns the hostname without sub-domains, using a length heuristic: labels
 * of up to 3 characters in front of the final label are taken to be part of
 * the suffix ("com.ar"), and the last longer label is taken to be the domain.
 *
 * @param {string} hostname - e.g. "some.google.com.ar".
 * @returns {string} e.g. "google.com.ar".
 */
function getCanonicalHost(hostname) {
  const MAX_TLD_LENGTH = 3;
  const labels = hostname.split('.');
  // The final label is always the TLD, whatever its length ("info", "museum"),
  // so only the labels in front of it are candidates for the domain name.
  const domainIndex = labels
    .slice(0, -1)
    .findLastIndex((label) => label.length > MAX_TLD_LENGTH);
  // No candidate found (-1): keep the whole hostname.
  return labels.slice(Math.max(0, domainIndex)).join('.');
}
console.log(getCanonicalHost('mail.google.com'));
console.log(getCanonicalHost('some.google.com.ar'));
console.log(getCanonicalHost('some.another.google.com.ar'));
console.log(getCanonicalHost('foo.bar.google.com'));
console.log(getCanonicalHost('foo.bar.google.com.ar'));
console.log(getCanonicalHost('bar.google.ar'));
This works because https://developer.mozilla.org/en-US/docs/Learn/Common_questions/What_is_a_domain_name says:
TLDs can contain special as well as latin characters. A TLD's maximum length is 63 characters, although most are around 2–3.
https://data.iana.org/TLD/tlds-alpha-by-domain.txt lists 1481 TLDs; 466 of them are 2–3 characters long, and the most commonly used TLDs are no longer than 3 characters.
If you need a solution that works with all TLDs, here is a more complex approach:
/**
 * Resolves to the lower-cased hostname cut down so that it starts at its last
 * label that is not contained in the list provided by getCanonicalHost.tlds.
 *
 * @param {string} hostname
 * @returns {Promise<string>}
 */
function getCanonicalHost(hostname) {
  return getCanonicalHost.tlds.then(function(knownTlds) {
    const labels = hostname.toLowerCase().split('.');
    const lastNonTld = labels.findLastIndex(function(label) {
      return knownTlds.indexOf(label) === -1;
    });
    // -1 means every label is a listed TLD: keep them all.
    const from = lastNonTld < 0 ? 0 : lastNonTld;
    return labels.slice(from).join('.');
  });
}
// Loads the IANA TLD list once; resolves to an array of lower-case TLDs and
// rejects when the request fails.
getCanonicalHost.tlds = new Promise(function(res, rej) {
  const TLD_LIST_URL= 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt';
  const MAX_TLD_LENGTH = 63;
  const xhr = new XMLHttpRequest();
  xhr.addEventListener('error', rej);
  xhr.addEventListener('load', function() {
    // "load" also fires for HTTP error responses; do not parse an error page
    // as if it were the TLD list.
    if (xhr.status < 200 || xhr.status >= 300) {
      rej(new Error('Failed to load TLD list: HTTP ' + xhr.status));
      return;
    }
    const tlds = xhr.responseText
      .split('\n')
      .map(function(line) { return line.trim().toLowerCase(); })
      // Skip blank lines and "#" comment lines (the list starts with a version header).
      .filter(function(line) { return line !== '' && line.charAt(0) !== '#'; })
      // A label of exactly 63 characters is still valid, so the bound is inclusive.
      .filter(function(tld) { return tld.length <= MAX_TLD_LENGTH; });
    res(tlds);
  });
  xhr.open('GET', TLD_LIST_URL);
  xhr.send();
});
getCanonicalHost('mail.google.com').then(console.log);
getCanonicalHost('some.google.com.ar').then(console.log);
getCanonicalHost('some.another.google.com.ar').then(console.log);
getCanonicalHost('foo.bar.google.com').then(console.log);
getCanonicalHost('foo.bar.google.com.ar').then(console.log);
getCanonicalHost('bar.google.ar').then(console.log);

You can use parse-domain to do the heavy lifting for you. This package considers the public suffix list and returns an easy to work with object breaking up the domain.
Here is an example from their readme:
npm install parse-domain
import { parseDomain, ParseResultType } from 'parse-domain';
// Parse a hostname with parseDomain and, when it is listed in the public
// suffix list, log its sub-domains, domain and top-level domains.
const parseResult = parseDomain(
// should be a string with basic latin characters only. more details in the readme
'www.some.example.co.uk',
);
// check if the domain is listed in the public suffix list
if (parseResult.type === ParseResultType.Listed) {
const { subDomains, domain, topLevelDomains } = parseResult;
console.log(subDomains); // ["www", "some"]
console.log(domain); // "example"
console.log(topLevelDomains); // ["co", "uk"]
} else {
// more about other parseResult types in the readme
}

This works for me:
const firstTLDs = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|be|bf|bg|bh|bi|bj|bm|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|cl|cm|cn|co|cr|cu|cv|cw|cx|cz|de|dj|dk|dm|do|dz|ec|ee|eg|es|et|eu|fi|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jo|jp|kg|ki|km|kn|kp|kr|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|na|nc|ne|nf|ng|nl|no|nr|nu|nz|om|pa|pe|pf|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|yt".split('|');
const secondTLDs = "com|edu|gov|net|mil|org|nom|sch|caa|res|off|gob|int|tur|ip6|uri|urn|asn|act|nsw|qld|tas|vic|pro|biz|adm|adv|agr|arq|art|ato|bio|bmd|cim|cng|cnt|ecn|eco|emp|eng|esp|etc|eti|far|fnd|fot|fst|g12|ggf|imb|ind|inf|jor|jus|leg|lel|mat|med|mus|not|ntr|odo|ppg|psc|psi|qsl|rec|slg|srv|teo|tmp|trd|vet|zlg|web|ltd|sld|pol|fin|k12|lib|pri|aip|fie|eun|sci|prd|cci|pvt|mod|idv|rel|sex|gen|nic|abr|bas|cal|cam|emr|fvg|laz|lig|lom|mar|mol|pmn|pug|sar|sic|taa|tos|umb|vao|vda|ven|mie|北海道|和歌山|神奈川|鹿児島|ass|rep|tra|per|ngo|soc|grp|plc|its|air|and|bus|can|ddr|jfk|mad|nrw|nyc|ski|spy|tcm|ulm|usa|war|fhs|vgs|dep|eid|fet|fla|flå|gol|hof|hol|sel|vik|cri|iwi|ing|abo|fam|gok|gon|gop|gos|aid|atm|gsm|sos|elk|waw|est|aca|bar|cpa|jur|law|sec|plo|www|bir|cbg|jar|khv|msk|nov|nsk|ptz|rnd|spb|stv|tom|tsk|udm|vrn|cmw|kms|nkz|snz|pub|fhv|red|ens|nat|rns|rnu|bbs|tel|bel|kep|nhs|dni|fed|isa|nsn|gub|e12|tec|орг|обр|упр|alt|nis|jpn|mex|ath|iki|nid|gda|inc".split('|');
const knownSubdomains = "www|studio|mail|remote|blog|webmail|server|ns1|ns2|smtp|secure|vpn|m|shop|ftp|mail2|test|portal|ns|ww1|host|support|dev|web|bbs|ww42|squatter|mx|email|1|mail1|2|forum|owa|www2|gw|admin|store|mx1|cdn|api|exchange|app|gov|2tty|vps|govyty|hgfgdf|news|1rer|lkjkui";
/**
 * Strips sub-domains from a hostname using the firstTLDs, secondTLDs and
 * knownSubdomains tables.
 *
 * @param {string} s - Hostname, e.g. "www.example.co.uk".
 * @returns {string} Hostname without sub-domains, e.g. "example.co.uk".
 */
function removeSubdomain(s) {
  // "\\." keeps the backslash in the pattern so the dot is literal. A bare "\."
  // inside a template literal collapses to "." (any character), which turned
  // "webmaster.com" into "aster.com".
  const knownSubdomainsRegExp = new RegExp(`^(${knownSubdomains})\\.`, 'i');
  const stripped = s.replace(knownSubdomainsRegExp, '');
  // Only accept the stripped form when at least a domain and a TLD remain;
  // otherwise "mail.com" would be reduced to "com".
  const parts = (stripped.includes('.') ? stripped : s).split('.');
  while (parts.length > 3) {
    parts.shift();
  }
  if (parts.length === 3) {
    // With three labels left, the first one is a sub-domain unless the last
    // two look like a two-level suffix such as "co.uk" or "com.ar".
    const bothLong = parts[1].length > 2 && parts[2].length > 2;
    const neitherListed = secondTLDs.indexOf(parts[1]) === -1 && firstTLDs.indexOf(parts[2]) === -1;
    if (bothLong || neitherListed) {
      parts.shift();
    }
  }
  return parts.join('.');
}
var tests = {
'www.sidanmor.com': 'sidanmor.com',
'exemple.com': 'exemple.com',
'argos.co.uk': 'argos.co.uk',
'www.civilwar.museum': 'civilwar.museum',
'www.sub.civilwar.museum': 'civilwar.museum',
'www.xxx.sub.civilwar.museum': 'civilwar.museum',
'www.exemple.com': 'exemple.com',
'main.testsite.com': 'testsite.com',
'www.ex-emple.com.ar': 'ex-emple.com.ar',
'main.test-site.co.uk': 'test-site.co.uk',
'en.tour.mysite.nl': 'tour.mysite.nl',
'www.one.lv': 'one.lv',
'www.onfdsadfsafde.lv': 'onfdsadfsafde.lv',
'aaa.onfdsadfsafde.aa': 'onfdsadfsafde.aa',
};
// Two-letter labels compared against the LAST label of a three-label hostname
// by removeSubdomain (the "uk" in "example.co.uk").
const firstTLDs = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|be|bf|bg|bh|bi|bj|bm|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|cl|cm|cn|co|cr|cu|cv|cw|cx|cz|de|dj|dk|dm|do|dz|ec|ee|eg|es|et|eu|fi|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jo|jp|kg|ki|km|kn|kp|kr|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|na|nc|ne|nf|ng|nl|no|nr|nu|nz|om|pa|pe|pf|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|yt".split('|');
// Labels compared against the MIDDLE label of a three-label hostname by
// removeSubdomain (the "com" in "example.com.ar").
const secondTLDs = "com|edu|gov|net|mil|org|nom|sch|caa|res|off|gob|int|tur|ip6|uri|urn|asn|act|nsw|qld|tas|vic|pro|biz|adm|adv|agr|arq|art|ato|bio|bmd|cim|cng|cnt|ecn|eco|emp|eng|esp|etc|eti|far|fnd|fot|fst|g12|ggf|imb|ind|inf|jor|jus|leg|lel|mat|med|mus|not|ntr|odo|ppg|psc|psi|qsl|rec|slg|srv|teo|tmp|trd|vet|zlg|web|ltd|sld|pol|fin|k12|lib|pri|aip|fie|eun|sci|prd|cci|pvt|mod|idv|rel|sex|gen|nic|abr|bas|cal|cam|emr|fvg|laz|lig|lom|mar|mol|pmn|pug|sar|sic|taa|tos|umb|vao|vda|ven|mie|北海道|和歌山|神奈川|鹿児島|ass|rep|tra|per|ngo|soc|grp|plc|its|air|and|bus|can|ddr|jfk|mad|nrw|nyc|ski|spy|tcm|ulm|usa|war|fhs|vgs|dep|eid|fet|fla|flå|gol|hof|hol|sel|vik|cri|iwi|ing|abo|fam|gok|gon|gop|gos|aid|atm|gsm|sos|elk|waw|est|aca|bar|cpa|jur|law|sec|plo|www|bir|cbg|jar|khv|msk|nov|nsk|ptz|rnd|spb|stv|tom|tsk|udm|vrn|cmw|kms|nkz|snz|pub|fhv|red|ens|nat|rns|rnu|bbs|tel|bel|kep|nhs|dni|fed|isa|nsn|gub|e12|tec|орг|обр|упр|alt|nis|jpn|mex|ath|iki|nid|gda|inc".split('|');
// Regex alternation of host prefixes that removeSubdomain strips from the
// front of a hostname when they are followed by a dot.
const knownSubdomains = "www|studio|mail|remote|blog|webmail|server|ns1|ns2|smtp|secure|vpn|m|shop|ftp|mail2|test|portal|ns|ww1|host|support|dev|web|bbs|ww42|squatter|mx|email|1|mail1|2|forum|owa|www2|gw|admin|store|mx1|cdn|api|exchange|app|gov|2tty|vps|govyty|hgfgdf|news|1rer|lkjkui";

/**
 * Strips leading subdomain labels from a hostname using list-based heuristics.
 *
 * Steps: drop one well-known prefix label (see knownSubdomains), keep at most
 * the last three labels, then drop the first of those three when either the
 * last two labels are both longer than two characters, or neither label
 * appears in the secondTLDs / firstTLDs lists.
 *
 * This is a heuristic, not a Public Suffix List lookup, and list lookups are
 * case-sensitive.
 *
 * @param {string} s - Bare hostname (no scheme, port or path), e.g. "www.example.co.uk".
 * @returns {string} The hostname with leading subdomain labels removed.
 */
function removeSubdomain(s) {
  // The dot must be written as "\\." here: inside a template literal a lone
  // "\." collapses to ".", which turned the pattern into "prefix + ANY char"
  // and mangled names such as "mysite.com" (-> "site.com").
  const knownSubdomainsRegExp = new RegExp(`^(${knownSubdomains})\\.`, 'i');
  const stripped = s.replace(knownSubdomainsRegExp, '');
  // Accept the stripped form only if a dotted name is left, so a two-label
  // host like "shop.com" is not reduced to just "com".
  const host = stripped.includes('.') ? stripped : s;

  // Never keep more than the last three labels.
  const parts = host.split('.').slice(-3);
  if (parts.length === 3) {
    const [, second, last] = parts;
    const bothLong = second.length > 2 && last.length > 2;
    const noListedSuffix = !secondTLDs.includes(second) && !firstTLDs.includes(last);
    if (bothLong || noListedSuffix) {
      parts.shift();
    }
  }
  return parts.join('.');
}
// Self-check: run every sample hostname in `tests` through removeSubdomain and
// log whether the result matches the expected value.
// Object.entries visits only own enumerable keys, so no hasOwnProperty guard is
// needed, and const keeps the loop variables out of the enclosing scope.
for (const [input, expected] of Object.entries(tests)) {
  const actual = removeSubdomain(input);
  if (expected === actual) {
    console.log(`OK: "${input}" should be "${expected}" and it is really "${actual}"`);
  } else {
    console.log(`Fail: "${input}" should be "${expected}" but it is NOT "${actual}"`);
  }
}
References:
psl.min.js file
Maximillian Laumeister's answer to this question
The most popular subdomains on the internet

Simplest solution:
// Keep only the last two labels of the current hostname.
// slice(-2) also copes with a single-label host such as "localhost", where
// indexing [length - 2] used to produce "https://undefined.localhost".
// NOTE(review): the scheme is hard-coded to https regardless of the page's own
// protocol; confirm that is intended.
const domain = `https://${window.location.hostname.split('.').slice(-2).join('.')}`;
alert(domain);

I created this function, which uses URL to parse the hostname. It cheats by assuming every hostname has four or fewer parts.
/**
 * Returns the hostname of `url` without its leading subdomain labels.
 *
 * Heuristic: a hostname with exactly four labels keeps its last three
 * (e.g. "www.google.co.uk" -> "google.co.uk"); any other hostname keeps its
 * last two.
 *
 * @param {string} url - Absolute URL; passed to the URL constructor, which
 *   throws a TypeError when it cannot be parsed.
 * @returns {string} The remaining labels joined with dots.
 */
const getDomainWithoutSubdomain = (url) => {
  const labels = new URL(url).hostname.split('.');
  const wanted = labels.length === 4 ? 3 : 2;
  // Discard labels from the front until only the wanted number is left.
  while (labels.length > wanted) {
    labels.shift();
  }
  return labels.join('.');
};
// Demo: print the reduced domain for a handful of sample URLs.
for (const url of [
  'https://www.google.com',
  'https://www.google.co.uk',
  'https://mail.google.com',
  'https://www.bbc.co.uk/news',
  'https://github.com',
]) {
  console.log(getDomainWithoutSubdomain(url));
}

Here is a working JSFiddle
My solution works with the assumption that the root hostname you are looking for is of the type "abc.xyz.pp".
extractDomain() returns the hostname with all the subdomains.
getRootHostName() splits the hostname by . and then based on the assumption mentioned above, it uses the shift() to remove each subdomain name.
Finally, whatever remains in parts[], it joins them by . to form the root hostname.
Javascript
// Sample URL fed to getRootHostName() at the end of this snippet.
const urlInput = 'http://one.two.roothost.co.uk/page.html';
/**
 * Extracts the hostname (with all subdomains) from a URL-like string.
 *
 * Accepts full URLs ("http://host/page"), protocol-relative URLs ("//host/x")
 * and scheme-less strings ("host:8080/page"). Scheme, userinfo, port, path,
 * query and fragment are all removed.
 *
 * Limitation: bracketed IPv6 literals are not supported, because the port is
 * removed by cutting at the first ":".
 *
 * @param {string} url - URL or host string.
 * @returns {string} The hostname, or "" when none is present.
 */
function extractDomain(url) {
  // Strip a scheme only when it is at the very start; a "://" further along
  // (e.g. inside a query string) is not a scheme.
  const withoutScheme = url.replace(/^(?:[a-z][a-z0-9+.-]*:)?\/\//i, '');
  // The authority ends at the first path, query or fragment delimiter.
  const authority = withoutScheme.split(/[/?#]/, 1)[0];
  // Drop "user:password@" credentials, if any.
  const hostAndPort = authority.slice(authority.lastIndexOf('@') + 1);
  // Drop the port number.
  return hostAndPort.split(':')[0];
}
/**
 * Returns the root hostname of a URL, assuming the root always has the shape
 * "abc.xyz.pp" (three labels): every label before the last three is dropped.
 * Hostnames with three or fewer labels are returned unchanged.
 *
 * @param {string} url - URL or host string, passed to extractDomain().
 * @returns {string} At most the last three labels of the hostname, dot-joined.
 */
function getRootHostName(url) {
  // slice(-3) replaces the shift() loop, whose counter `i` was never declared:
  // that made it an implicit global and a ReferenceError in strict mode and
  // ES modules.
  return extractDomain(url).split('.').slice(-3).join('.');
}
// Show the result. textContent is used instead of innerHTML because the value
// is derived from a URL and must be rendered as text, never parsed as markup.
document.getElementById("result").textContent = getRootHostName(urlInput);
HTML
<div id="result"></div>
EDIT 1: Updated the JSFiddle link. It was reflecting the incorrect code.

What about...
/**
 * Returns the last two labels of document.domain, ignoring a leading "www.".
 *
 * @returns {string} e.g. "example.com" for "www.shop.example.com"; "" when
 *   document.domain is empty.
 */
function getDomain() {
  const host = document.domain;
  if (!host.length) {
    return '';
  }
  const labels = host.replace(/^(www\.)/, "").split('.');
  // Everything before the final two labels is treated as a subdomain.
  const lastTwo = labels.slice(-2).join('.');
  // Trim any stray leading or trailing dots.
  return lastTwo.replace(/(^\.*)|(\.*$)/g, "");
}

My solution worked for me: Get "gocustom.com" from "shop.gocustom.com"
// Keep the last two dot-separated labels: "shop.gocustom.com" -> "gocustom.com".
const site_domain_name = 'shop.gocustom.com';
alert(site_domain_name);
// slice/join replaces eval() around the index arithmetic: eval() of a number
// simply returns that number, so it added nothing but an anti-pattern.
alert(site_domain_name.split('.').slice(-2).join('.'));

You can try this in JavaScript:
// Pop up the hostname of the current page.
window.alert(window.location.hostname);
It will return the hostname.

Develop Reference

JavaScript is the programming language of the Web.

Get the domain and page name from a String URL - javascript

I'm currently having some issues manipulating a URL. Technically, what I want is to get the domain name and the page name from a page. For example: www.myWebSite.com => domain: myWebSite; http://myWebSite.com => domain: myWebSite; myWebSite.com/xxx.html => domain: myWebSite, page: xxx

window.location.hostname; // Domain name
$("title").text(); // Page name
// EDIT:
const loc = window.location;
const pathSegments = loc.pathname.split("/");
// The file name is the last path segment. The original indexed with
// `pathname.length`, but no variable named `pathname` exists, so it threw a
// ReferenceError.
const filename = pathSegments[pathSegments.length - 1];
alert("Domain: " + loc.hostname);
alert("Filename: " + filename);

try with url.match(/:\/\/(.[^/]+)/)[1] example : var r = /:\/\/(.[^/]+)/; "http://stackoverflow.com/questions/5343288/get-the-domain-and-page-name-from-a-string-url".match(r)[1] => stackoverflow.com

Use window.location.hostname or window.location.host. Check location reference.

Related

nodeJS Crawler : unable to get the tagname associated with search word

Getting domain without subdomain from an url with javascript [duplicate]

Adding a Parameter to Url with javascript

How can I use javascript to attach url parameters to all outgoing requests?

Get domain name without subdomains using JavaScript?

Categories

Resources