Fetching word count in a web page

Fetching word count in a web page - javascript

This must have been a very generic question but I have not come across any concrete or stable solution for this.
I just want to fetch the number of words in a web page but across all the browsers. My current implementation is
var body = top.document.body;
if(body) {
var content = body.innerText || body.textContent;
content = content.replace(/\n/ig,' ');
content = content.replace(/\s+/gi,' ');
content = content.replace(/(^\s|\s$)/gi,'');
if(!body.innerText) {
content = content.replace(/<script/gi,'');
}
console.log(content);
console.log(content.split(' ').length);
}
This works well but it does not work with some Firefox browsers as innerText does not work on Firefox.
If I use textContent then it displays the contents of JS tags too if present. Eg if a web page content is
<body>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"></script>
<script type="text/javascript">
console.log('Hellow World');
var some = "some";
var two = "two";
var three = "three";
</script>
<h1 style="text-align:center">Static content from Nginx</h1>
<div>
This is a
static.
<div>
This is a
static.
</div>
</div>
</body>
Then textContent will have JS code too in the content which will give me wrong word count.
What is the concrete solution that can work across any environment.
PS: No JQuery

Ok, you have there two problems:
Cross-browser innerText
I'd go with:
var text = document.body[('innerText' in document.body) ? 'innerText' : 'textContent'];
That, to prefer innerText over textContent.
Stripping result of <script> tags.
dandavis offers a neat solution to that:
function noscript(strCode){
var html = $(strCode.bold());
html.find('script').remove();
return html.html();
}
And a non-jQuery solution:
function noscript(strCode){
return strCode.replace(/<script.*?>.*?<\/script>/igm, '')
}
A function that will turn the string into a "fake" html document, strip its script tags and return the raw result.
Of course, you may improve the function to remove also <style> tags and others.
Counting letters
Your method to do the job is alright, but still, I think that a simple regex would do the job much better. You can count the words in a string using:
str.match(/\S+/g).length;
Finally
Final result should look like
var body = top.document.body;
if(body) {
var content = document.body[('innerText' in document.body) ? 'innerText' : 'textContent'];
content = noscript(content);
alert(content.match(/\S+/g).length);
}

What about hidden/invisible/overlayed blocks? do you want to count words inside all of it? what about images (alt tag of image)
if you want to count all - just strip tags and count test of all rest blocks. smth like that $('body :not(script)').text()

Thank you so much for giving such a helpful answers. I found this approach to use if the innerText is not defined in a browser. And the result that we get is very much similar to innerText. Hence I think it will be consistent across all the browsers.
All of you please look into it and let me know if this answer can be accepted. And let me know if you guys find any discrepancy in this method I am using.
function getWordCount() {
try {
var body = top.document.querySelector("body");
if (body) {
var content = body.innerText || getInnerText(top.document.body, top);
content = content.replace(/\n/ig, ' ');
var wordCount = content.match(/\S+/gi).length;
return wordCount;
}
} catch (e) {
processError("getWordCount", e);
}
}
function getInnerText(el, win) {
try {
win = win || window;
var doc = win.document,
sel, range, prevRange, selString;
if (win.getSelection && doc.createRange) {
sel = win.getSelection();
if (sel.rangeCount) {
prevRange = sel.getRangeAt(0);
}
range = doc.createRange();
range.selectNodeContents(el);
sel.removeAllRanges();
sel.addRange(range);
selString = sel.toString();
sel.removeAllRanges();
prevRange && sel.addRange(prevRange);
} else if (doc.body.createTextRange) {
range = doc.body.createTextRange();
range.moveToElementText(el);
range.select();
}
return selString;
} catch (e) {
processError('getInnerText', e);
}
}
The result that I am getting is same as that of innerText and is more accurate than using regex, or removing tags etc.
Please give me ur views on this.

Related

Trying to figure out how to create links using createTextNode

First off I would like to say, the person that originally created this portion of the code is no longer on the team.
We are creating a development tool to Administrate and Develop servers for our game, that has its own programming language.
I'm using JavaFX with WebView to generate the chat area of the development tool to communicate with other developers and staff. However I want it so hen you post a link it actually shows as a link instead of plain text. I have tried things such as AutoLinker with no success. Here is the HTML portion of the webview.
<script src=".././scripts/Autolinker.js"></script>
<script type="text/javascript">
app = null;
const messages = document.getElementById("messages");
function addMessage(message, options) {
const p = document.createElement("p");
const c = message.indexOf(":");
const modifiedMessage = message; //replaceURLWithHTMLLinks(message);
const ridBrackets = options.replace(/[\[\]']/g, "");
const tokenize = ridBrackets.split(",", 2);
const rcChatOptions = tokenize;
const mFontColor = tokenize[rcChatOptions.BFONTCOLOR];
let timeStampFormat = tokenize[rcChatOptions.TIMESTAMP];
if(c > -1) {
const u = document.createElement("span");
const a = document.createElement("a");
u.className = "user";
if(mFontColor != null) {
u.style.color = mFontColor;
} else {
u.style.color = "#00c02b";
}
//Turn plain text links into actual links
u.appendChild(document.createTextNode(Autolinker.link(modifiedMessage.substring(0, c + 1))));
p.appendChild(u);
if(document.selectedfont != null) {
p.style.fontFamily = document.selectedfont;
}
p.appendChild(document.createTextNode(modifiedMessage.substring(c + 1)));
} else {
p.appendChild(document.createTextNode(modifiedMessage));
}
// Append message and scroll to bottom (if at bottom)
const scrollTop = document.body.scrollTop;
const scrolledToBottom = scrollTop + window.innerHeight >= document.body.scrollHeight;
if(scrolledToBottom) {
messages.appendChild(p);
window.scrollTo(document.body.scrollLeft, document.body.scrollHeight - window.innerHeight);
} else {
messages.appendChild(p);
}
messages.style.backgroundColor = "transparent";
}
</script>
I removed portions of the code that I felt was just a distraction.
This what the tool looks like
https://share.getcloudapp.com/kpuNDB4m
this is what it looks like using AutoLinker
https://share.getcloudapp.com/8LunomDL
(So auto linker is doing its job, it just still isn't rending as HyperLinks)

It looks like the TextNode is created after collecting some substring which would be the link. Here's an example of what it would look like if a link was created directly in js then passed to the TextNode.
One thing you can do is place the text inside of an a tag within a paragraph and then convert like so:
var link = document.createElement('link');
link.innerHTML = 'Website: <a href="http://somelink.com" </a>
link.href = 'http://somelink.com';
link.appendChild(document.createTextNode('http://somelink.com'));

After getting pointed in the right direction (By Frank, Thank You) I found a javascript Library that helped me accomplish what I was looking for.
Library
https://github.com/cferdinandi/saferInnerHTML
Here is an example!
https://share.getcloudapp.com/nOuDPnlp
Usage:
saferInnerHTML(message, modifiedMessage, true);
The last param is an option, append or overwrite.
Obviously, I will have to do some CSS work to make them not display as buttons. But it is exactly what I was trying to achieve.

Toggle any tag just like execCommand bold does with b

I have contenteditable div and need to be able to wrap/unwrap text into other tags like <code> and so on.
Wrapping them isn't problem, rather unwrapping selected text is, like when there is <b>hello </b><code>world</code> and I need to unwrap only letters in middle (orl), there doesn't seem to be simple way detect if selected text is inside <code> tag or no. So while document.execCommand("bold") does this wrapping/unwrapping job perfectly with <b> and <i> tags, is there any way to do same with other tags except b, i, u?

So after longer time searching for solution and finding nothing, I decided to try to write it on my own, so if anyone finds this piece useful I will be glad.
I don't know how it performs on IE or Safari and for sure it's not the most efficient way how to do it, but for most of browsers it should do the job.
<!doctype html>
<html>
<head>
<title>Wrap</title>
<meta charset="utf-8">
</head>
<body>
<script>
function traverseParentCE(el){
var parent = el.parentElement;
if(!parent || (el && el.contentEditable=="true")) return el;
while(parent && parent.contentEditable!="true"){
parent = parent.parentElement;
}
return parent;
}
function finishWrap(el, innHtml, marker, wrapper){
el.innerHTML = innHtml.replace("<"+marker+">","").replace("</"+marker+">","");
var node = el.getElementsByTagName(wrapper)[0];
var range = document.createRange();
range.setStart(node, 0);
range.setEnd(node, 1);
var sel = document.getSelection();
sel.removeAllRanges();
sel.addRange(range);
}
function wrap(tag){
var marker = "marker";
var wrapper = "inncnt";
var ae = document.activeElement;
if(ae.contentEditable!="true") return;
var sel = document.getSelection();
var range = sel.getRangeAt(0);
var el = document.createElement(marker);
el.appendChild(range.cloneContents());
range.deleteContents();
range.insertNode(el);
var innHtml = el.innerHTML;
var count = 0;
while(true){ // I know right, this kind of replacing is horrible, but RegExp somehow didn't work for me
var pos = innHtml.indexOf("<"+tag+">");
if(pos==-1) break;
innHtml = innHtml.replace("<"+tag+">","").replace("</"+tag+">","");
count++;
}
innHtml.replace("<"+wrapper+">","").replace("</"+wrapper+">","");
el.innerHTML = "<"+wrapper+">" + innHtml + "</"+wrapper+">";
var container = traverseParentCE(range.commonAncestorContainer);
innHtml = container.innerHTML;
if(count>0){
return finishWrap(container, innHtml, marker, wrapper);
}
var pos = innHtml.indexOf("<"+marker+">");
var contentBefore = innHtml.substr(0, pos);
var lastStarting = contentBefore.indexOf("<"+tag+">");
var lastEnding = contentBefore.indexOf("</"+tag+">");
var wrap = false;
if(lastStarting == -1) wrap = true;
else if(lastStarting < lastEnding) wrap = true;
if(wrap) innHtml = innHtml.replace(marker, tag).replace(marker, tag);
else innHtml = innHtml.replace(marker, "/"+tag).replace("/"+marker,tag);
var doublet = "<"+tag+"></"+tag+">";
while(true){
var pos = innHtml.indexOf(doublet);
if(pos==-1) break;
innHtml = innHtml.replace(doublet, "");
}
return finishWrap(container, innHtml, marker, wrapper);
}
</script>
<button onmousedown="wrap('b'); return false;" onmouseup="return false;" onclick="return false;">B</button>
<button onmousedown="wrap('i'); return false;" onmouseup="return false;" onclick="return false;">I</button>
<button onmousedown="wrap('code'); return false;" onmouseup="return false;" onclick="return false;">Code</button>
<div style="width: 800px; height: 300px; background-color: yellow" contenteditable="true" id="out">
Hello <b>World</b>! This <code>is some</code> sentence.
</div>
</body>
</html>

Modifying contenteditable divs is no simple task. What you ask is possible, but far too large a topic and isn't answerable with a few examples here.
However there are many ready-made libraries that have done this before (for example CKEditor. They are very complex, but also very flexable. Out of the box, they probably do much more than you need, but they can be configured so that features you don't need can be disabled, and they have an API allowing you to control them from the outside, if needed.

Javascript pulling content from commented html

Bit of a JS newbie, I have a tracking script that reads the meta data of the page and places the right scripts on that page using this:
var element = document.querySelector('meta[name="tracking-title"]');
var content = element && element.getAttribute("content");
console.log(content)
This obviously posts the correct tag to console so I can make sure it's working .. and it does in a test situation. However, on the actual website the meta data i'm targeting is produced on the page by a Java application and beyond my control, the problem is it is in a commented out area. This script cannot read within a commented out area. ie
<!-- your tracking meta is here
<meta name="tracking-title" content="this-is-the-first-page">
Tracking finished -->
Any ideas appreciated.

You can use this code:
var html = document.querySelector('html');
var content;
function traverse(node) {
if (node.nodeType == 8) { // comment
var text = node.textContent.replace(/<!--|-->/g, '');
var frag = document.createDocumentFragment();
var div = document.createElement('div');
frag.appendChild(div);
div.innerHTML = text;
var element = div.querySelector('meta[name="tracking-title"]');
if (element) {
content = element.getAttribute("content");
}
}
var children = node.childNodes;
if (children.length) {
for (var i = 0; i < children.length; i++) {
traverse(children[i]);
}
}
}
traverse(html);

One way is to use a NodeIterator and get comment nodes. Quick example below. You will still need to parse the returned value for the data you want but I am sure you can extend this here to do what you want.
Fiddle: http://jsfiddle.net/AtheistP3ace/gfu791c5/
var commentedOutHTml = [];
var iterator = document.createNodeIterator(document.body, NodeFilter.SHOW_COMMENT, NodeFilter.FILTER_ACCEPT, false);
var currentNode;
while (currentNode = iterator.nextNode()) {
commentedOutHTml.push(currentNode.nodeValue);
}
alert(commentedOutHTml.toString());

You can try this. This will require you to use jQuery however.
$(function() {
$("*").contents().filter(function(){
return this.nodeType == 8;
}).each(function(i, e){
alert(e.nodeValue);
});
});

JavaScript/jQuery: hasDescendant / hasAncestor

Hilariously, I'm having incredible difficulty finding any half-good way to determine whether or not an HTML element is inside another one or not -- which seems like it should be a basic core feature of traversing and analyzing the HTML DOM. I was immensely surprised and disappointed that the "hasDescendant" (or likewise) method is missing.
I'm trying to do this:
var frog = $('#frog');
var walrus = $('#walrus');
if (frog.hasDescendant(walrus)) console.log("Frog is within walrus.");
else console.log("Frog is outside walrus.");
I've tried to reproduce what I'm looking for with many jQuery combinations.
walrus.is(frog.parents());
walrus.has(frog);
walrus.find(' *').has(frog);
frog.is(walrus.find(' *'));
I haven't found a working solution yet.
[edit]
Solution: walrus.has(frog)
Alternate: if (walrus.has(frog)) { doStuff(); }
Alternate: var booleanResult = walrus.has(frog).length>0;
//Chase.

jQuery has just the function for this: jQuery.contains: "Check to see if a DOM element is within another DOM element."
Demo (live copy):
HTML:
<p id="frog">Frog <span id="walrus">walrus</span></p>
JavaScript:
jQuery(function($) {
var frog = $("#frog"),
walrus = $("#walrus");
display("frog contains walrus? " + $.contains(frog[0], walrus[0]));
display("walrus contains frog? " + $.contains(walrus[0], frog[0]));
function display(msg) {
$("<p>").html(msg).appendTo(document.body);
}
});

Use
if (walrus.has(frog).length) {
// frog is contained in walrus..
}
Demo at http://jsfiddle.net/gaby/pZfLm/
An alternative
if ( $('#frog').closest('#walrus').length ){
// frog is contained in walrus..
}
demo at http://jsfiddle.net/gaby/pZfLm/1/

If you wanted to write your own function you could do it without jQuery, perhaps something like this:
function isAncestor(ancestor, descendent) {
while ((descendent = descendent.parentNode) != null)
if (descendent == ancestor)
return true;
return false;
}

Changing innerHTML of script tags in IE for loading google plusone button explicitly

To add Google's plusone button on your website the following script tag is to be inserted (for explicit load).
<script type="text/javascript" src="https://apis.google.com/js/plusone.js">
{"parsetags": "explicit"}
</script>
It looks pretty straight forward in HTML. However I wan't to insert the script using a JS file. So I use the following code:
var e = document.createElement('script');
e.src = "https://apis.google.com/js/plusone.js";
e.id = "googplusonescript";
e.innerHTML = '{"parsetags": "explicit"}';
document.getElementsByTagName("head")[0].appendChild(e);
It works pretty awesome in all the browsers except IE because IE doesn't allow us to write the innerHTML of script tags. Anywork arounds anyone ? (I have jquery inserted in the page. So can use jquery too.)

Came across the same issue.
Under IE you should use
script.text = '{"parsetags": "explicit"}';
instead of script.innerHTML

try creating a textNode and appending it to your script tags:
var myText = document.createTextNode('{"parsetags": "explicit"}');
myScriptTag.appendChild(myText);

Try the following:
window['___gcfg'] = { parsetags: 'explicit' };

var ispoloaded;
function showpo() {
var pohd = document.getElementsByTagName('head')[0];
var poscr = document.createElement('script');
poscr.type ='text/javascript';
poscr.async = true;
poscr.text ='{"parsetags": "explicit"}'; //works on IE too
poscr.src = "http://apis.google.com/js/plusone.js";
pohd.appendChild(poscr);
ispoloaded = setInterval(function () {
//check if plusone.js is loaded
if(typeof gapi == 'object') {
//break the checking interval if loaded
clearInterval(ispoloaded);
//render the button, if passed id does not exists it renders all plus one buttons on page
gapi.plusone.go("idthatexists");
}
}, 100); //checks every 0.1 second
}

Develop Reference

JavaScript is the programming language of the Web.

Fetching word count in a web page - javascript

What about hidden/invisible/overlayed blocks? do you want to count words inside all of it? what about images (alt tag of image) if you want to count all - just strip tags and count test of all rest blocks. smth like that $('body :not(script)').text()

Related

Trying to figure out how to create links using createTextNode

Toggle any tag just like execCommand bold does with b

Javascript pulling content from commented html

JavaScript/jQuery: hasDescendant / hasAncestor

Changing innerHTML of script tags in IE for loading google plusone button explicitly

Categories

Resources