Detecting Hebrew words in document via JavaScript

Detecting Hebrew words in document via JavaScript - javascript

I'm a mostly-newbie when it comes to web development (though not to programming in general) so pardon any incorrect terminology.
I want to build a script that, when added to an HTML page, detects each Hebrew word in the page and transforms that word into an HTML element, e.g. into a hyperlink with title.
So, the following:
<p>ראש הלשכה</p>
Is transformed into:
<p><a title="word 1" href="#">הלשכה</a> <a title="word 2" href="#">ראש</a></p>
Make sense?
So, I suppose the first order of business is detecting Hebrew words in a page. How would I go about doing this? I don't know where to start, outside of poking around jQuery documentation.

Searching for a Hebrew word in a string is fairly simple. Use a regexp that matches a contiguous sequence of Hebrew code points:
/[\u05D0-\u05FF]+/
Since JS supports functional programming, we can easily write our own functions to walk the document tree, calling a function on each text node. First, a bit of scaffolding.
/* Debug scaffolding: installs a global assert() unless the page already
   provides one.
*/
if (!window.assert) {
    window.dbgLvl = 1; // change this to 0 for production release
    /*
    assert: throws when a check fails and debugging is enabled.
    Arguments:
    succeeded (any): the result of the check; falsy means failure.
    msg (string, optional): failure description; defaults to 'assertion failed'.
    */
    window.assert = function (succeeded, msg) {
        if (dbgLvl && !succeeded) {
            // Throw an Error rather than a bare string so the failure
            // carries a stack trace.
            throw new Error(msg || 'assertion failed');
        }
    };
}
Next, we define a method to split strings into an array, including separators in the output.
/* String.separate is like String.split, but the result includes the
   separators, e.g. 'a-b'.separate('-') gives ['a', '-', 'b'].
   These implementations of 'String.separate' will work for our purposes,
   but are buggy in general, due to differences in the implementation of
   String.split.
   The two misbehaviors we correct are including neither grouped patterns
   nor empty strings in the result, though the latter is only corrected
   when the missing empty string is at the start or the end.
   Arguments:
   separator (RegExp || string): a string is compiled to a global RegExp;
     a RegExp should be global (and, for the first implementation,
     wrapped in a capturing group).
   Returns an array alternating unmatched text and matched separators.
*/
(function () {
    /* Compiles a string separator to a global RegExp; RegExps pass through. */
    function toGlobalRegExp(separator) {
        return (typeof separator == 'string') ? new RegExp(separator, 'g') : separator;
    }

    /* Weaves the unmatched pieces ("posts") and the matches ("fence")
       together as post, fence, post, ..., post. */
    function interleave(posts, fence) {
        var result = [], i;
        for (i = 0; i < fence.length; ++i) {
            result.push(posts[i]);
            result.push(fence[i]);
        }
        result.push(posts[i]);
        return result;
    }

    if ('-'.split(/(-)/).length & 1) {
        assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
        // split includes groups in result
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                if (separator.charAt(0) != '('
                    || separator.charAt(separator.length - 1) != ')')
                {
                    separator = new RegExp('(' + separator + ')', 'g');
                } else {
                    separator = new RegExp(separator, 'g');
                }
            }
            return this.split(separator);
        };
    } else if ('a'.split(/a/).length) {
        // empty strings included, grouped aren't
        String.prototype.separate = function (separator) {
            separator = toGlobalRegExp(separator);
            var fence = this.match(separator);
            if (!fence) {
                // String(this): return a primitive, not the boxed receiver.
                return [String(this)];
            }
            var posts = this.split(separator);
            // Compare, don't assign: the original '=' overwrote posts.length.
            assert(posts.length == fence.length + 1);
            return interleave(posts, fence);
        };
    } else {
        // neither empty strings nor groups are included. IE, you suck.
        String.prototype.separate = function (separator) {
            separator = toGlobalRegExp(separator);
            var fence = this.match(separator);
            if (!fence) {
                return [String(this)];
            }
            var posts = this.split(separator);
            if (posts.length <= fence.length) {
                /* missing some posts. Assume that they are the first or
                   last, though this won't be true in general.
                */
                if (posts.length < fence.length) {
                    posts.unshift('');
                    posts.push('');
                } else if (this.substring(0, fence[0].length) == fence[0]) {
                    posts.unshift('');
                } else {
                    posts.push('');
                }
            }
            return interleave(posts, fence);
        };
    }
})();
Next, some node predicates.
/* Make sure Node.TEXT_NODE exists, even in browsers that don't expose
   the Node interface or its constants. */
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node = {TEXT_NODE: 3};
}
/* True when 'node' is a text node. */
function isTextNode(node) {
    return Node.TEXT_NODE == node.nodeType;
}
/* Truthy when 'node' has at least one child; yields the child count (or the
   falsy childNodes value) rather than a strict boolean. */
function hasKids(node) {
    var kids = node.childNodes;
    return kids && kids.length;
}
/* Predicate that accepts every node. */
function allNodes(node) {
    return true;
}
Now the functions to walk the DOM.
/*
forEachChild: pre-order traversal of document tree. Applies a function to some nodes, determined by the 'which' and 'descendInto' arguments.
Arguments:
which (function): Returns true if 'action' should be applied to a node.
action (function): Takes a node and does something to it. It may remove or
  replace the node it is given; traversal continues with the sibling that
  followed the node before 'action' ran.
parent (Node): The node to start from.
descendInto (function, optional): By default, forEachChild will descend into every child that itself has children. Place additional restrictions by passing this argument.
*/
var forEachChild = (function() {
    /* the actual implementation is made a local function so that the
       optional parameter can be handled efficiently.
    */
    function walk(which, action, node, descendInto) {
        var child = node.firstChild;
        var next;
        while (child) {
            // Remember the sibling first: once 'action' detaches 'child'
            // (e.g. via replaceChild) its nextSibling is gone and the
            // rest of this level would be skipped.
            next = child.nextSibling;
            if (which(child)) {
                action(child);
            }
            if (hasKids(child) && descendInto(child)) {
                walk(which, action, child, descendInto);
            }
            child = next;
        }
    }
    return function (which, action, node, descendInto) {
        walk(which, action, node, descendInto || allNodes);
    };
})();
/* Applies 'action' to every node under the document that 'which' accepts. */
function forEachNode(which, action, descendInto) {
    var root = document;
    return forEachChild(which, action, root, descendInto);
}
/* Applies 'action' to every text node under the document. */
function forEachTextNode(action, descendInto) {
    var root = document;
    return forEachChild(isTextNode, action, root, descendInto);
}
/* Applies 'action' to every text node under document.body. */
function forEachTextNodeInBody(action, descendInto) {
    var body = document.body;
    return forEachChild(isTextNode, action, body, descendInto);
}
The last group of functions replace text in a text node that matches a pattern with a new node of your choosing. This group (well, the function returned by wrapText) hasn't been completely tested for cross-browser compatibility, including whether it handles text direction properly.
/*
wrapText builds a text-node processor that swaps every substring matching
'pattern' for a node produced by 'replace'.
Arguments:
pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
replace (function): Takes a string and returns a Node to replace the string.
Returns a function that takes a text node.
*/
function wrapText(pattern, replace) {
    return function (node) {
        var pieces = node.nodeValue.separate(pattern);
        // Fewer than two pieces means nothing matched; leave the node alone.
        if (pieces.length >= 2) {
            var frag = document.createDocumentFragment();
            var idx = 1;
            // Pieces alternate plain, match, plain, ..., plain. An empty
            // leading plain piece is not worth a text node.
            if (pieces[0].length) {
                frag.appendChild(document.createTextNode(pieces[0]));
            }
            while (idx < pieces.length) {
                frag.appendChild(replace(pieces[idx]));                    // matched piece
                frag.appendChild(document.createTextNode(pieces[idx + 1])); // plain piece after it
                idx += 2;
            }
            // With an odd piece count the loop stops exactly at pieces.length,
            // so the final plain piece has been appended above.
            assert(idx == pieces.length, 'even number of chunks in ['+pieces+'] when it should be odd.');
            if (!pieces[idx - 1].length) {
                // The final plain piece was empty; drop its text node again.
                frag.removeChild(frag.lastChild);
            }
            node.parentNode.replaceChild(frag, node);
        }
    };
}
/*
createAnchorWrap produces a wrapper that turns a string into an anchor
element whose text is the string and whose title is derived from it.
Arguments:
title (string || function, optional): The title for the anchor element.
If title is a function, it's called with the string to wrap. If
title is a string, it is used as the prefix of a word counter; if it is
omitted, a word counter with the default prefix is used.
Returns a function that takes a string and returns an anchor element.
*/
function createAnchorWrap(title) {
    var titleFor = title;
    if (typeof title == 'string') {
        titleFor = createWordCounter(title);
    } else if (!title) {
        titleFor = createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = titleFor(word);
        anchor.appendChild(document.createTextNode(word));
        return anchor;
    };
}
/*
createWordCounter creates a word counter, which returns the number of
times it's been called (including the current call), prefixed by a string.
Arguments:
pre (string, optional): prefix for return value; trailing spaces are
  normalized to exactly one. Defaults to 'word '.
Returns a function that takes a string (ignored) and returns a string,
e.g. 'word 1', 'word 2', ...
*/
function createWordCounter(pre) {
    var wordCount = 0;
    if (pre) {
        pre = pre.replace(/ *$/, ' ');
    } else {
        pre = 'word ';
    }
    return function (text) {
        // Count this call before reporting; the original never incremented,
        // so every title came out as "<prefix> 0".
        wordCount += 1;
        return pre + wordCount;
    };
}
The last thing you have to do is start the process in (e.g.) a load handler or a script at the bottom of the page.
// Kick off the transformation: in every text node under document.body, wrap
// each run of characters in the U+05D0-U+05FF range in an anchor whose title
// comes from createAnchorWrap()'s default word counter.
forEachTextNodeInBody(wrapText(/([\u05D0-\u05FF]+)/g,
createAnchorWrap()));
If you want to change the prefix for the title, pass the result of createWordCounter(...) to the createAnchorWrap.

Related

How to ignore any element in Javascript? [duplicate]

<div class="title">
I am text node
<a class="edit">Edit</a>
</div>
I wish to get the "I am text node", do not wish to remove the "edit" tag, and need a cross browser solution.

// Keep only the text-node contents of ".title" (the "edit" anchor is an
// element node, so it is filtered out) and read their combined text.
var text = $(".title").contents().filter(function() {
return this.nodeType == Node.TEXT_NODE;
}).text();
This gets the contents of the selected element, and applies a filter function to it. The filter function returns only text nodes (i.e. those nodes with nodeType == Node.TEXT_NODE).

You can get the nodeValue of the first childNode using
// Value of the first child node of the first ".title" match.
// NOTE(review): assumes that first child is the wanted text node — confirm against the markup.
$('.title')[0].childNodes[0].nodeValue
http://jsfiddle.net/TU4FB/

Another native JS solution that can be useful for "complex" or deeply nested elements is to use NodeIterator. Put NodeFilter.SHOW_TEXT as the second argument ("whatToShow"), and iterate over just the text node children of the element.
// Visit only the text nodes beneath the first <p>.
var root = document.querySelector('p');
var iter = document.createNodeIterator(root, NodeFilter.SHOW_TEXT);
var textnode;
// print all text nodes
for (textnode = iter.nextNode(); textnode; textnode = iter.nextNode()) {
    console.log(textnode.textContent);
}
<p>
<br>some text<br>123
</p>
You can also use TreeWalker. The difference between the two is that NodeIterator is a simple linear iterator, while TreeWalker allows you to navigate via siblings and ancestors as well.

ES6 version that returns the content of the first #text node:
/**
 * Returns the trimmed content of the first text node among `node`'s direct
 * children, or undefined when there is no text-node child.
 */
const extract = (node) => {
    for (const child of [...node.childNodes]) {
        if (child.nodeType === Node.TEXT_NODE) {
            return child.textContent.trim();
        }
    }
    return undefined;
};

If you mean get the value of the first text node in the element, this code will work:
// Find the value of the first text-node child of #MyDiv ("" when there is none).
var oDiv = document.getElementById("MyDiv");
var firstText = "";
var i = 0;
var curNode;
while (i < oDiv.childNodes.length) {
    curNode = oDiv.childNodes[i];
    if (curNode.nodeName === "#text") {
        firstText = curNode.nodeValue;
        break;
    }
    i++;
}
You can see this in action here: http://jsfiddle.net/ZkjZJ/

Pure JavaScript: Minimalist
First off, always keep this in mind when looking for text in the DOM.
MDN - Whitespace in the DOM
This issue will make you pay attention to the structure of your XML / HTML.
In this pure JavaScript example, I account for the possibility of multiple text nodes that could be interleaved with other kinds of nodes. However, initially, I do not pass judgment on whitespace, leaving that filtering task to other code.
In this version, I pass a NodeList in from the calling / client code.
/**
 * Returns the value of the child at 1-based position `target`, provided that
 * child is a text node. Minimalist. Non-robust. Pre-test loop version.
 * No string filtering or conditioning.
 *
 * @author Anthony Rutledge
 * @param nodeList The child nodes of a Node, as in node.childNodes.
 * @param target A positive whole number >= 1
 * @return String The text you targeted, or null when that position is out of
 *         range or does not hold a text node.
 */
function getText(nodeList, target)
{
    var wanted = target - 1;
    var count = nodeList.length; // Because you may have many child nodes.
    var idx = 0;
    while (idx < count) {
        var candidate = nodeList[idx];
        if (candidate.nodeType === Node.TEXT_NODE && idx === wanted) {
            return candidate.nodeValue; // Done! No need to keep going.
        }
        idx += 1;
    }
    return null;
}
Of course, by testing node.hasChildNodes() first, there would be no need to use a pre-test for loop.
/**
 * Returns the value of the child at 1-based position `target`, provided that
 * child is a text node. Minimalist. Post-test loop version.
 * No string filtering or conditioning.
 *
 * @author Anthony Rutledge
 * @param nodeList The child nodes of a Node, as in node.childNodes.
 * @param target A positive whole number >= 1
 * @return String The text you targeted, or null when the list is empty, the
 *         position is out of range, or it does not hold a text node.
 */
function getText(nodeList, target)
{
    var trueTarget = target - 1,
        length = nodeList.length,
        i = 0;
    // A post-test loop runs its body once before checking the bound, so an
    // empty list must be ruled out first or nodeList[0] is undefined.
    if (length === 0) {
        return null;
    }
    do {
        if ((nodeList[i].nodeType === Node.TEXT_NODE) && (i === trueTarget)) {
            return nodeList[i].nodeValue; // Done! No need to keep going.
        }
        i++;
    } while (i < length);
    return null;
}
Pure JavaScript: Robust
Here the function getTextById() uses two helper functions: getStringsFromChildren() and filterWhitespaceLines().
getStringsFromChildren()
/**
 * Collects strings from child text nodes.
 * Generic, cross platform solution. No string filtering or conditioning.
 *
 * @author Anthony Rutledge
 * @version 7.0
 * @param parentNode An instance of the Node interface, such as an Element object.
 * @return Array of the nodeValue strings of the direct text-node children, or
 *         null when there are no children or none of them is a text node.
 * @throws TypeError if the parentNode is not a Node object.
 */
function getStringsFromChildren(parentNode)
{
    var strings = [],
        nodeList,
        length,
        i = 0;
    // Parenthesized on purpose: `!parentNode instanceof Node` negates
    // parentNode first and therefore never detects a bad argument.
    if (!(parentNode instanceof Node)) {
        throw new TypeError("The parentNode parameter expects an instance of a Node.");
    }
    if (!parentNode.hasChildNodes()) {
        return null; // We are done. Node may resemble <element></element>
    }
    nodeList = parentNode.childNodes;
    length = nodeList.length;
    // Post-test loop is safe here: hasChildNodes() guarantees one child.
    do {
        if (nodeList[i].nodeType === Node.TEXT_NODE) {
            strings.push(nodeList[i].nodeValue);
        }
        i++;
    } while (i < length);
    if (strings.length > 0) {
        return strings;
    }
    return null;
}
filterWhitespaceLines()
/**
 * Filters an array of strings to remove whitespace lines.
 * Generic, cross platform solution.
 *
 * @author Anthony Rutledge
 * @version 6.0
 * @param textArray An Array of strings, e.g. the result of getStringsFromChildren().
 * @return Array of trimmed strings that are not lines of whitespace, or null
 *         when nothing is left.
 * @throws TypeError if the textArray param is not of type Array.
 */
function filterWhitespaceLines(textArray)
{
    var filteredArray = [],
        whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
    // `!textArray instanceof Array` negates textArray first and is always
    // false; Array.isArray performs the intended check.
    if (!Array.isArray(textArray)) {
        throw new TypeError("The textArray parameter expects an instance of a Array.");
    }
    for (var i = 0; i < textArray.length; i++) {
        if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
            filteredArray.push(textArray[i].trim()); // Trimming here is fine.
        }
    }
    if (filteredArray.length > 0) {
        return filteredArray; // Leave selecting and joining strings for a specific implementation.
    }
    return null; // No text to return.
}
getTextById()
/**
 * Gets strings from text nodes. Robust.
 * Generic, cross platform solution.
 *
 * Argument problems (non-string id, no element with that id) are reported
 * with console.log and turned into a null result; nothing is thrown to the
 * caller.
 *
 * @author Anthony Rutledge
 * @version 6.0
 * @param id A String associated with the id property of an Element.
 * @return Array of non-whitespace strings, or null when the element has no
 *         usable text or the id is invalid.
 */
function getTextById(id)
{
    var textArray = null;       // The hopeful output.
    var idDatatype = typeof id; // Only used in a TypeError message.
    var node;                   // The parent node being examined.
    try {
        if (idDatatype !== "string") {
            throw new TypeError("The id argument must be of type String! Got " + idDatatype);
        }
        node = document.getElementById(id);
        if (node === null) {
            throw new TypeError("No element found with the id: " + id);
        }
        textArray = getStringsFromChildren(node);
        if (textArray === null) {
            return null; // No text nodes found. Example: <element></element>
        }
        textArray = filterWhitespaceLines(textArray);
        // filterWhitespaceLines() yields null when only whitespace was found;
        // check for that before touching .length so that ordinary case does
        // not surface as a logged TypeError.
        if (textArray !== null && textArray.length > 0) {
            return textArray; // Leave selecting and joining strings for a specific implementation.
        }
    } catch (e) {
        console.log(e.message);
    }
    return null; // No text to return.
}
Next, the return value (Array, or null) is sent to the client code where it should be handled. Hopefully, the array should have string elements of real text, not lines of whitespace.
Empty strings ("") are not returned because you need a text node to properly indicate the presence of valid text. Returning ("") may give the false impression that a text node exists, leading someone to assume that they can alter the text by changing the value of .nodeValue. This is false, because a text node does not exist in the case of an empty string.
Example 1:
<p id="bio"></p> <!-- There is no text node here. Return null. -->
Example 2:
<p id="bio">
</p> <!-- There are at least two text nodes ("\n"), here. -->
The problem comes in when you want to make your HTML easy to read by spacing it out. Now, even though there is no human readable valid text, there are still text nodes with newline ("\n") characters in their .nodeValue properties.
Humans see examples one and two as functionally equivalent--empty elements waiting to be filled. The DOM is different than human reasoning. This is why the getStringsFromChildren() function must determine if text nodes exist and gather the .nodeValue values into an array.
// Excerpt (not standalone): nodeList, length and textNodes come from the
// surrounding function. Gathers the nodeValue of every text node in nodeList.
for (var i = 0; i < length; i++) {
if (nodeList[i].nodeType === Node.TEXT_NODE) {
textNodes.push(nodeList[i].nodeValue);
}
}
In example two, two text nodes do exist and getStringsFromChildren() will return the .nodeValue of both of them ("\n"). However, filterWhitespaceLines() uses a regular expression to filter out lines of pure whitespace characters.
Is returning null instead of newline ("\n") characters a form of lying to the client / calling code? In human terms, no. In DOM terms, yes. However, the issue here is getting text, not editing it. There is no human text to return to the calling code.
One can never know how many newline characters might appear in someone's HTML. Creating a counter that looks for the "second" newline character is unreliable. It might not exist.
Of course, further down the line, the issue of editing text in an empty <p></p> element with extra whitespace (example 2) might mean destroying (maybe, skipping) all but one text node between a paragraph's tags to ensure the element contains precisely what it is supposed to display.
Regardless, except for cases where you are doing something extraordinary, you will need a way to determine which text node's .nodeValue property has the true, human readable text that you want to edit. filterWhitespaceLines gets us half way there.
// Excerpt (not standalone): textArray is the input, filteredTextArray the
// output array being built.
var whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
// Iterate over the input. Bounding the loop by the output array's length
// would either never run (empty output) or never end (it grows on each push).
for (var i = 0; i < textArray.length; i++) {
    if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
        filteredTextArray.push(textArray[i].trim()); // Trimming here is fine.
    }
}
At this point you may have output that looks like this:
["Dealing with text nodes is fun.", "Some people just use jQuery."]
There is no guarantee that these two strings are adjacent to each other in the DOM, so joining them with .join() might make an unnatural composite. Instead, in the code that calls getTextById(), you need to choose which string you want to work with.
Test the output.
// Caller-side skeleton: branch on the three possible shapes of the result
// (null, exactly one string, several strings).
// NOTE(review): getTextById() as shown catches its own errors and logs them,
// so this outer catch looks purely defensive — confirm before relying on it.
try {
var strings = getTextById("bio");
if (strings === null) {
// Do something.
} else if (strings.length === 1) {
// Do something with strings[0]
} else { // Could be another else if
// Do something. It all depends on the context.
}
} catch (e) {
console.log(e.message);
}
One could add .trim() inside of getStringsFromChildren() to get rid of leading and trailing whitespace (or to turn a bunch of spaces into a zero length string (""), but how can you know a priori what every application may need to have happen to the text (string) once it is found? You don't, so leave that to a specific implementation, and let getStringsFromChildren() be generic.
There may be times when this level of specificity (the target and such) is not required. That is great. Use a simple solution in those cases. However, a generalized algorithm enables you to accommodate simple and complex situations.

.text() - for jquery
// Reads the element's own text by stripping child elements from a clone, so
// the elements on the page are left in place.
$('.title').clone() //clone the element
.children() //select all the children
.remove() //remove all the children
.end() //again go back to selected element
.text(); //get the text of element

This will ignore whitespace as well, so you never get the blank text nodes. Code using core JavaScript:
// Find the value of the first child text node of #MyDiv that is not empty
// or whitespace-only ("" when there is none).
var oDiv = document.getElementById("MyDiv");
var firstText = "";
// Declared with var (the original leaked an implicit global) and built once
// instead of on every iteration.
var whitespace = /^\s*$/;
for (var i = 0; i < oDiv.childNodes.length; i++) {
    var curNode = oDiv.childNodes[i];
    if (curNode.nodeName === "#text" && !(whitespace.test(curNode.nodeValue))) {
        firstText = curNode.nodeValue;
        break;
    }
}
Check it on jsfiddle : - http://jsfiddle.net/webx/ZhLep/

Simply via Vanilla JavaScript:
// Trimmed text content of the first child node of the first ".title" element.
// NOTE(review): throws if nothing matches ".title" or the element has no
// children, and the first child is not necessarily a text node — confirm the markup.
const el = document.querySelector('.title');
const text = el.firstChild.textContent.trim();

You can also use XPath's text() node test to get the text nodes only. For example
// Concatenate the data of every node selected by the XPath 'text()' test,
// evaluated relative to the first div.title.
var target = document.querySelector('div.title');
var iter = document.evaluate('text()', target, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE);
var node;
var want = '';
for (node = iter.iterateNext(); node; node = iter.iterateNext()) {
    want += node.data;
}

There are some overcomplicated solutions here but the operation is as straightforward as using .childNodes to get children of all node types and .filter to extract e.nodeType === Node.TEXT_NODEs. Optionally, we may want to do it recursively and/or ignore "empty" text nodes (all whitespace).
These examples convert the nodes to their text content for display purposes, but this is technically a separate step from filtering.
/** All text nodes that are direct children of `el`. */
const immediateTextNodes = (el) => {
    const kids = [...el.childNodes];
    return kids.filter((kid) => kid.nodeType === Node.TEXT_NODE);
};

/** Direct-child text nodes whose content is not empty or whitespace-only. */
const immediateNonEmptyTextNodes = (el) => {
    const kids = [...el.childNodes];
    return kids.filter((kid) => kid.nodeType === Node.TEXT_NODE && kid.textContent.trim());
};

/** First direct-child text node of `el`, or undefined when there is none. */
const firstImmediateTextNode = (el) => {
    const kids = [...el.childNodes];
    return kids.find((kid) => kid.nodeType === Node.TEXT_NODE);
};

/** First direct-child text node with non-whitespace content, or undefined. */
const firstImmediateNonEmptyTextNode = (el) => {
    const kids = [...el.childNodes];
    return kids.find((kid) => kid.nodeType === Node.TEXT_NODE && kid.textContent.trim());
};
// example usage:
// Runs each helper against the first <p> and logs the text content of the
// node(s) it returns.
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(immediateTextNodes(p).map(text));
console.log(immediateNonEmptyTextNodes(p).map(text));
console.log(text(firstImmediateTextNode(p)));
console.log(text(firstImmediateNonEmptyTextNode(p)));
// if you want to trim whitespace:
console.log(immediateNonEmptyTextNodes(p).map(e => text(e).trim()));
<p>
<span>IGNORE</span>
<b>IGNORE</b>
foo
<br>
bar
</p>
Recursive alternative to a NodeIterator:
/** Every text node beneath `el`, at any depth, in document order. */
const deepTextNodes = (el) => {
    const found = [];
    for (const kid of [...el.childNodes]) {
        if (kid.nodeType === Node.TEXT_NODE) {
            found.push(kid);
        } else {
            found.push(...deepTextNodes(kid));
        }
    }
    return found;
};

/** Like deepTextNodes, but skips text nodes that are empty or whitespace-only. */
const deepNonEmptyTextNodes = (el) => {
    const found = [];
    for (const kid of [...el.childNodes]) {
        if (kid.nodeType === Node.TEXT_NODE && kid.textContent.trim()) {
            found.push(kid);
        } else {
            // Also reached for blank text nodes, which have no children and
            // therefore contribute nothing.
            found.push(...deepNonEmptyTextNodes(kid));
        }
    }
    return found;
};
// example usage:
// Logs the text content of every (non-empty) text node found at any depth
// under the first <p>.
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(deepTextNodes(p).map(text));
console.log(deepNonEmptyTextNodes(p).map(text));
<p>
foo
<span>bar</span>
baz
<span><b>quux</b></span>
</p>
Finally, feel free to join the text node array into a string if you wish using .join(""). But as with trimming and text content extraction, I'd probably not bake this into the core filtering function and leave it to the caller to handle as needed.

Compare the 2nd strings characters with the 1st string in an array

I'm trying to figure out a challenge in Free Code Camp which states the following:
Return true if the string in the first element of the array contains all of the letters of the string in the second element of the array.
I understand how to do this if the 2nd string has a single character or if the 1st string has the 2nd string contained in the exact same sequence (e.g. "hello", "hel" and not "hello", "olleh"). But I can't figure out yet the correct approach to tackle this challenge.
Here is my code...
// Attempt from the question: true only when the second string occurs inside
// the first as one contiguous, same-order run (case-insensitive). Removes the
// second element from `arr` as a side effect of splice().
function mutation(arr) {
    var needle = arr.splice(1).toString().toLowerCase();
    var haystack = arr.toString().toLowerCase();
    return haystack.search(needle) != -1;
}
Any combination which has a different sequence of the characters evaluates to false.
// e.g this is false
mutation(['Alien', 'line'])
What is the right way to complete this task?

Thanks to @Bergi I figured out the answer. He was also kind enough to allow me to post the answer myself. Here it is...
/**
 * Returns true if the first string in `arr` contains every letter of the
 * second string (case-insensitive, order and repetition ignored).
 *
 * @param {Array} arr - two strings: [text to search in, letters to look for]
 * @return {Boolean} true when no letter of the second string is missing
 */
function mutation(arr) {
    // slice() copies. The original splice(1) removed the second element from
    // the caller's array as a side effect.
    var string2 = arr.slice(1).toString().toLowerCase();
    var string1 = arr.slice(0, 1).toString().toLowerCase();
    for (var i = 0; i < string2.length; i++) {
        if (string1.indexOf(string2.charAt(i)) == -1) {
            return false;
        }
    }
    return true;
}
If someone like me (JS beginner) encounters this task and finds this solution, here are some notable resources to read through if you do not know the methods used here..
.splice()
indexOf()
.slice()
Difference between .slice() and .splice() methods

You could also do this:
/**
 * Match function that operates on a data array with two elements, where the
 * first element is the query and the second element is the searchable data.
 *
 * Returns true if every character of the query string occurs somewhere in
 * the searchable data string (comparison is exact, so case matters).
 *
 * @param {Array} data - contains query and searchable string data
 *
 * @return {Boolean} if a match occurred
 */
var match = function (data) {
    // Convert strings to arrays.
    var wanted = Array.prototype.slice.call(data[0]);
    var available = Array.prototype.slice.call(data[1]);
    // A single missing query character means there is no match.
    for (var i = 0; i < wanted.length; i++) {
        if (available.indexOf(wanted[i]) === -1) {
            return false;
        }
    }
    return true;
};
match([ 'abc', 'xyzadefbhijc' ]); // returns true
match([ 'abq', 'xyzadefbhijc' ]); // returns false

My mutation
// True when every character of arr[1] appears somewhere in arr[0],
// ignoring case. The input array is left untouched.
function mutation(arr) {
    var haystack = arr[0].toLowerCase();
    var letters = arr[1].toLowerCase().split('');
    return letters.every(function (letter) {
        return haystack.indexOf(letter) != -1;
    });
}

The same can be done using map:
// True when no character of arr[1] is absent from arr[0] (case-insensitive).
function mutation(arr) {
    var haystack = arr[0].toLowerCase();
    var needles = arr[1].toLowerCase().split('');
    var missing = needles.filter(function (ch) {
        return haystack.indexOf(ch) === -1;
    });
    return missing.length === 0;
}

// True when each character of arr[1] can be found in arr[0] (case-insensitive).
function mutation(arr) {
    var src = arr[0].toLowerCase();
    var dist = arr[1].toLowerCase();
    var remaining = dist.length;
    // Order of checking does not matter, so walk from the end.
    while (remaining--) {
        if (src.indexOf(dist.charAt(remaining)) === -1) {
            return false;
        }
    }
    return true;
}
console.log(mutation(["voonoo", "no"]))

Replace text in the middle of a TextNode with an element

I want to insert html tags within a text node with TreeWalker, but TreeWalker forces my html brackets into &lt; and &gt; no matter what I've tried. Here is the code:
// Question's attempt: visit every text node under <body> and try to inject
// markup by rewriting the node's value. The assigned string ends up as the
// node's literal text, which is the escaping problem described above.
var text;
var tree = document.createTreeWalker(document.body,NodeFilter.SHOW_TEXT);
while (tree.nextNode()) {
text = tree.currentNode.nodeValue;
// Wrap each word that follows a non-word character...
text = text.replace(/(\W)(\w+)/g, '$1<element onmouseover="sendWord(\'$2\')">$2</element>');
// ...and a word at the very start of the text.
text = text.replace(/^(\w+)/, '<element onmouseover="sendWord(\'$1\')">$1</element>');
tree.currentNode.nodeValue = text;
}
Using \< or " instead of ' won't help. My workaround is to copy all of the DOM tree to a string and to replace the html body with that. It works on very simple webpages and solves my first problem, but is a bad hack and won't work on anything more than a trivial page. I was wondering if I could just work straight with the text node rather than use a workaround. Here is the code for the (currently buggy) workaround:
// Question's workaround: walk all nodes under <body>, concatenate their
// nodeValue (with words in text nodes wrapped in markup) into one string,
// then replace the body's HTML with it.
// NOTE(review): only nodeValue is appended, so the tags of existing elements
// are presumably not carried over — likely the "currently buggy" part; confirm.
var text;
var newHTML = "";
var tree = document.createTreeWalker(document.body);
while (tree.nextNode()) {
text = tree.currentNode.nodeValue;
if (tree.currentNode.nodeType == 3){
text = text.replace(/(\W)(\w+)/g, '$1<element onmouseover="sendWord(\'$2\')">$2</element>');
text = text.replace(/^(\w+)/, '<element onmouseover="sendWord(\'$1\')">$1</element>');
}
newHTML += text
}
document.body.innerHTML = newHTML;
Edit: I realize a better workaround would be to custom tag the text nodes ((Customtag_Start_Here) etc.), copy the whole DOM to a string, and use my customs tags to identify text nodes and modify them that way. But if I don't have to, I'd rather not.

To 'change' a text node into an element, you must replace it with an element. For example:
// Swap the walker's current text node for a freshly created element.
// ('tree' is the TreeWalker from the question's code.)
var text = tree.currentNode;
var el = document.createElement('foo');
el.setAttribute('bar','yes');
text.parentNode.replaceChild( el, text );
If you want to retain part of the text node, and inject an element "in the middle", you need to create another text node and insert it and the element into the tree at the appropriate places in the tree.
Edit: Here's a function that might be super useful to you. :)
Given a text node, it runs a regex on the text values. For each hit that it finds it calls a custom function that you supply. If that function returns a string, then the match is replaced. However, if that function returns an object like:
{ name:"element", attrs:{onmouseover:"sendWord('foo')"}, content:"foo" }
then it will split the text node around the match and inject an element in that location. You can also return an array of strings or those objects (and can recursively use arrays, strings, or objects as the content property).
Demo: http://jsfiddle.net/DpqGH/8/
/*
textNodeReplace: runs a regex over a text node's value and replaces each hit
with the node(s) described by the handler's return value.
Arguments:
node (Text): the text node to process. Its value is cut off at the first hit;
  replacement nodes and the remaining text are inserted after it.
regex (RegExp): pattern to look for. With the global flag the search is
  repeated on the remaining text after each hit; otherwise only the first hit
  is handled.
handler (function): called as handler.apply(this, hits), i.e. it receives the
  full match and then each capture group as separate arguments (not the match
  array). May return a string, an object {name, attrs, content, namespaceURI},
  or an array of those.
NOTE(review): a global regex that can match the empty string looks like it
would loop forever here, since the "remaining text" is then the whole text
again — confirm before passing such a pattern.
*/
function textNodeReplace(node,regex,handler) {
var mom=node.parentNode, nxt=node.nextSibling,
doc=node.ownerDocument, hits;
if (regex.global) {
while(node && (hits=regex.exec(node.nodeValue))){
// The next exec runs against a different (new) text node, so restart at 0.
regex.lastIndex = 0;
node=handleResult( node, hits, handler.apply(this,hits) );
}
} else if (hits=regex.exec(node.nodeValue))
handleResult( node, hits, handler.apply(this,hits) );
// Truncates 'node' at the hit, inserts the replacement node(s) before 'nxt',
// then inserts and returns a text node holding the text after the hit (or
// returns '' when nothing is left, which ends the loop above).
function handleResult(node,hits,results){
var orig = node.nodeValue;
node.nodeValue = orig.slice(0,hits.index);
[].concat(create(mom,results)).forEach(function(n){
mom.insertBefore(n,nxt);
});
var rest = orig.slice(hits.index+hits[0].length);
return rest && mom.insertBefore(doc.createTextNode(rest),nxt);
}
// Turns a handler result into DOM nodes: arrays are mapped item by item,
// objects become elements (in the given or the parent's namespace) with their
// attrs and content applied, anything else becomes a text node.
function create(el,o){
if (o.map) return o.map(function(v){ return create(el,v) });
else if (typeof o==='object') {
var e = doc.createElementNS(o.namespaceURI || el.namespaceURI,o.name);
if (o.attrs) for (var a in o.attrs) e.setAttribute(a,o.attrs[a]);
if (o.content) [].concat(create(e,o.content)).forEach(e.appendChild,e);
return e;
} else return doc.createTextNode(o+"");
}
}
It's not quite perfectly generic, as it does not support namespaces on attributes. But hopefully it's enough to get you going. :)
You would use it like so:
// Wrap every word in every text node under <body> in an <element>.
// The function defined above is named textNodeReplace, and it calls the
// handler with the matched text itself as the first argument (the match is
// spread via apply), so the handler takes the word directly.
findAllTextNodes(document.body).forEach(function(textNode){
    textNodeReplace( textNode, /\b\w+/g, function(word){
        return {
            name:'element',
            attrs:{onmouseover:"sendWord('"+word+"')"},
            content:word
        };
    });
});
/*
findAllTextNodes: collects the text nodes beneath 'node', in the order the
TreeWalker visits them, skipping any whose parent is a SCRIPT element.
Returns an array of text nodes.
*/
function findAllTextNodes(node){
    var walker = node.ownerDocument.createTreeWalker(node,NodeFilter.SHOW_TEXT);
    var collected = [];
    for (var found = walker.nextNode(); found; found = walker.nextNode()) {
        var current = walker.currentNode;
        if (current.parentNode.tagName != 'SCRIPT') {
            collected.push(current);
        }
    }
    return collected;
}
or if you want something closer to your original regex:
// The function defined above is named textNodeReplace. Its handler receives
// the full match and the capture groups as separate arguments, so name them
// instead of indexing into the first argument.
textNodeReplace( textNode, /(^|\W)(\w+)/g, function(whole, lead, word){
    return [
        lead, // might be an empty string
        {
            name:'element',
            attrs:{onmouseover:"sendWord('"+word+"')"},
            content:word
        }
    ];
});

Function that returns the parent element of any text node including partial match of passed string:
/**
 * Searches the subtree under `mainNode` for a text node whose content includes
 * `text` and returns that text node's parent element. Children are visited
 * last-to-first, depth-first; the first hit in that order wins.
 *
 * @param {string} text - substring to look for
 * @param {Node} mainNode - root of the subtree to search
 * @return {Element|null} parent element of the matching text node, or null
 */
function findElByText(text, mainNode) {
    let match = null;
    const visit = function (parent) {
        if (match) {
            return;
        }
        const kids = parent.childNodes;
        for (let idx = kids.length - 1; idx >= 0; idx--) {
            if (match) {
                break;
            }
            const kid = kids[idx];
            const kind = kid.nodeType;
            if (kind == 3) {
                // Text node: check whether it contains the string.
                if (kid.textContent.includes(text)) {
                    match = kid.parentElement;
                    break;
                }
            } else if (kind == 1 || kind == 9 || kind == 11) {
                // Element, document or fragment: search inside it.
                visit(kid);
            }
        }
    };
    visit(mainNode);
    return match;
}
Usage:
findElByText('Some string in document', document.body);

Find and replace specific text characters across a document with JS

I'm wondering if there is a lightweight way I could use JavaScript or jQuery to sniff out a specific text character across a document; say € and find all instances of this character. And then! Write an ability to replace all instances of this with say a $.
I found this snippet for starters:
// The numeric entity &#39; had been decoded to a bare apostrophe, which left an
// unterminated string literal and a regex that replaced "'" with itself.
// Restored: turn every literal "&#39;" in the string into an apostrophe.
let str = 'test: &#39;';
str = str.replace(/&#39;/g, "'");
Essentially; I am wanting a solution for a one page document. Grab all instances of X and make it XY. Only text characters.

How about this, replacing # with $:
// For each direct child of <body>, serialise its markup, turn every "#" into
// "$" and write the result back. Because the replacement runs on the markup
// string, "#" characters inside tags and attribute values are rewritten too.
$("body").children().each(function () {
  var $child = $(this);
  var markup = $child.html();
  $child.html(markup.replace(/#/g, "$"));
});
http://jsfiddle.net/maximua/jp96C/1/

ECMAScript 2015+ approach
Pitfalls when solving this task
This seems like an easy task, but you have to take care of several things:
Simply replacing the entire HTML (e.g. using innerHTML) causes the affected subtree of the DOM to be entirely deleted and replaced, however event listeners are attached to the existing, now deleted elements, so they’re deleted with them. Similarly, WeakMap entries for the existing elements will all be deleted. This is because all of these things need the exact references to the elements or nodes; a replaced innerHTML will create entirely new references and discard the old ones.
Replacing the HTML may also replace <script> or <style> contents, or HTML tag or attribute names, which is not always desired.
Changing the HTML may result in an xss attack.
You may want to replace attribute values, e.g. for title and alt, in a controlled manner as well, but those all-or-nothing approaches as well as regexes are ill-equipped to do so.
Guarding against xss attacks generally can’t be solved by using the approaches below. E.g. if a fetch call reads a URL from somewhere on the page, then sends a request to that URL, the functions below won’t stop that, since this scenario is inherently unsafe.
Replacing the text contents of all elements
This basically selects all elements that contain normal text, goes through their child nodes — among those are also text nodes —, seeks those text nodes out and replaces their contents.
You can optionally specify a different root target, e.g. replaceOnDocument(/€/g, "$", { target: someElement });; by default, the <body> is chosen.
/**
 * Replace `pattern` with `string` in the text nodes that are direct children
 * of `target` or of any descendant element other than <script>, <noscript>
 * and <style>.
 *
 * @param {RegExp|string} pattern - First argument for String.prototype.replace.
 * @param {string} string - Replacement text.
 * @param {{target?: Element}} [options] - `target` is the root to process;
 *   defaults to document.body.
 * @returns {undefined}
 */
const replaceOnDocument = (pattern, string, {target = document.body} = {}) => {
  // Handle `string` — see the last section
  const hosts = [
    target,
    ...target.querySelectorAll("*:not(script):not(noscript):not(style)"),
  ];
  for (const host of hosts) {
    // Work on a snapshot of the child list rather than the live collection.
    for (const child of [...host.childNodes]) {
      if (child.nodeType !== Node.TEXT_NODE) {
        continue;
      }
      child.textContent = child.textContent.replace(pattern, string);
    }
  }
};
replaceOnDocument(/€/g, "$");
Replacing text nodes, element attributes and properties
Now, this is a little more complex: you need to check three cases: whether a node is a text node, whether it’s an element and its attribute should be replaced, or whether it’s an element and its property should be replaced. A replacer object provides methods for text nodes and for elements.
Before replacing attributes and properties, the replacer needs to check whether the element has a matching attribute; otherwise new attributes get created, undesirably. It also needs to check whether the targeted property is a string, since only strings can be replaced, or whether the matching property to the targeted attribute is not a function, since this may lead to an xss attack.
In the example below, you can see how to use the extended features: in the optional third argument, you may add an attrs property and a props property, which is an iterable (e.g. an array) each, for the attributes to be replaced and the properties to be replaced, respectively.
You’ll also notice that this snippet uses flatMap. If that’s not supported, use a polyfill or replace it by the reduce–concat, or map–reduce–concat construct, as seen in the linked documentation.
/**
 * Replace `pattern` with `string` in text nodes and, on request, in selected
 * attributes and string-valued properties of elements under `target`.
 *
 * @param {RegExp|string} pattern - First argument for String.prototype.replace.
 * @param {string} string - Replacement text.
 * @param {object} [options]
 * @param {Element} [options.target=document.body] - Root to process.
 * @param {Iterable<string>} [options.attrs=[]] - Attribute names to rewrite.
 * @param {Iterable<string>} [options.props=[]] - Property names to rewrite.
 * @returns {undefined}
 */
const replaceOnDocument = (() => {
  // Text nodes: rewrite the content directly.
  const rewriteText = (node, pattern, string) => {
    node.textContent = node.textContent.replace(pattern, string);
  };

  // Elements: rewrite only attributes/properties that already exist on the
  // element, so that nothing new is created.
  const rewriteElement = (node, pattern, string, {attrs, props} = {}) => {
    for (const attr of attrs) {
      // Skip names whose matching property holds a function (event handlers).
      const holdsFunction = typeof node[attr] === "function";
      if (!holdsFunction && node.hasAttribute(attr)) {
        node.setAttribute(attr, node.getAttribute(attr).replace(pattern, string));
      }
    }
    for (const prop of props) {
      if (typeof node[prop] === "string" && node.hasAttribute(prop)) {
        node[prop] = node[prop].replace(pattern, string);
      }
    }
  };

  // Dispatch table keyed by nodeType; other node types are ignored.
  const replacer = {
    [Node.TEXT_NODE]: rewriteText,
    [Node.ELEMENT_NODE]: rewriteElement,
  };

  return (pattern, string, {target = document.body, attrs: [...attrs] = [], props: [...props] = []} = {}) => {
    // Handle `string` — see the last section
    const hosts = [
      target,
      ...target.querySelectorAll("*:not(script):not(noscript):not(style)"),
    ];
    const candidates = [
      target,
      ...hosts.flatMap(({childNodes}) => [...childNodes]),
    ];
    for (const node of candidates) {
      if (!replacer.hasOwnProperty(node.nodeType)) {
        continue;
      }
      replacer[node.nodeType](node, pattern, string, {attrs, props});
    }
  };
})();
replaceOnDocument(/€/g, "$", {
attrs: [
"title",
"alt",
"onerror" // This will be ignored
],
props: [
"value" // Changing an `<input>`’s `value` attribute won’t change its current value, so the property needs to be accessed here
]
});
Replacing with HTML entities
If you need to make it work with HTML entities like &shy;, the above approaches will just literally produce the string &shy;, since that’s an HTML entity and will only work when assigning .innerHTML or using related methods.
So let’s solve it by passing the input string to something that accepts an HTML string: a new, temporary HTMLDocument. This is created by the DOMParser’s parseFromString method; in the end we read its documentElement’s textContent:
string = new DOMParser().parseFromString(string, "text/html").documentElement.textContent;
If you want to use this, choose one of the approaches above, depending on whether or not you want to replace HTML attributes and DOM properties in addition to text; then simply replace the comment // Handle `string` — see the last section by the above line.
Now you can use replaceOnDocument(/Güterzug/g, "G&uuml;ter&shy;zug");.
NB: If you don’t use the string handling code, you may also remove the { } around the arrow function body.
Note that this parses HTML entities but still disallows inserting actual HTML tags, since we’re reading only the textContent. This is also safe against most cases of xss: since we’re using parseFromString and the page’s document isn’t affected, no <script> gets downloaded and no onerror handler gets executed.
You should also consider using \xAD instead of &shy; directly in your JavaScript string, if it turns out to be simpler.

My own suggestion is as follows:
/**
 * Collect the text nodes that are direct children of <body> or of any element
 * inside it.
 *
 * Every child of every element is inspected. The previous version looked only
 * at childNodes[0], so text that followed an element sibling (for example the
 * "tail" in `<p><b>x</b> tail</p>`) was never returned.
 *
 * @returns {Node[]} Text nodes grouped by owning element, in the order
 *   querySelectorAll returns the elements.
 */
function nativeSelector() {
  const TEXT_NODE = 3;
  const elements = document.querySelectorAll("body, body *");
  const results = [];
  for (let i = 0; i < elements.length; i++) {
    const children = elements[i].childNodes;
    for (let j = 0; j < children.length; j++) {
      if (children[j].nodeType === TEXT_NODE) {
        results.push(children[j]);
      }
    }
  }
  return results;
}
// Replace every pound sign with a euro sign in each text node returned by
// nativeSelector().
var textnodes = nativeSelector();
var _nv;
textnodes.forEach(function (textNode) {
  _nv = textNode.nodeValue;
  textNode.nodeValue = _nv.replace(/£/g, '€');
});
JS Fiddle demo.
The nativeSelector() function comes from an answer (posted by Anurag) to this question: getElementsByTagName() equivalent for textNodes.

I think you may be overthinking this.
My approach is simple.
Enclose your page in a div tag:
<div id="mydiv">
<!-- you page here -->
</div>
In your javascript:
// Read the wrapper's markup, swap every "this" for "that" in the markup
// string, and write the result back as the wrapper's new content.
var container = document.getElementById('mydiv');
var html = container.innerHTML.replace(/this/g, "that");
container.innerHTML = html;

Similar to @max-malik's answer, but without using jQuery, you can also do this using document.createTreeWalker:
// When the button is clicked, walk every text node under <body> and replace
// each "#" in its content with "$".
button.addEventListener('click', () => {
  const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
  for (let textNode = walker.nextNode(); textNode !== null; textNode = walker.nextNode()) {
    textNode.textContent = textNode.textContent.replace(/#/g, '$');
  }
});
<div>This is an # that we are # replacing.</div>
<div>This is another # that we are replacing.</div>
<div>
<span>This is an # in a span in # div.</span>
</div>
<br>
<input id="button" type="button" value="Replace # with $" />

Vanilla JavaScript solution:
document.body.innerHTML = document.body.innerHTML.replace(/Original/g, "New")

The best approach would be to do this server-side, or to wrap the currency symbols in an element you can select before returning the page to the browser. However, if neither is an option, you can select all text nodes within the body and do the replacement on them. Below I'm doing this using a plugin I wrote 2 years ago that was meant for highlighting text. What I'm doing is finding all occurrences of € and wrapping each one in a span with the class currency-symbol; then I'm replacing the text of those spans.
Demo
(function($){
    // Private-use code points that temporarily mark where the highlight tags
    // go, so the surrounding text can be HTML-escaped before the tags are
    // inserted. Text that already contains these code points is not supported.
    var OPEN_MARK = "\uE000",
        CLOSE_MARK = "\uE001";

    // Escape the characters that are significant when a string is parsed as
    // HTML content.
    function escapeHtml( text ) {
        return text
            .replace( /&/g, "&amp;" )
            .replace( /</g, "&lt;" )
            .replace( />/g, "&gt;" );
    }

    /**
     * Wrap matches found in the text nodes of the matched elements (and their
     * descendants) in a highlight element.
     *
     * highlightText( pattern [, highlightClass | wholeWord [, wholeWord ]] )
     *
     * - pattern: a RegExp (its first capture group is what gets wrapped), a
     *   string, or an array of strings joined as alternatives. Strings are
     *   used as regular-expression source, not escaped.
     * - second argument: a string is used as the highlight class; a boolean
     *   is taken as the whole-word flag and the class defaults to "highlight".
     * - third argument: truthy requests whole-word matching.
     * The whole-word flag is ignored when pattern is a RegExp.
     *
     * Returns the jQuery set for chaining.
     */
    $.fn.highlightText = function () {
        var re,
            hClass,
            reStr,
            wholeWord = false,
            argType = $.type(arguments[0]),
            defaultTagName = $.fn.highlightText.defaultTagName,
            openTag,
            closeTag;

        if ( argType === "regexp" ) {
            // first argument is a regular expression
            re = arguments[0];
        }
        else if ( argType === "array" ) {
            // first argument is an array, generate
            // regular expression string for later use
            reStr = arguments[0].join("|");
        }
        else if ( argType === "string" ) {
            // store string in regular expression string
            // for later use
            reStr = arguments[0];
        }
        else {
            // The first argument is required; do nothing, but keep the call
            // chainable (previously this returned undefined).
            return this;
        }

        argType = $.type(arguments[1]);
        if ( argType === "string" ) {
            hClass = arguments[1];
        }
        else {
            hClass = "highlight";
            if ( argType === "boolean" ) {
                // Only `true` requests whole-word matching; previously any
                // boolean, including `false`, did.
                wholeWord = arguments[1];
            }
        }
        if ( arguments[2] ) {
            wholeWord = true;
        }

        if ( wholeWord && reStr ) {
            // Group the alternatives so that \b applies to each of them, not
            // just to the first and last ("\ba|b\b" means "\ba" or "b\b").
            reStr = "\\b(?:" + reStr + ")\\b";
        }

        // if re is not defined ( which means either an array or
        // string was passed as the first parameter ) create the
        // regular expression.
        if (!re) {
            re = new RegExp( "(" + reStr + ")", "ig" );
        }

        openTag = "<" + defaultTagName + " class='" + hClass + "'>";
        closeTag = "</" + defaultTagName + ">";

        // iterate through each matched element
        return this.each( function() {
            // select this element and all of its descendants; .andSelf() was
            // removed in jQuery 3, .addBack() exists since 1.8
            var $scope = $( this ).find( "*" );
            $scope = $scope.addBack ? $scope.addBack() : $scope.andSelf();

            $scope.contents()
            // filter to only text nodes that aren't already highlighted
            .filter( function () {
                return this.nodeType === 3 && $( this ).closest( "." + hClass ).length === 0;
            })
            // loop through each text node
            .each( function () {
                var raw = this.nodeValue,
                    marked = raw.replace( re, OPEN_MARK + "$1" + CLOSE_MARK ),
                    output;
                if ( marked !== raw ) {
                    // Escape the text first, then turn the markers into tags,
                    // so "<" or "&" in the page text is not parsed as markup.
                    output = escapeHtml( marked )
                        .split( OPEN_MARK ).join( openTag )
                        .split( CLOSE_MARK ).join( closeTag );
                    // Parse inside a temporary <p>, then unwrap so the new
                    // nodes take the place of the original text node.
                    $( this ).wrap( "<p></p>" ).parent()
                        .html( output ).contents().unwrap();
                }
            });
        });
    };
    $.fn.highlightText.defaultTagName = "span";
})( jQuery );
$("body").highlightText("€","currency-symbol");
$("span.currency-symbol").text("$");

Use split and join method
// When #idBut is clicked, rewrite the markup of each direct child of <body>,
// splitting it on "#" and re-joining the pieces with "$".
$("#idBut").click(function () {
  $("body").children().each(function () {
    var $child = $(this);
    var pieces = $child.html().split('#');
    $child.html(pieces.join("$"));
  });
});
here is solution

In javascript without using jquery:
document.body.innerText = document.body.innerText.replace('actualword', 'replacementword');

You can use:
str.replace(/text/g, "replaced text");

For each element inside document body modify their text using .text(fn) function.
// Replace every "x" with "xy" in the text nodes owned by elements inside
// <body>. Only text nodes are edited: calling .text(fn) on each element
// replaced the element's whole content with flat text, which removed nested
// elements, and String.replace with a string pattern changed only the first
// "x". <script> and <style> are skipped so code and CSS are left untouched.
$("body *")
  .not("script, style")
  .contents()
  .filter(function () {
    return this.nodeType === 3; // text nodes only
  })
  .each(function () {
    this.nodeValue = this.nodeValue.split("x").join("xy");
  });

As you'll be using jQuery anyway, try:
https://github.com/cowboy/jquery-replacetext
Then just do
$("p").replaceText("£", "$")
It seems to do good job of only replacing text and not messing with other elements

str.replace(/replacetext/g,'actualtext')
This replaces all instances of replacetext with actualtext

Here is something that might help someone looking for this answer:
The following uses jquery it searches the whole document and only replaces the text.
for example if we had
overpopulation
and we wanted to add a span with the class overpop around the word overpopulation
<span class="overpop">overpopulation</span>
we would run the following
// Wrap each occurrence of the keyword in <span class="overpop">, matching
// case-insensitively. The filter keeps only elements with no descendant that
// also contains the keyword, so the markup rewrite happens at the innermost
// level. (The original referenced an undefined `str` and compared the markup
// against the string 'undefined'.)
var keyword = 'overpopulation';
var keywordSelector = "*:containsIN('" + keyword + "')";
// Escape regex metacharacters so the keyword is matched literally.
var keywordPattern = new RegExp('(' + keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + ')', 'gi');
$(keywordSelector).filter(function () {
  return $(this).find(keywordSelector).length === 0;
}).html(function (_, html) {
  return html.replace(keywordPattern, '<span class="overpop">$1</span>');
});
the search is case insensitive searches the whole document and only replaces the text portions in this case we are searching for the string 'overpopulation'
// Register the custom :containsIN pseudo-selector: true when the element's
// text (textContent, else innerText) contains the selector argument
// (match[3]), compared in lower case.
$.expr[":"].containsIN = function (elem, i, match, array) {
  var haystack = (elem.textContent || elem.innerText || "").toLowerCase();
  var needle = (match[3] || "").toLowerCase();
  return haystack.indexOf(needle) >= 0;
};

How to stop DOM searching loop while the first match is found?

I've modified a DOM search/replace script so that it replaces multiple matching keywords in a document with links.
It was working great without any <div> or <p>, but with a more complex structure the keywords are replaced in each node...
This is an example
As you can see, the same keyword is not linked several times within one element, but when there are other elements the keywords are linked again in each of them...
This is the script
(function () {
  // don't replace text within these tags
  const skipTags = { 'a': 1, 'style': 1, 'script': 1, 'iframe': 1, 'meta': 1, 'title': 1, 'img': 1, 'h1': 1 };

  /**
   * Depth-first search of `el` for the first text node matching `term`,
   * replacing that single match via replFn.
   * Returns true as soon as a replacement has been made anywhere below `el`,
   * so callers higher up the recursion stop as well. (Previously `found` was
   * local to each call and the result of the recursive call was discarded,
   * so every subtree got its own replacement.)
   */
  function findKW(el, term, replFn) {
    for (let i = 0; i < el.childNodes.length; i++) {
      const child = el.childNodes[i];
      if (child.nodeType === 1) { // ELEMENT_NODE
        const tag = child.nodeName.toLowerCase();
        if (!(tag in skipTags) && findKW(child, term, replFn)) {
          return true;
        }
      } else if (child.nodeType === 3) { // TEXT_NODE
        if (replaceKW(child, term, replFn)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Replace the first match of `term` in the text node `text` with the node
   * returned by replFn(matchedText). Returns true if a match was replaced.
   */
  function replaceKW(text, term, replFn) {
    // `term` carries the g flag, so exec() resumes from lastIndex; reset it
    // because the same regex is reused across text nodes.
    term.lastIndex = 0;
    const match = term.exec(text.data);
    if (!match) {
      return false;
    }
    // cut out the text node to replace
    const matchedNode = text.splitText(match.index);
    matchedNode.splitText(match[1].length);
    text.parentNode.replaceChild(replFn(match[1]), matchedNode);
    return true;
  }

  // Keywords to replace by a link
  const terms = ['keywords', 'words'];
  for (const keyword of terms) {
    findKW(
      document.body,
      new RegExp('\\b(' + keyword + ')\\b', 'gi'),
      function (match) {
        const link = document.createElement('a');
        link.href = 'http://www.okisurf.com/#q=' + encodeURIComponent(keyword);
        // NOTE(review): every generated link gets the same id, which makes
        // ids non-unique in the document — kept as in the original.
        link.id = '1';
        link.target = '_blank';
        // The match is plain text taken from a text node, so set it as text.
        link.textContent = match;
        return link;
      }
    );
  }
}());
Could anyone please help me stop the loop and replace only the first matching keyword? (I'm going crazy with those nodes and with the var found, which I can't share like a global while the recursive calls are working in the loop of the findKW() function...) And without any library (no jQuery or other).

You can return true when you have replaced the word, and test for it to stop the recursion:
if (child.nodeType == 1) { // ELEMENT_NODE
tag = child.nodeName.toLowerCase();
if (!(tag in skipTags)) {
// If `findKW` returns `true`, a replacement as taken place further down
// the hierarchy and we can stop iterating over the other nodes.
if (findKW(child, term, replFn)) {
return true;
}
}
} else if (child.nodeType == 3) { // TEXT_NODE
if (replaceKW(child, term, replFn)) {
return true;
}
}
And remove any reference to found in this function, it is not needed.
DEMO (I also updated the replaceKW function, you don't need to collect all matches if you are only using the first one anyway).

Use a break statement to exit from a loop block.
Example :
for(;;) {
if(condition)
break;
}
In your case you should add this on following position
else if (child.nodeType == 3)
{ // TEXT_NODE
found=replaceKW(child, term, replFn);
if(found)
break; // or alternately use return;
}

Develop Reference

JavaScript is the programming language of the Web.

Detecting Hebrew words in document via JavaScript - javascript

Related

How to ignore any element in Javascript? [duplicate]

Compare the 2nd strings characters with the 1st string in an array

Replace text in the middle of a TextNode with an element

Find and replace specific text characters across a document with JS

How to stop DOM searching loop while the first match is found?

Categories

Resources