Problem
I want to collect all table data using javascript and send back to some part of application.
html
<table>
<tbody>
<tr>
<td>
<span>hello</span>
</td>
<td>
<ul>
<li>1<span>:2000</span></li>
<li>2<span>:3000</span></li>
<li>
<span>link</span>
<span>
<a href=''>failed login</a>
</span>
</li>
</ul>
</td>
</tbody>
</table>
output Expected :
hello 1 :2000 :3000 link failed login
Thanks in advance.
You can use a recursive function like the following. There is no normalisation, you may wish to remove excess whitespace and insert spaces between the text from each element.
// Get the text within an element
// Doesn't do any normalising, returns a string
// of text as found.
function getTextRecursive(element) {
var text = [];
var el, els = element.childNodes;
for (var i=0, iLen=els.length; i<iLen; i++) {
el = els[i];
// May need to add other node types here that you don't want the text of
// Exclude script element content
if (el.nodeType == 1 && el.tagName && el.tagName.toLowerCase() != 'script') {
text.push(getTextRecursive(el));
// If working with XML, add nodeType 4 to get text from CDATA nodes
} else if (el.nodeType == 3) {
// Deal with extra whitespace and returns in text here if required
text.push(el.data);
}
}
// Insert a space between each text string and return
// a single string
return text.join(' ');
}
try this
$(document).ready(function() {
var texts = [];
$("table tr td").each(function(i, elem) {
texts.push($.trim($(elem).text()))
});
var str = texts.join(':').replace(/(\r\n|\n|\r)/gm, "")
alert(str);
});
see working example:-
http://jsfiddle.net/fL28r/84/
thansks
Related
Given an html like this:
<tr>
<th style="padding-right:1em">Location</th>
<td>
<span class="location">Lower reaches of the Geum River, Korea</span>
</td>
</tr>
How do I get Geum_River and Korea?
This is what I am doing at the moment:
countryLinks = doSelect("Location").siblings('td').find('a').attr('href');
function doSelect(text) {
return $wikiDOM.find(".infobox th").filter(function() {
return $(this).text() === text;
});
}
function countryList() {
let pattern = new RegExp('\/wiki\/');
string = countryLinks;
countryListLinks = string.replace(pattern, '');
console.log(countryListLinks);
}
if (doSelect('Location').length > 0 && doSelect('Date').length > 0) {
countryList();
};
I am splitting /wiki/ from the string and it works but I am only getting the first one Geum_River while I would expect all of the <a>s href.
You were only selecting first <a> element .href, .attr() returns a single value. Also second condition at if && doSelect('Date').length > 0 is false given HTML at Question.
You can use .map() and .get() to return an array of <a> element .href values, then pass countryList function to Array.prototype.forEach() to iterate .href values.
The RegExp should also be adjusted to replace all characters up to and including "wiki" '^.+\/wiki\/'.
function doSelect(text) {
return $(".infobox th").filter(function() {
return $(this).text() === text;
});
}
countryLinks = doSelect("Location").siblings('td')
.find('a').map(function(i, el) {return el.href}).get(); // .attr('href');
// we can pass this function to `.forEach()` or `.map()`
function countryList(string) {
let pattern = new RegExp('^.+\/wiki\/'); // adjust `RegExp`
// string = countryLinks;
countryListLinks = string.replace(pattern, '');
console.log(countryListLinks);
}
// the second condition is `false` given HTML at Question
if (doSelect('Location').length > 0 /* && doSelect('Date').length > 0 */) {
countryLinks.forEach(countryList);
};
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<table class="infobox">
<tr>
<th style="padding-right:1em">Location</th>
<td>
<span class="location">Lower reaches of the Geum River, Korea</span>
</td>
</tr>
</table>
Your main issue is when you call countryLinks = doSelect("Location").siblings('td').find('a').attr('href'); specifically, when you call the last bit .attr('href'); which the docs state this of
Description: Get the value of an attribute for the first element in the set of matched elements.
So basically, you're getting a collection of the links then reducing that collection to just the first element and return it's href attribute.
Here is how I would do this using .map() instead:
var $wikiDOM = $('.some-container');
var links = $.map($wikiDOM.find('td a'),function(link, i){
return (link.href || '').replace(/^.*[\\\/]/, '');
});
console.log(links);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<div class="some-container">
<table>
<tr>
<th style="padding-right:1em">Location</th>
<td>
<span class="location">Lower reaches of the Geum River, Korea</span>
</td>
</tr>
</table>
</div>
you can use jQuery each() to get all the hrefs in a array and then display then one by one using for loop.
Here is the code:
var hrefs = new Array();
jQuery('.location').find('a').each(function() {
hrefs.push(jQuery(this).attr('href'));
});
function countryList() {
let pattern = new RegExp('\/wiki\/');
for(var i=0; i < hrefs.length ; i++){
string = hrefs[i];
var countryListLinks = string.replace(pattern, '');
alert(countryListLinks);
}
}
countryList();
Complete Code, should look somthing like this:
function doSelect(text) {
return $wikiDOM.find(".infobox th").filter(function() {
return $(this).text() === text;
});
}
var hrefs = new Array();
jQuery('.location').find('a').each(function() {
hrefs.push(jQuery(this).attr('href'));
});
function countryList() {
let pattern = new RegExp('\/wiki\/');
for(var i=0; i < hrefs.length ; i++){
string = hrefs[i];
var countryListLinks = string.replace(pattern, '');
console.log(countryListLinks);
}
}
if (doSelect('Location').length > 0 && doSelect('Date').length > 0) {
countryList();
};
var hrefs = new Array();
jQuery('.location').find('a').each(function() {
hrefs.push(jQuery(this).attr('href'));
});
function countryList() {
let pattern = new RegExp('\/wiki\/');
for(var i=0; i < hrefs.length ; i++){
string = hrefs[i];
var countryListLinks = string.replace(pattern, '');
alert(countryListLinks);
}
}
countryList();
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.2/jquery.min.js"></script>
<tr>
<th style="padding-right:1em">Location</th>
<td>
<span class="location">Lower reaches of the Geum River, Korea</span>
</td>
</tr>
First of all, I'm going to pop an id onto your span element so that I can locate it easily in my script.
<span id="locations"
Next, I'm going to drop your implementation, and instead, iterate through the child elements of the span with id="locations". Next, I'll get the substring of the href of these elements that we want, and push them to an array.
var locations = document.getElementById("locations").getElementsByTagName('a');
var rawLocations = [];
for (i in locations) {
if (locations[i].href) {
var lastIndex = locations[i].href.lastIndexOf('/') + 1;
rawLocations.push(locations[i].href.substring(lastIndex));
}
}
Now, console.log(rawLocations); gives us what we want:
(2) ["Geum_River", "Korea"]
This was not as easy as I thought:-)
But here we go:
// cache the element, you could do QuerySelectorAll and iterate through the
// node list, but let's keep it simple
var ts = document.querySelector('a');
// This is caching the href attribute from the link
var garbage = ts.href;
// here you take the link and run the native string method
// to find the lastIndexOf, this is great if you suck at regex like myself
var n = garbage.lastIndexOf('/');
Here we extract only the part after lastIndexOf and cache it in "result"
var result = garbage.substring(n + 1);
alert(result);
I'm trying to scrape text from an HTML string by using container.innerText || container.textContent where container is the element from which I want to extract text.
Usually, the text I want to extract is located in <p> tags. So for the HTML below as an example:
<div id="container">
<p>This is the first sentence.</p>
<p>This is the second sentence.</p>
</div>
Using
var container = document.getElementById("container");
var text = container.innerText || container.textContent; // the text I want
will return This is the first sentence.This is the second sentence. without a space between the first period and the start of the second sentence.
My overall goal is to parse text using the Stanford CoreNLP, but its parser cannot detect that these are 2 sentences because they are not separated by a space. Is there a better way of extracting text from HTML such that the sentences are separated by a space character?
The HTML I'm parsing will have the text I want mostly in <p> tags, but the HTML may also contain <img>, <a>, and other tags embeeded between <p> tags.
As a dirty hack, try using this:
container.innerHTML.replace(/<.*?>/g," ").replace(/ +/g," ");
This will replace all tags with a space, then collapse multiple spaces into a single one.
Note that if there is a > inside an attribute value, this will mess you up. Avoiding this problem will require more elaborate parsing, such as looping through all text nodes and putting them together.
Longer but more robust method:
function recurse(result, node) {
var c = node.childNodes, l = c.length, i;
for( i=0; i<l; i++) {
if( c[i].nodeType == 3) result += c.nodeValue + " ";
if( c[i].nodeType == 1) result = recurse(result, c[i]);
}
return result;
}
recurse(container);
Assuming I haven't made a stupid mistake, this will perform a depth-first search for text nodes, appending their contents to the result as it goes.
jQuery has the method text() that does what you want. Will this work for you?
I'm not sure if it fits for everything that's in your container but it works in my example. It will also take the text of a <a>-tag and appends it to the text.
Update 20.12.2020
If you're not using jQuery. You could implement the text method with vanilla js like this:
const nodes = Array.from(document.querySelectorAll("#container"));
const text = nodes
.filter((node) => !!node.textContent)
.map((node) => node.textContent)
.join(" ");
Using querySelectorAll("#container") to get every node in the container. Using Array.from so we can work with Array methods like filter, map & join.
Finally, generate the text by filtering out elements with-out textContent. Then use map to get each text and use join to add a space separator between the text.
$(function() {
var textToParse = $('#container').text();
$('#output').html(textToParse);
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<div id="container">
<p>This is the first sentence.</p>
<p>This is the second sentence.</p>
<img src="http://placehold.it/200x200" alt="Nice picture"></img>
<p>Third sentence.</p>
</div>
<h2>output:</h2>
<div id="output"></div>
You can use the following function to extract and process the text as shown. It basically goes through all the children nodes of the target element and the child nodes of the child nodes and so on ... adding spaces at appropriate points:
function getInnerText( sel ) {
var txt = '';
$( sel ).contents().each(function() {
var children = $(this).children();
txt += ' ' + this.nodeType === 3 ? this.nodeValue : children.length ? getInnerText( this ) : $(this).text();
});
return txt;
}
function getInnerText( sel ) {
var txt = '';
$( sel ).contents().each(function() {
var children = $(this).children();
txt += ' ' + this.nodeType === 3 ?
this.nodeValue : children.length ?
getInnerText( this ) : $(this).text();
});
return txt;
}
alert( getInnerText( '#container' ) );
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
<div id="container">
Some other sentence
<p>This is the first sentence.</p>
<p>This is the second sentence.</p>
</div>
You may use jQuery to traverse down the elements.
Here is the code :
$(document).ready(function()
{
var children = $("#container").find("*");
var text = "";
while (children.html() != undefined)
{
text += children.html()+"\n";
children = children.next();
}
alert(text);
});
Here is the fiddle : http://jsfiddle.net/69wezyc5/
I'm trying to a way to get the first number value present inside a table (and respective tbody), but it needs to be able to find the value the first number, and ignores all the tags it comes accross until it reaches the number value.
<table id="TableID">
<thead></thead>
<tbody>
<tr></tr>
<tr>
<td></td>
<td>
<div>
<div>
<span>
4031007
</span>
</div>
<div>
<span>
whatever
</span>
</div>
</div>
</td>
<td></td>
</tr>
</tbody>
</table>
in the above example, we would try to find 4031007, which is inside a <span>, but it could've been a <div> or something else. I need this without using JQuery. Any help?
You could do it the plain old way: make a recursive function that will return text of the first node which has a text content:
function findFirstNumber(node) {
// If this is a text node, return its contents. Trim it because there is
// whitespace between the elements that should be ignored
if (node.nodeType == Node.TEXT_NODE)
return node.textContent.trim();
// Iterate over all child nodes and finde the first one that has text in it
for (var child = node.firstChild; child; child = child.nextSibling) {
var content = firstText(child);
if (content && isNumber(content))
return content;
}
// No text found
return '';
}
function isNumber(value) {
return !!isNaN(value);
}
console.log(findFirstNumber(document.getElementById('TableID')));
I used the mdn page about Node to find out how to do this.
see fiddle (open your console)
How about a fancy find function that accepts regular expressions.
function findRegExp(start, reg, mod) {
if (! (reg && start)) return this;
return [].slice.call(start.querySelectorAll('*')).filter(function(elem) {
if (typeof reg == 'string')
reg = new RegExp(reg, mod ? mod : '');
var clone = elem.cloneNode(true),
child = clone.children;
for (var i=child.length; i--;)
clone.removeChild(child[i]);
var txt = clone.textContent.trim();
return reg.test(txt);
});
}
to be used like
var elems = findRegExp(document.getElementById('TableID'), /^\d+$/);
FIDDLE
and the jQuery version
$.fn.findRegExp = function(reg, mod) {
if (!reg) return this;
return this.find('*').addBack().filter(function() {
if (typeof reg == 'string')
reg = new RegExp(reg, mod ? mod : '');
var c = $(this).clone();
c.children().remove();
var txt = $.trim(c.text());
return reg.test(txt);
});
}
Then you can search for an element containing only numbers
$('#TableID').findRegExp(/^\d+$/);
FIDDLE
I'm working on a project where I need to replace all occurrences of a string with another string. However, I only want to replace the string if it is text. For example, I want to turn this...
<div id="container">
<h1>Hi</h1>
<h2 class="Hi">Test</h2>
Hi
</div>
into...
<div id="container">
<h1>Hello</h1>
<h2 class="Hi">Test</h2>
Hello
</div>
In that example all of the "Hi"s were turned into "Hello"s except for the "Hi" as the h2 class.
I have tried...
$("#container").html( $("#container").html().replace( /Hi/g, "Hello" ) )
... but that replaces all occurrences of "Hi" in the html as well
This:
$("#container").contents().each(function () {
if (this.nodeType === 3) this.nodeValue = $.trim($(this).text()).replace(/Hi/g, "Hello")
if (this.nodeType === 1) $(this).html( $(this).html().replace(/Hi/g, "Hello") )
})
Produces this:
<div id="container">
<h1>Hello</h1>
<h2 class="Hi">Test</h2>
Hello
</div>
jsFiddle example
Nice results with:
function str_replace_all(string, str_find, str_replace){
try{
return string.replace( new RegExp(str_find, "gi"), str_replace ) ;
} catch(ex){return string;}}
and easier to remember...
replacedstr = str.replace(/needtoreplace/gi, 'replacewith');
needtoreplace should not rounded by '
//Get all text nodes in a given container
//Source: http://stackoverflow.com/a/4399718/560114
function getTextNodesIn(node, includeWhitespaceNodes) {
var textNodes = [], nonWhitespaceMatcher = /\S/;
function getTextNodes(node) {
if (node.nodeType == 3) {
if (includeWhitespaceNodes || nonWhitespaceMatcher.test(node.nodeValue)) {
textNodes.push(node);
}
} else {
for (var i = 0, len = node.childNodes.length; i < len; ++i) {
getTextNodes(node.childNodes[i]);
}
}
}
getTextNodes(node);
return textNodes;
}
var textNodes = getTextNodesIn( $("#container")[0], false );
var i = textNodes.length;
var node;
while (i--) {
node = textNodes[i];
node.textContent = node.textContent.replace(/Hi/g, "Hello");
}
Note that this will also match words where "Hi" is only part of the word, e.g. "Hill". To match the whole word only, use /\bHi\b/g
here you go => http://jsfiddle.net/c3w6X/1/
var children='';
$('#container').children().each(function(){
$(this).html($(this).html().replace(/Hi/g,"Hello")); //change the text of the children
children=children+$(this)[0].outerHTML; //copy the changed child
});
var theText=$('#container').clone().children().remove().end().text(); //get the text outside of the child in the root of the element
$('#container').html(''); //empty the container
$('#container').append(children+theText.replace(/Hi/g,"Hello")); //add the changed text of the root and the changed children to the already emptied element
I have the following html elements from which I have to get some specific texts,
example "John Doe"
I'm a newbie in javascript but have been playing with getElementById etc but I can't seem to get this one right.
<div id="name">
<p><span id="nameheading">name: </span> John Doe</p>
</div>
Bellow is What I have tried:
function askInformation()
{
var nameHeading = document.getElementById("nameheading");
var paragraph = document.getElementsByTagName("p").item(0).innerHTML ;
var name = paragraph[4];
console.log(name); // prints letter (n)
}
I need help please
If you want to get the text following the span in the following:
<div id="name">
<p><span id="nameheading">name: </span> John Doe</p>
</div>
You can use something like:
// Get a reference to the span
var span = document.getElementById('nameheading');
// Get the following text
var text = span.nextSibling.data;
However that is highly dependent on the internal structure, it may be best to loop over text node children and collect the content of all of them. You may also want to trim leading and trailing white space.
You could also get a reference to the parent DIV and use a function like the following that collects the text children and ignores child elements:
// Return the text of the child text nodes of an element,
// but not descendant element text nodes
function getChildText(element) {
var children = element.childNodes;
var text = '';
for (var i=0, iLen=children.length; i<iLen; i++) {
if (children[i].nodeType == '3') {
text += children[i].data;
}
}
return text;
}
var text = getChildText(document.getElementById('name').getElementsByTagName('p')[0]);
or more concisely for hosts that support the querySelector interface:
var text = getChildText(document.querySelector('#name p'));
var paragraph = document.getElementsByTagName("p").item(0).innerHTML ;
var name = paragraph.replace('<span id="nameheading">name: </span>','').trim(); // John Doe