Counting Words Between Two Variable Strings - javascript

Total newbie + first time poster here with very little experience though I feel this problem is one I could solve with the help of some generous strangers.
I am querying a GDoc and attempting to create a function to count words between two strings for two possible end strings, for example:
Example #1
Definitive Title
*Count these words*
===============
OR
Example #2
Definitive Title
*Count these words*
Other words that are in a table
Definitive Title
*Count these other different words*
===============
In both of the above examples I looking to count the words between a pre-defined string and an end string.
If I ran the function that I am trying to create on Example #1 I am hoping it'd return 3 words. For Example #2 I'd hope that my function returns 8 words.
So far my function looks like this:
function doPost(e) {
var docUrl = e.parameter.docUrl
var text = DocumentApp.openByUrl(docUrl).getBody().getText()
var wordCount = text.split(" ").length
return ContentService.createTextOutput(wordCount.toString()).setMimeType(ContentService.MimeType.TEXT)
}
This returns a word count for the entire document. Any advice to point me in the right direction?

For more dynamic, appropriate and accurate solution, execute the following snippets before the split () function. Regular Expressions often used to provide dynamic solutions. It is a must have skill.
text = text.replace(/(^\s*)|(\s*$)/gi,""); // remove the start and end spaces of the string (like trim ())
text = text.replace(/[ ]{2,}/gi," "); // filter out one or more spaces
text = text.replace(/\n /,"\n"); // filter out news lines with spacing at beginning
wordCount = text.split(" ").length;

Here is a solution to your problem you can log the difference of characters and words or you can log the total amount of words or characters in the two sentaces. You are also going to want to put the bigger sentence on top, otherwise it will give you a negative number.
var x = "count these words";
var y = "count words";
function findCharDif(word1, word2) {
var word1length = word1.length;
var word2length = word2.length;
var difference = word1length - word2length;
var total = word1length + word2length;
console.log(difference);
console.log(total);
}
function findWordDif(sentence1, sentence2) {
var words1 = 0;
var words2 = 0;
for (var i = 0; i < sentence1.length; i++) {
if (sentence1[i] == " ") {
words1++;
} else {
continue
}
}
for (var a = 0; a < sentence2.length; a++) {
if (sentence2[a] == " ") {
words2++;
} else {
continue
}
}
var difference = (words1 + 1) - (words2 + 1); // this logs out the difference of words between the sentences
var totalWords = (words1 + 1) + (words2 + 1); // this logs out the total amount of words
console.log(difference);
console.log(totalWords);
}
findCharDif(x, y);
findWordDif(x, y);

The below code seems to have worked! Was able to sit down with someone and solve it with them:
function doPost(e) {
var docUrl = e.parameter.docUrl
/*
var text = DocumentApp.openByUrl(docUrl).getBody().getText()
var wordCount = text.split(" ").length
*/
var wordCount = countScenario2(docUrl);
return ContentService.createTextOutput(wordCount.toString()).setMimeType(ContentService.MimeType.TEXT)
}
/**
* Count the words from Start Test to a table or ====
*/
function countScenario2(docUrl) {
//var docUrl = 'https://docs.google.com/document/d/';
var doc = DocumentApp.openByUrl(docUrl);
var body = doc.getBody();
var reference = body.findText('Start Text');
var start = getIndex('Start Text', body);
var tables = body.getTables();
var count = 0;
for(var j = 1; j < tables.length ; j ++) {
var end = body.getChildIndex(tables[j]);
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
var reference = body.findText('Start Text', reference);
var element = reference.getElement();
var start = body.getChildIndex(element.getParent());
}
var end = getIndex('=========================================================', body);
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
return count ;
}
/**
* This will return the index of the element
*
* #param {string} keyword The text to be found
* #param {Body} body This is the body of the document
*/
function getIndex(keyword, body, previous) {
var reference = body.findText(keyword, previous);
var element = reference.getElement();
return body.getChildIndex(element.getParent());
}
/************ */
function testPost(){
var e = {parameter:{docUrl:'https://docs.google.com/document/d/'}};
var result = doPost(e);
console.log(JSON.stringify(result.getContent()));}
/**
* Count the words from Start Text to ====
*/
function countScenario1(docUrl) {
//var docUrl = 'https://docs.google.com/document/d/';
var doc = DocumentApp.openByUrl(docUrl);
var body = doc.getBody();
var start = getIndex('Start Text', body);
var end = getIndex('=========================================================', body);
var count = 0;
for (var i = start + 1; i < end; i++) {
var element = body.getChild(i);
var text = element.getText();
//if(text.length > 0) count += text.split(" ").filter(word => word !== ' ' && word !== '' && word !== ' ').length;
var match = text.match(/\b(\w+)\b/g);
count += (match) ? match.length : 0;
}
console.log(count);
return count;
}
function test(){
var docUrl = 'https://docs.google.com/document/d/';
var wordCount = countScenario2(docUrl);
console.log(wordCount);
}

As what #Rishabh K said in his answer, you should definitely want to replace trailing spaces and multiple spaces to avoid inaccurate results.
However on the other hand, I don't think it answers the OP's question. Correct me if I'm wrong but I think this is what you want:
var sample1 = `This is the start identifier
These words should be included
As well As these ones
Even this
Until it ends
now
Ending identifier
These words shouldn't be included
If any of these appears, the logic is wrong`;
var sample2 = sample1 + `
This is the start identifier
These some few words
should also be included in the result set
Ending identifier`;
var sample3 = sample2 + `
This is the start identifier
Although we have the start identifier above
These words shouldn't be included
because there is no corresponding end identifier`;
function getWordDiffBetween(source, str1, str2) {
// make sure newSource, str1 and str2 are all strings
var args = Array.prototype.slice.call(arguments);
args.forEach(function(str, idx) {
if (typeof str !== 'string') {
throw `Argument ${[idx + 1]} is not a string.`;
}
});
var startId = '<==start==>',
endId = '<==end==>';
var newSource = source.replace(new RegExp(str1, 'g'), startId) // replace the start identifier with our own
.replace(new RegExp(str2 + '|={2,}', 'g'), endId) // replace the end identifier with our own
.replace(/(^\s*)|(\s*$)/gi, "") // remove the start and end spaces of the string (like trim ())
.replace(/\s+/g, ' ') //replace all 1 or more spaces/newline/linefeed with a single space
//separate text into words which are separated by a space since we replaced all newlines with space
var words = newSource.split(' ');
// get the indexes where the start and end identifiers occured
var strOneIdx = getAllIndexes(words, startId, true);
var strTwoIdx = getAllIndexes(words, endId, true);
var results = [], // we will store our results here
i;
for (i = 0; i < strOneIdx.length; i++) {
var idxOne = strOneIdx[i]; // current index for str1
var idxTwo = strTwoIdx.find(x => x > idxOne);
//make sure that idxOne has a partner
if (idxTwo) {
var wordsInBetween = words.slice(idxOne + 1, idxTwo); //get range between idxOne and idxTwo
results = results.concat(wordsInBetween); // add the result
}
}
return results;
}
function getAllIndexes(arr, val) {
var indexes = [],
i;
for (i = 0; i < arr.length; i++) {
if (arr[i] === val) {
indexes.push(i);
}
}
return indexes;
}
var startIdentifier = 'This is the start identifier',
endIdentifier = 'Ending identifier',
wordResults = {
sample1: getWordDiffBetween(sample1, startIdentifier, endIdentifier),
sample2: getWordDiffBetween(sample2, startIdentifier, endIdentifier),
sample3: getWordDiffBetween(sample3, startIdentifier, endIdentifier) //should be equal to sample2
};
console.log(wordResults);
We have 2 functions - getWordDiffBetween and getAllIndexes. For explanation, check the comments I added in noteworthy lines.
Edit (updated snippet above):
It seems like you also want "====================" included as your end identifier. This can be done by changing the code:
.replace(new RegExp(str2, 'g'), endId) // replace the end identifier with our own
into
.replace(new RegExp(str2 + '|={2,}', 'g'), endId) // replace the end identifier with our own
which means match occurence of your <end string> or if there is 2 or more occurences of =. You can also change the number 2 in {2,} to your desired count.

Related

Group regex is missing last match

as given in sample code fstr need match all occurence of regEx_C and need to replace with some dynamic value. and make sure each match should have unique value.
in this sample fstr.match(regEX_C) matches 4 match and i need to replace with 1 dynamic value for each.
var_Viablecells_C = X
var_IVCC_C = Y
var_Viablecells_C = Z
var_IVCC_C = Z1
the value assigned should not match with anyone.
var fstr = '(Math.log(var_Viablecells_C-var_Viablecells_P)/(2-1)+ var_IVCC_C + var_Viablecells_C / var_IVCC_C)';
var regEx_C = /var_(\w+)_C/ig;
var c_val = 5.5;
function putExpValForRegExMatch(fstr, regEx, val) {
var all_match = fstr.match(regEx);
if (null == all_match) return fstr;
console.log(all_match);
for (var i = 0; i < all_match.length; i++) {
console.log(i);
console.log(fstr);
var current_match = regEx.exec(fstr);
if (current_match == null) return fstr;
console.log(current_match);
fstr = replaceRange(fstr, current_match.index, current_match.index + current_match[0].length - 1, '');
fstr = replaceAt(fstr, current_match.index, val);
console.log(fstr);
}
return fstr;
}
function replaceAt(string, index, replace) {
return string.substring(0, index) + replace + string.substring(index + 1);
}
function replaceRange(s, start, end, substitute) {
return s.substring(0, start) + substitute + s.substring(end);
}
console.log(putExpValForRegExMatch(fstr, regEx_C, c_val));
You can just use string.prototype.replace()
var fstr = '(Math.log(var_Viablecells_C-var_Viablecells_P)/(2-1)+ var_IVCC_C + var_Viablecells_C / var_IVCC_C)';
var regEx_C = /var_(\w+)_C/ig;
var c_val = 5.5;
output = fstr.replace(regEx_C,c_val)

Why is the comparison operator not finding matches in Google App Script?

I have a small function that takes a string (strOrder = 658492-1), splits it into just the first part before the '-' and compares it to values in a cell in a spreadsheet (which is also split). If it is a match, it should add the spreadsheet row to an array. Unfortunately, it is not finding any matches, but even the log shows they are the same. Help!
function getLinks2(strOrder) {
var mLinks = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Main');
var data = mLinks.getDataRange().getValues();
var boxOrder = '';
var shortOrder = '';
var sOrder = '';
var arrLinks = [];
data.shift();
strOrder = strOrder.replace(/\s/g, "");
var oTemp = strOrder.split('-');
boxOrder = oTemp[0];
Logger.log('boxOrder:x' + boxOrder + 'x');
for (var i = 0; i < data.length; i++) {
shortOrder = data[i][4].toString();
shortOrder = shortOrder.replace(/\s/g, "");
var soTemp = shortOrder.split('-');
sOrder = soTemp[0];
Logger.log('sOrder:x' + sOrder + 'x');
if (boxOrder == sOrder) {
Logger.log('match');
arrLinks.push(data[i]);
//the item matches the order id
} else {
//item does not match the order id
} // end if order id matches
} // end for loop through data
return arrLinks;
} // end function getLinks2
Here are the Log results:
boxOrder:x685492x
sOrder:x658492x
sOrder:x658492x
sOrder:x658492x
sOrder:x699249x
sOrder:x698406x
(I used the 'x' around the strings to check if there were extra spaces.)
What am I missing?
Thanks for any help!

Try to implement a simple text parser in javascript

i´m trying to bring a simple text parser from Java to Javascript.
The requierement is to transform a given csv file in to another format. The original file list a number of values according to one id in certain lines:
for example:
11111; 12; 23; 23 ;....
11111; 32; 12; 12 ;....
So the first value is an Id and the other values are according to this Id.
Now I need the same file with alle the values according to the one Id in a single line.
the result should be something like:
11111;12; 23; 23; 32; 12; 12 ;....
I already achieved this with a simple Java class:
public static void main(String[] args) throws Exception {
PrintWriter writer = new PrintWriter("t2_lines.csv", "UTF-8");
BufferedReader br = new BufferedReader(new FileReader("t2.csv"));
String previousId="";
String line;
while ((line = br.readLine()) != null) {
String [] words = line.split(";");
String id = words[0];
if (previousId.equals(id)){
// the loop starts at 4 to cut out some unneded values
for(int i=4;i<words.length;i++) {
writer.print(words[i]+";");
}
}else{
writer.println("");
for(String word : words)
writer.print(word+";");
previousId = id;
}
}
br.close();
writer.close();
}
and now I try to rebuild this thing in Javascript by read in a file from the client and present the result in a textfield - but unfortunately i´ve never implemented anything in Javascript before...
This is my approach so far:
window.onload = function () {
var fileInput = document.getElementById('fileInput');
var origFileDisplayArea = document.getElementById('origFileDisplayArea');
var reformatFileDisplayArea= document.getElementById('reformatFileDisplayArea');
fileInput.addEventListener('change', function (e) {
var file = fileInput.files[0];
var textType = /text.*/;
if (file.type.match(textType)) {
var reader = new FileReader();
reader.readAsText(file);
reader.onload = function (e) {
var result = reader.result;
var table = parse(result);
origFileDisplayArea.innerText = table;
}
} else {
origFileDisplayArea.innerText = "File not supported!"
}
});
}
function parse(input) {
var previousId = "";
var table = "";
if (typeof input !== "undefined")
var lines = input.split("\n");
for (var i = 0; i <= lines.length; i++) {
var line = lines[i];
if (typeof line !== "undefined")
var words = line.split(";");
console.log("words length: ", words.length);
for (var j = 0; j <= words.length; j++ ) {
var word = words[j];
if (typeof word !== "undefined") {
word.toString();
var id = words[0];
if (previousId === id) {
for (var jj = 4; jj <=words.length; jj++){
console.log("jj: " + jj)
table += words[jj]+";";
}
}else {
table += "\n";
for (var word in words) {
table += word + ";";
previousId = id;
}
}
}
}
}
return table;
}
But unfortunately i´m stucked now with undefined values and the whole thing took ages to run.
So any hints/help would be greatly appreciated.
Thanks in advance
Yes for the FileReader, I can't see a way to avoid that in this context. That doesn't look like where you have the problem.
As for parse, the split method can use up a lot of memory so I'd avoid using it on the whole file, and for..in is not designed for looping over an Array.
function parse(str_in) {
var i = -1, j = -1,
str_out = '',
last_id = '',
words;
str_in += '\n'; // not sure if necessary - let the last line pass `while`
// loop by seeking out the next new line
// i = old_index + 1
// j = next \n after old_index
// .slice(i, j) gives just the line
while (-1 !== (j = str_in.indexOf('\n', i = j + 1))) {
words = str_in.slice(i, j).split(';')
// loop words to trim whitespace here if you want
if (last_id === words[0]) // throw away first item if on the same id
words = words.slice(1);
else {
last_id = words[0];
if (str_out.length) // lazy prevent first char newline
str_out += '\n';
}
str_out += words.join(';'); // if you trimmed witespace, re-add here
// if you don't have a final semicolon, add it too
}
return str_out;
}
Now
parse('11111; 12; 23; 23 ;\n11111; 32; 12; 12 ;');
// "11111; 12; 23; 23 ; 32; 12; 12 ;"
Alternatively, you might find it easier to write methods similar to what you're used to in Java so you can work with minimal changes, e.g.
function ReadLineGenerator(text) {
var start = -1, end = -1;
return function readLine() {
if (end < start) {
start = end = -1;
return null;
}
start = end + 1;
end = text.indexOf('\n', start);
if (end !== -1)
return text.slice(start, end);
else
return text.slice(start);
};
}
// example usage
var str = 'a\nb\nc',
f = ReadLineGenerator(str),
line;
while (null !== (line = f()))
console.log(line);
// "a", "b", "c" logged
// line === null

JavaScript - Need help with string manipulation

say you have:
var foo = "donut [$25]"
What would you need to do in order to delete everything between and including the [ ].
so you get: foo = "donut" after the code is run.
So far I have tried most of the solutions below, but they all either do nothing or crash.
Maybe it's something with my code, please see below:
$('select').change(function () { OnSuccess(mydata); });
function OnSuccess(data) {
var total = 0;
$('select').each(function () {
var sov = parseInt($(this).find('option:selected').attr('value')) || 0; //Selected option value
var sop; //Selected Option Price
for (i = 0; i <= data.length; i++) {
if (data[i].partid == sov) {
sop = data[i].price;
total += sop;
$('#totalprice').html(total);
break;
}
};
//debugger;
$(this).find('option').each(function () {
// $(this).append('<span></span>');
var uov = parseInt($(this).attr('value')) || 0; //Unselected option value
var uop; //Unselected Option Price
for (d = 0; d <= data.length; d++) {
if (data[d].partid == uov) {
uop = data[d].price;
break;
}
}
//debugger;
var newtext = uop - sop;
//{ newtext = "" };
//if (newtext = 0) { newtext.toString; newtext = ""; };
//debugger;
var xtext = $(this).text().toString();
//if (xtext.match(/\[.*\]/) != null) {
xtext.replace(/\s*\[[\s\S]*?\]\s*/g, '').trim();
//}
// var temp = xtext.split('[')[0];
// var temp2 = xtext.split(']')[1];
// resultx = temp + temp2;
if (newtext != 0) {
//xtext.replace(/[.*?]/, "");
$(this).attr("text", xtext + " " + "[" + "$" + newtext + "]");
};
});
});
};
You can also use a regular expression, as Jon Martin pointed out:
var yum = "donut[$25]";
yum.replace(/[.*?]/, ""); // returns "donut"
Alternatively:
var temp = foo.split('[')[0];
var temp2 = foo.split(']')[1];
foo = temp + temp2;
You can use regular expressions (the RegExp() object) to match strings.
var foo = "donut[$25]";
foo.match(/\[.*\]/);
The above will return an array of every item in [square brackets], in this case ["[$25]"].
To just get one result as a string, specify the first index like so:
foo.match(/\[.*\]/)[0];
The above will return "[$25]"
Edit: You know what? I completely misread which bit of the string you're after. This is what you're after:
var foo = "donut[$25]";
foo.match(/\w*/)[0];
How about simply;
var yum = "donut[$25]";
print( yum.substr(0, yum.indexOf("[")) );
>>donut
var begin = foo.search("[");
var end = foo.search("]");
var result = foo.substr(0, begin) + foo.substr(end+1); //Combine anything before [ and after ]
Should be ok right?
Your question leaves unspecified the treatment of the spaces before the [ character, anything after the ], will your string ever contain a linefeed character, multiple occurrences of [..], leading or trailing spaces.
The following will replace all occurrences of 'spaces [ ... ] spaces' with a single space, then it trims the result to remove any leading/trailing spaces.
v.replace (/\s*\[[\s\S]*?\]\s*/g, ' ').trim ();

What's the best way to count keywords in JavaScript?

What's the best and most efficient way to count keywords in JavaScript? Basically, I'd like to take a string and get the top N words or phrases that occur in the string, mainly for the use of suggesting tags. I'm looking more for conceptual hints or links to real-life examples than actual code, but I certainly wouldn't mind if you'd like to share code as well. If there are particular functions that would help, I'd also appreciate that.
Right now I think I'm at using the split() function to separate the string by spaces and then cleaning punctuation out with a regular expression. I'd also want it to be case-insensitive.
Cut, paste + execute demo:
var text = "Text to be examined to determine which n words are used the most";
// Find 'em!
var wordRegExp = /\w+(?:'\w{1,2})?/g;
var words = {};
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
var word = matches[0].toLowerCase();
if (typeof words[word] == "undefined")
{
words[word] = 1;
}
else
{
words[word]++;
}
}
// Sort 'em!
var wordList = [];
for (var word in words)
{
if (words.hasOwnProperty(word))
{
wordList.push([word, words[word]]);
}
}
wordList.sort(function(a, b) { return b[1] - a[1]; });
// Come back any time, straaanger!
var n = 10;
var message = ["The top " + n + " words are:"];
for (var i = 0; i < n; i++)
{
message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
(wordList[i][1] == 1 ? "" : "s"));
}
alert(message.join("\n"));
Reusable function:
function getTopNWords(text, n)
{
var wordRegExp = /\w+(?:'\w{1,2})?/g;
var words = {};
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
var word = matches[0].toLowerCase();
if (typeof words[word] == "undefined")
{
words[word] = 1;
}
else
{
words[word]++;
}
}
var wordList = [];
for (var word in words)
{
if (words.hasOwnProperty(word))
{
wordList.push([word, words[word]]);
}
}
wordList.sort(function(a, b) { return b[1] - a[1]; });
var topWords = [];
for (var i = 0; i < n; i++)
{
topWords.push(wordList[i][0]);
}
return topWords;
}
Once you have that array of words cleaned up, and let's say you call it wordArray:
var keywordRegistry = {};
for(var i = 0; i < wordArray.length; i++) {
if(keywordRegistry.hasOwnProperty(wordArray[i]) == false) {
keywordRegistry[wordArray[i]] = 0;
}
keywordRegistry[wordArray[i]] = keywordRegistry[wordArray[i]] + 1;
}
// now keywordRegistry will have, as properties, all of the
// words in your word array with their respective counts
// this will alert (choose something better than alert) all words and their counts
for(var keyword in keywordRegistry) {
alert("The keyword '" + keyword + "' occurred " + keywordRegistry[keyword] + " times");
}
That should give you the basics of doing this part of the work.
Try to split you string on words and count the resulting words, then sort on the counts.
This builds upon a previous answer by insin by only having one loop:
function top_words(text, n) {
// Split text on non word characters
var words = text.toLowerCase().split(/\W+/)
var positions = new Array()
var word_counts = new Array()
for (var i=0; i<words.length; i++) {
var word = words[i]
if (!word) {
continue
}
if (typeof positions[word] == 'undefined') {
positions[word] = word_counts.length
word_counts.push([word, 1])
} else {
word_counts[positions[word]][1]++
}
}
// Put most frequent words at the beginning.
word_counts.sort(function (a, b) {return b[1] - a[1]})
// Return the first n items
return word_counts.slice(0, n)
}
// Let's see if it works.
var text = "Words in here are repeated. Are repeated, repeated!"
alert(top_words(text, 3))
The result of the example is: [['repeated',3], ['are',2], ['words', 1]]
I would do exactly what you have mentioned above to isolate each word. I would then probably add each word as the index of an array with the number of occurrences as the value.
For example:
var a = new Array;
a[word] = a[word]?a[word]+1:1;
Now you know how many unique words there are (a.length) and how many occurrences of each word existed (a[word]).

Categories

Resources