parse all subpages of site

parse all subpages of site - javascript

I tried to parse all sub pages of site with cheerio and request.
Before parsing I think that I need to collect all sub pages urls in an array.
But I don't understand why my code doesn't do it. Maybe anybody know in what my problem? Thanks for help.
Code that must collect subpages:
var validUrls = [];
var expression = /^(\d{1,3}-\d{1,3})/g;
var regex = new RegExp(expression);
var url = "http://www.fullautomatic.ru/index.php/";
if (url.match(regex)) {
validUrls.push(url.match(regex));
console.log(validUrls);
}
I think code example2 will be better for my task but i can't solve problem
var expression = /^(\d{1,3}-\d{1,3})/g;
var regex = new RegExp(expression);
var url = "http://www.fullautomatic.ru/index.php/";
for (var i in url + regex) {
console.log(regex[i]);
}

Related

Getting out a product ID from URL - JavaScript

I'm trying to catch only a product ID from a URL. The ID is broke into 2 pieces:
https://www.example.org/category/first_part_of_ID-some_text-second_part_of_id
I was stuck with this function
var ft = "ft_";
var pod = "_";
var pageUrl = window.location.href;
var pageUrl1 = pageUrl.split("-")[1];
var pageUrl2 = pageUrl.replace("/","");
return pageUrl2;
}
Can anyone please help me clearing out everything except for the numbers? I tried with 2 different functions and putting it together, but it also didn't work.

This answer assumes the id you are trying to extract will always follow /category/ in the URL as well as that the first and second parts of the id always be located between the -some_text- part of the url
const url = window.location.href;
const urlAfterCategory = url.split('/')[4];
const id = `${urlAfterCategory.split('-')[0]}${urlAfterCategory.split('-')[2]}`
console.log(id); // Your id

Retrieve the hyperlink on a specific text string within Google Doc using Apps Script

I'm trying to use Google Apps Script to get the hyperlink from a specific string found in this Google Doc.
The string is ||stock||
The hyperlink is https://www.cnbc.com/quotes/?symbol=aapl&qsearchterm=aapl
Any help is greatly appreciated.
The code I'm currently using
function docReport() {
var doc = DocumentApp.openByUrl('https://docs.google.com/document/d/1XNiqgJ_hM2SWjoR-OTsq1w-ZFKvTIERDIs_NOWJpckY/edit');
var body = doc.getBody();
Logger.log(body.getParagraphs().length);//get the number of paragraphs
//https://www.udemy.com/apps-script-course/learn/v4/t/lecture/10208226?start=0
for (var x=0;x<body.getParagraphs();X++) {
var el = body.getChild(x);
Logger.log(el.getText());
}
var bodyText = body.getText();
var words = bodyText.match(/\S+/g); // get word count for body - https://stackoverflow.com/questions/33338667/function-for-word-count-in-google-docs-apps-script
Logger.log(words.length); // retruns # of words
var paragraphAll = body.getParagraphs(); // gets all paragraph objects in a document
Logger.log(paragraphAll);
var paragraphText = paragraphAll[1].getText().match(/\S+/g);
Logger.log(paragraphText.length); // retruns # of words in a paragraph
}

You want to retrieve hyperlink of the text of ||stock||.
If my understanding is correct, for example, how about this sample script? In your situation, the text value which has a link has already been known. The sample script uses this situation.
By the way, from your question, I'm not sure whether there are several values of ||stock|| in the document. So this sample script supposes that there are several values of ||stock|| in the document.
I think that there are several answers for your situation. So please think of this as one of them.
Sample script:
var searchValue = "\\|\\|stock\\|\\|"; // Search value
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1XNiqgJ_hM2SWjoR-OTsq1w-ZFKvTIERDIs_NOWJpckY/edit').getBody();
var searchedText = body.findText(searchValue);
var urls = [];
while (searchedText) {
var url = searchedText.getElement().asText().getLinkUrl(searchedText.getStartOffset());
urls.push(url);
searchedText = body.findText(searchValue, searchedText);
}
Logger.log(urls) // Results
Note:
If there is only one search value in the document, you can also use the following script.
var searchValue = "\\|\\|stock\\|\\|";
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1XNiqgJ_hM2SWjoR-OTsq1w-ZFKvTIERDIs_NOWJpckY/edit').getBody();
var searchedText = body.findText(searchValue);
var url = searchedText.getElement().asText().getLinkUrl(searchedText.getStartOffset());
Logger.log(url)
References:
findText()
getLinkUrl()
If I misunderstand your question, please tell me. I would like to modify it.

error : "Cannot convert Array to Object" Data Scraping script

I'm new to Javascript and even newer to google script, so please be comprehensive :)
I'm trying to build a little script to scrap some data from a bunch of URL. I' using Parser library. Here is what I've:
function getArray() {
var newData = new Array();
var sheet = SpreadsheetApp.openById('my_id').getSheetByName('Sheet4');
var urls = sheet.getRange(1,1,5,5).getValues();
var fromText = '<span class="nb-shares">';
var toText = '</span>';
for(i in urls){
var url = urls[i];
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser
.data(content)
.from(fromText)
.to(toText)
.build();
newData.push(scraped);}
var sheet2 = SpreadsheetApp.openById('my_id').getSheetByName('Sheet5');
sheet2.getRange(5, 1, newData.length, newData[1].length).setValues(newData);
}
It return me the following error : Cannot convert Array to Object
What I'm trying to do is looping on an URLs array so I can scrap some data from each one of these URL and return the results in my sheet.

Try changing newData.push(scraped) to newData.push([scraped])

Extracting img urls from webpage using Google Apps Script

This is an Apps Script that goes through a webpage and collects img urls that are inside some div of a special class.
function getIMGs(url){
var url = 'url'
var result = UrlFetchApp.fetch(url);
if (result.getResponseCode() == 200) {
var doc = Xml.parse(result, true);
var bodyHtml = doc.html.body.toXmlString();
var doc = XmlService.parse(bodyHtml);
var html = doc.getRootElement();
var thumbs = getElementsByClassName(html, 'thumb');
var sheet = SpreadsheetApp.getActiveSheet();
for (i in Thumbs) {
var output = '';
var linksInMenu = getElementsByTagName(thumbs[i], 'img');
for(i in linksInMenu) {
output += XmlService.getRawFormat().format(linksInMenu[i]);
}
var linkRegExp = /data-src="(.*?)"/;
var dataSrc = linkRegExp.exec(output);
sheet.appendRow([dataSrc[1]]);
}
}
So first the code gets the html, and uses an auxiliary function to get certain elements, which look like this:
<div class="thumb"><div class="loader"><span class="icon-uniE611"></span></div><img src="//xxx" data-src="https://xxx/8491a83b1cacc2401907997b5b93e433c03c91f.JPG" data-target="#image-slider" data-slide-to="0"></div>
Then the code gets the img elements, and finally extracts the data-src address via RegExp.
While this kinda works, I have a problem:
1) After 9 loops it crashes, on the appendRow line, as the last 4 Thumbs elements don't have data-src, hence what i'm trying to write into the spreadsheet is null.
Any solution for this? I have fixed it for the moment by just doing 9 iterations only of the For loop, but this is far from optimal, as it's not automated and required me to go through the page to count the elements with data-src.
Also, any suggestion of a more elegant solution will be appreciated! I will be really grateful for any helping hand!
Cheers

Get hyper link from a SharePoint list column

I want to get the hyperlinks from a column of a SharePoint 2010 List. Right now the code gives me the hyperlink and the description concatenated together.
`
$(xData.responseXML).SPFilterNode("z:row").each(function () {
content = new Object(); //get new object
content.title = $(this).attr("ows_Title");
content.url= $(this).attr("ows_Url");`
The content.url gives me "http://www.example. ca,%20http://www.example. ca". I have tried split and then the URL doesn't work.
Any help will be greatly appreciated.
Thanks

if I'm understanding you correctly, split should work... here's an encapsulated example:
<a id="yourlinkId">link</a>
<script>
var url = "http://www.example.ca,%20http://www.example.ca";
var n = url.split(",%20");
var a = document.getElementById("yourlinkId");
a.href = n[0];
</script>
combine that with your code and you end up with something like:
$(xData.responseXML).SPFilterNode("z:row").each(function () {
content = new Object(); //get new object
content.title = $(this).attr("ows_Title");
var url = $(this).attr("ows_Url");
var n = url.split(",%20");
content.url = n[0];

Develop Reference

JavaScript is the programming language of the Web.

parse all subpages of site - javascript

Related

Getting out a product ID from URL - JavaScript

Retrieve the hyperlink on a specific text string within Google Doc using Apps Script

error : "Cannot convert Array to Object" Data Scraping script

Extracting img urls from webpage using Google Apps Script

Get hyper link from a SharePoint list column

Categories

Resources