Python HTML getElementsByClassName alike manipulate on file content

Python HTML getElementsByClassName alike manipulate on file content - javascript

I have saved the source code of a page to a file using Sikuli. I need a "roundup" on a batch of matrix style placed elements. But I don't want to calculate dimensions between them. I want URLs to type in location bar. So I wrote from scratch with a help of MZDN JavaScript implementation of such a "simple" operation. I don't want to use lxml. I want real native libraries - I mean I need "portable" script.
So I've googled a while and decided to ask a question at Stack OverFlow.
I don't want to use
split('<a href=')
magic.
I would like to do this in Python(in the most pythonic way):
var array = document.getElementsByClassName('another')
var j = array.length
for (i=0;i<j;i++) {
element = array[i];
url = element.getElementsByTagName('a')[0].href;
console.log(url);
}
var array = document.getElementsByClassName('else')
var j = array.length
for (i=0;i<j;i++) {
element = array[i];
url = element.getElementsByTagName('a')[0].href;
console.log(url);
}
Managed to do it with split. Python is for kids.
def read_file(filename):
fd = open(filename, 'r')
data = fd.read()
fd.close()
return data
def href(line):
url = line.split('a href=')[1].split('>')[0].strip().replace('"', '').replace("'", '')
return url
html = read_file('source.htm').split('\n')
for line in html:
if 'one' in line:
print href(line)
elif 'another' in line:
print href(line)
elif 'else' in line:
print href(line)

Related

Grab data from website HTML table and transfer to Google Sheets using App-Script

Ok, I know there are similar questions out there to mine, but so far I have yet to find any answers that work for me. What I am trying to do is gather data from an entire HTML table on the web (https://www.sports-reference.com/cbb/schools/indiana/2022-gamelogs.html) and then parse it/transfer it to a range in my Google Sheet. The code below is probably the closest thing I've found so far because at least it doesn't error out, but it will only find one string or value, not the whole table. I've found other answers where they use xmlservice.parse, however that doesn't work for me, I believe because the HTML format has issues that it can't parse. Does anyone have an idea of how to edit what I have below, or a whole new idea that may work for this website?
function SAMPLE() {
const url="http://www.sports-reference.com/cbb/schools/indiana/2022-gamelogs.html#sgl-basic?"
// Get all the static HTML text of the website
const res = UrlFetchApp.fetch(url, {muteHttpExceptions: true}).getContentText();
// Find the index of the string of the parameter we are searching for
index = res.search("td class");
// create a substring to only get the right number values ignoring all the HTML tags and classes
sub = res.substring(index+92,index+102);
Logger.log(sub);
return sub;
}
I understand that I can use importHTML natively in a Google Sheet, and that's what I'm currently doing. However I am doing this for over 350 webpage tables, and iterating through each one to load it and then copy the value to another sheet. App Script bogs down quite a bit when it is repeatedly waiting on Sheets to load an importHTMl and then grab some data and do it all over again on another url. I apologize for any formatting issues in this post or things I've done wrong, this is my first time posting here.
Edit: ok, I've found a method that works, but it's still much slower than I would like, because it is using Drive API to create a document with the HTML data and then parse and create an array from there. The Drive.Files.Insert line is the most time consuming part. Anyone have an idea of how to make this quicker? It may not seem that slow to you right now, but when I need to do this 350 times, it adds up.
function parseTablesFromHTML() {
var html = UrlFetchApp.fetch("https://www.sports-reference.com/cbb/schools/indiana/2022-gamelogs.html");
var docId = Drive.Files.insert(
{ title: "temporalDocument", mimeType: MimeType.GOOGLE_DOCS },
html.getBlob()
).id;
var tables = DocumentApp.openById(docId)
.getBody()
.getTables();
var res = tables.map(function(table) {
var values = [];
for (var row = 0; row < table.getNumRows(); row++) {
var temp = [];
var cols = table.getRow(row);
for (var col = 0; col < cols.getNumCells(); col++) {
temp.push(cols.getCell(col).getText());
}
values.push(temp);
}
return values;
});
Drive.Files.remove(docId);
var range=SpreadsheetApp.getActive().getSheetByName("Test").getRange(3,6,res[0].length,res[0][0].length);
range.setValues(res[0]);
SpreadsheetApp.flush();
}

Solution by formula
Try
=importhtml(url,"table",1)
Other solution by script
function importTableHTML() {
var url = 'https://www.sports-reference.com/cbb/schools/indiana/2022-gamelogs.html'
var html = '<table' + UrlFetchApp.fetch(url, {muteHttpExceptions: true}).getContentText().replace(/(\r\n|\n|\r|\t| )/gm,"").match(/(?<=\<table).*(?=\<\/table)/g) + '</table>';
var trs = [...html.matchAll(/<tr[\s\S\w]+?<\/tr>/g)];
var data = [];
for (var i=0;i<trs.length;i++){
var tds = [...trs[i][0].matchAll(/<(td|th)[\s\S\w]+?<\/(td|th)>/g)];
var prov = [];
for (var j=0;j<tds.length;j++){
donnee=tds[j][0].match(/(?<=\>).*(?=\<\/)/g)[0];
prov.push(stripTags(donnee));
}
data.push(prov);
}
return(data);
}
function stripTags(body) {
var regex = /(<([^>]+)>)/ig;
return body.replace(regex,"");
}

JavaScript Object - Split() method + '\n'

I reached 'fetch' method...
A simple question (complicated for me) ...
How to use 'fetch()' method, 'split()' and '\ n' together?
I will show you an example (i am here to learn and master some skills and i am not ashamed to ask):
I need to read and print the following data using the 'fetch' method:
from the following link 'https://v-dresevic.github.io/Advanced-JavaScript-Programming/data/students.txt' - it is necessary to read the data and print them on the page.
And that is quite clear to me! I managed to do that!
code: enter image description here
my result (wrong result): enter image description here
correct result: enter image description here
My question is:
After reading the data from the file, I have to parse them and create Student objects based on them.
Parsing can be done using the split () method of the String object.
It is best to divide the read text by line breaks, specifying "\ n" for the split () method parameter.
thanks in advance :)

here is a fast example of parsing your data to an array of objects as i think this is the only thing you ask here, from there you can loop that array and display the object as you need.
const url =
"https://v-dresevic.github.io/Advanced-JavaScript-Programming/data/students.txt";
let result = fetch(url)
.then((r) => r.text())
.then(process);
function process(result) {
const linesDescription = ["Name", "Address", "Phone", "Course"];
const array = [];
let obj = {};
var lines = result.split("\n");
let x = 0;
for(var line = 0; line < lines.length; line++){
obj[linesDescription[x]] = lines[line].trim();
x++;
if (x >= linesDescription.length) {
array.push(obj);
x = 0;
obj = {};
}
};
console.log(array);
}

Is there a way to replace text from a JSON object after it has been stringified?

I have some JSON data which contains some urls. I'm extracting these urls from the json by looping through the objects which works fine. The urls however have 'page: ' pre-pended to them which i am trying to replace with 'https://'.
I can't get the replace property to work and give me the same result each time.
I've tried using the replace() property in different way and am using the console.log to view my results. I've also tried to stringify the JSON as I hear this is a good thing to do in order to handle it.
Each time i'm still seeing the 'page: ' word and it hasn't been replaced.
function showTopArticles(jsonObj) {
var getEntries = jsonObj.feed.entry;
var stringified = JSON.stringify(getEntries);
console.log(getEntries);
for (var i = 0; i < getEntries.length; i++) {
var list = document.createElement('article');
var articleTitle = document.createElement('li');
var articleUrl = document.createElement('a');
articleTitle.textContent = getEntries[i].title.$t;
articleUrl.textContent = getEntries[i].content.$t;
articleUrl.textContent.replace("page: ", "https://");
console.log(articleUrl.textContent);
list.appendChild(articleTitle)+list.appendChild(articleUrl);
section.appendChild(list);
}
}
I expect the output url to be 'https://www.google.com' but instead im seeing 'page : www.google.com'

replace() returns a modified value, it does not modify the original string.
You want something like:
articleUrl.textContent = articleUrl.textContent.replace("page: ", "https://");

Getting a list from Django into Javascript as an array

I have a python list generated by my views.py in Django, and I would like to pass it to javascript in my HTML template.
I cannot seem to get it into javscript as an array... I need to evaluate if the list/array contains certain numbers, but it is coming over as a string.
I am passing the list to my HTML template like this:
def get_context_data(self, **kwargs):
context = super(dashboardView, self).get_context_data(**kwargs)
mylist = [10,22,33,45]
context['mylist'] = mylist
return context
When I use:
<h1 id = "list"> {{mylist}}</h1>
it shows up on the browser as
it shows up as [10,22,33,45]
Then in my template I am using javascript I have:
var mylist = document.getElementById("list").innerHTML;
for(i = 0; i < mylist.length; i++){
console.log(mylist[i])
};
this returns in the console:
'
[
1
0
,
2
2
........
I want:
10
22
33
45
I have tried to convert to JSON in python, and parse it in javascript, but can't seem to convert this string to an array, or keep getting errors.
Any ideas for the best method here?

If you are absolutely certain your list is safe, (by that I mean it never, ever includes anything entered by a user), it is very simple.
In your view:
context={"my_list": ["item1", "item2"]}
In your template:
{{ my_list|safe }}
Renders as:
['item1', 'item2']
For example:
{{ my_list|safe }}.forEach(item => {
console.log(item)
})
Impressively, if you pass a string with an apostrophe, Django automatically changes the quote types, so
context = {"my_list": ["item1", "it'em2"]}
renders as (note the double-quotes):
['item1', "it'em2"]
so
{{ my_list|safe }}.forEach
still works (tested on Django 3.2.6).
Even this works:
context = {"my_list": ["item1", "it'em\"2"]}
renders as:
['item1', 'it\'em"2']
so
{{ my_list|safe }}.forEach
still works.
However, as I said at the top, if your list might include input from a user, I still wouldn't trust this approach.

In views you can make your object as a JSON object:
import json
mylistraw = [10,22,33,45]
mylist = json.dumps(mylistraw)
context = {''mylistjson': mylist}
Now you can use your object in JavaScript:
var mylist = JSON.parse("{{mylistjson}}")

Django has the json_script template tag that addresses XSS vulnerabilities by escaping the <, >, and & characters.
Django json_script docs here

You can simply use tojson tag to convert python list to js array.
It will be like -
var mylist = {{mylist|tojson}};
for(i = 0; i < mylist.length; i++){
console.log(mylist[i])
};
Try it and let me know if any problem occurs.

In your case, a simple way is to send the list as string ','.join(mylist). And then in your templates, you could simply use split(',') in js.
views
mylist = [10,22,33,45]
context['mylist'] = ','.join([str(i) for i in mylist])
html & js
var mylist = document.getElementById("list").innerHTML;
mylist = mylist.split(',')
for(i = 0; i < mylist.length; i++){
console.log(mylist[i])
};
Or in case your js is in the template as well
var mylist = '{{mylist}}'.split(',');
for(i = 0; i < mylist.length; i++){
console.log(mylist[i])
};

Set the list value in JavaScript and not HTML.
<script>
var mylist = {{mylist}};
for(i = 0; i < mylist.length; i++){
console.log(mylist[i])
};
<\ script>
You need to do this in your HTML template file and not in an external JS file.

Extracting img urls from webpage using Google Apps Script

This is an Apps Script that goes through a webpage and collects img urls that are inside some div of a special class.
function getIMGs(url){
var url = 'url'
var result = UrlFetchApp.fetch(url);
if (result.getResponseCode() == 200) {
var doc = Xml.parse(result, true);
var bodyHtml = doc.html.body.toXmlString();
var doc = XmlService.parse(bodyHtml);
var html = doc.getRootElement();
var thumbs = getElementsByClassName(html, 'thumb');
var sheet = SpreadsheetApp.getActiveSheet();
for (i in Thumbs) {
var output = '';
var linksInMenu = getElementsByTagName(thumbs[i], 'img');
for(i in linksInMenu) {
output += XmlService.getRawFormat().format(linksInMenu[i]);
}
var linkRegExp = /data-src="(.*?)"/;
var dataSrc = linkRegExp.exec(output);
sheet.appendRow([dataSrc[1]]);
}
}
So first the code gets the html, and uses an auxiliary function to get certain elements, which look like this:
<div class="thumb"><div class="loader"><span class="icon-uniE611"></span></div><img src="//xxx" data-src="https://xxx/8491a83b1cacc2401907997b5b93e433c03c91f.JPG" data-target="#image-slider" data-slide-to="0"></div>
Then the code gets the img elements, and finally extracts the data-src address via RegExp.
While this kinda works, I have a problem:
1) After 9 loops it crashes, on the appendRow line, as the last 4 Thumbs elements don't have data-src, hence what i'm trying to write into the spreadsheet is null.
Any solution for this? I have fixed it for the moment by just doing 9 iterations only of the For loop, but this is far from optimal, as it's not automated and required me to go through the page to count the elements with data-src.
Also, any suggestion of a more elegant solution will be appreciated! I will be really grateful for any helping hand!
Cheers

Develop Reference

JavaScript is the programming language of the Web.

Python HTML getElementsByClassName alike manipulate on file content - javascript

Related

Grab data from website HTML table and transfer to Google Sheets using App-Script

JavaScript Object - Split() method + '\n'

Is there a way to replace text from a JSON object after it has been stringified?

Getting a list from Django into Javascript as an array

Extracting img urls from webpage using Google Apps Script

Categories

Resources