So I have been having some issues when trying to scrape Javascript values from a bs4 code.
Basically the javascript looks like
<script type="text/javascript">
var FancyboxI18nClose = 'Close';
var FancyboxI18nNext = 'Next';
var FancyboxI18nPrev = 'Previous';
var PS_CATALOG_MODE = false;
var ajaxsearch = true;
var attribute_anchor_separator = '-';
var blocksearch_type = 'top';
var combinationsFromController = {"163972":{"attributes_values":{"15":"40"},"attributes":[75],"price":0,"specific_price":false,"ecotax":0,"weight":0.6,"quantity":1,"reference":"IDP20059--IDPA163972","unit_impact":0,"minimal_quantity":"1","date_formatted":"","available_date":"","id_image":-1,"list":"'75'"}};
var comparator_max_item = 0;
</script>
and what I am trying to do here is to scrape the value var combinationsFromController = however what I tried to do is:
# Parse the fetched HTML, then inspect every <script type="text/javascript"> tag.
# NOTE(review): indentation was lost in this paste; the nesting of the lines below is implied.
bs4 = soup(requests.text, 'html.parser')
for nosto_sku_tag in bs4.find_all('script', {'type': 'text/javascript'}):
# Only the script whose text mentions the variable of interest is processed.
if 'combinationsFromController' in nosto_sku_tag.text.strip():
print(nosto_sku_tag)
# The group `(\{.*}?)` is greedy, so it already captures up to the last "}" before
# the final ";" on that line; the trailing [:-1] then removes the last captured
# character before json.loads sees the text.
# NOTE(review): that truncation is the likely cause of the JSON decode error - confirm.
# NOTE(review): iterating .values() while unpacking into two names looks like it was
# meant to be .items() - confirm.
for att, values in json.loads(
re.findall('var combinationsFromController = (\{.*}?);', nosto_sku_tag.text.strip())[0][:-1]).values():
print(values)
Which gives me an error of Expecting ',' delimiter: line 1 column 4112 (char 4111)
I realized that whenever I run
for nosto_sku_tag in bs4.find_all('script', {'type': 'text/javascript'}):
if 'combinationsFromController' in nosto_sku_tag.text.strip():
print(nosto_sku_tag)
print("---------")
The outprint gives me:
var FancyboxI18nClose = 'Close';
var FancyboxI18nNext = 'Next';
var FancyboxI18nPrev = 'Previous';
var PS_CATALOG_MODE = false;
var ajaxsearch = true;
var attribute_anchor_separator = '-';
var blocksearch_type = 'top';
var combinationsFromController = {"163972":{"attributes_values":{"15":"40"},"attributes":[75],"price":0,"specific_price":false,"ecotax":0,"weight":0.6,"quantity":1,"reference":"IDP20059--IDPA163972","unit_impact":0,"minimal_quantity":"1","date_formatted":"","available_date":"","id_image":-1,"list":"'75'"}};
var comparator_max_item = 0;
----------------------------
This seems to mean that the whole script is returned as a single block of text, which I believe needs to be split. However, I tried to use a regex for that, and it didn't help.
So my question is how am I able to scrape ONLY the value var combinationsFromController =?
Use the following regex pattern to isolate the entire javascript object which is assigned to that variable.
combinationsFromController = (.*?);
Try it here.
E.g.
import requests, re, json
# Download the page; `url` must be set by the caller to the product page address.
response = requests.get(url)
# Lazily capture everything between "combinationsFromController = " and the next ";".
# DOTALL lets the capture span line breaks.
pattern = re.compile(r'combinationsFromController = (.*?);', flags=re.DOTALL)
captured = pattern.findall(response.text)
# The first capture is the object literal assigned to the variable; decode it as JSON.
data = json.loads(captured[0])
Related
Say I have a simple HTML document with the following:
cake
pie
and results.html contains a variable that needs to be declared in JavaScript as 100 for the first link, 200 for the second, and so on.
Is there any way I can make it so that when you click on the first link, it opens results.html with a javascript variable set to cake, the second one opens results.html with the variable set to pie, and so on?
You can pass them in the query string. For example:
cake
pie
In your results.html page you get and parse the query string. For example:
// Take the URL's query component without its leading "?".
var queryStr = window.location.search.slice(1);
// Turn it into a key/value object.
var params = parseQueryStr(queryStr);
// Print the value that was passed as ?result=...
console.log(params.result);
/**
 * Parse a query string (without the leading "?") into a plain key/value object.
 *
 * Written as a function declaration so that it is hoisted and can be called
 * from code that appears earlier in the same script.
 *
 * @param {string} queryStr - e.g. "result=cake&size=2". Empty or missing input
 *     yields an empty object.
 * @returns {Object} Map of parameter name to raw (undecoded) value. A parameter
 *     given without "=" maps to undefined.
 */
function parseQueryStr(queryStr) {
    var params = {};
    if (!queryStr) {
        return params;
    }
    var queryArray = queryStr.split("&");
    for (var i = 0, l = queryArray.length; i < l; i++) {
        var pair = queryArray[i];
        // Skip empty segments produced by "a=1&&b=2" or a trailing "&".
        if (pair === "") {
            continue;
        }
        // Split on the FIRST "=" only, so values that contain "=" stay intact.
        var eq = pair.indexOf("=");
        if (eq === -1) {
            params[pair] = undefined;
        } else {
            params[pair.slice(0, eq)] = pair.slice(eq + 1);
        }
    }
    return params;
}
cake
pie
In results.html :
<script type="text/javascript">
/**
 * Decode a URL query string into an object.
 *
 * "+" is treated as a space, and both names and values are passed through
 * decodeURIComponent. A leading "?" or "&" before each pair is optional.
 *
 * @param {string} qs - Query string such as "?keyword=cake&x=1".
 * @returns {Object} Decoded parameter names mapped to decoded values.
 */
function getQueryParams(qs) {
    var pairPattern = /[?&]?([^=]+)=([^&]*)/g;
    var decoded = {};
    var spaced = qs.split("+").join(" ");
    var hit = pairPattern.exec(spaced);
    while (hit) {
        var name = decodeURIComponent(hit[1]);
        decoded[name] = decodeURIComponent(hit[2]);
        // The global flag makes exec() resume after the previous match.
        hit = pairPattern.exec(spaced);
    }
    return decoded;
}
// Parse the current page's query string and show the value of its "keyword" parameter.
var query = getQueryParams(document.location.search);
alert(query.keyword);
</script>
I have a very confusing javascript code
<script type='text/javascript'>
//<![CDATA[
// NOTE(review): "varvar" below is not valid JavaScript, and _9264 is used without a visible declaration; presumably a "var _9264 = '...'" assignment was truncated at that spot - confirm against the original page.
var _8336;varvar _1821=/[\x41\x42\x43\x44\x45\x46]/;var _2581=2;var _2576=_9264.charAt(_9264.length-1);var _9238;var _1967=_9264.split(_1821);var _5737=[String.fromCharCode,isNaN,parseInt,String];_1967[1]=_5737[_2581+1](_5737[_2581](_1967[1])/21);var _4557=(_2581==7)?String:eval;_9238='';_11=_5737[_2581](_1967[0])/_5737[_2581](_1967[1]);for(_8336=3;_8336<_11;_8336++)_9238+=(_5737[_2581-2]((_5737[_2581](_1967[_8336])+_5737[_2581](_1967[2])+_5737[_2581](_1967[1]))/_5737[_2581](_1967[1])-_5737[_2581](_1967[2])+_5737[_2581](_1967[1])-1));var _1392='_5001';var _6523='_1392=_9238';function _7340(_8036){_4557(_7420);_7340(_7857);_7857(_6523);_7340(_1392);}var _7420='_7340=_4557';var _7857='_7857=_7340';_7340(_2576);
//]]>
</script>
I want to convert these very confusing codes into something readable so that i can work on it and edit it.
First, make the code more readable using http://jsbeautifier.org/:
// Loop index for the decoding loop below.
var _8336;
// Encoded payload; the actual string literal is elided in this listing.
var _9264 = /* [long hexadecimal string here] */;
// Separator pattern: any single letter A-F (\x41-\x46).
var _1821 = /[\x41\x42\x43\x44\x45\x46]/;
// Index into the function table _5737; with the value 2, _5737[_2581] is parseInt.
var _2581 = 2;
// Last character of the payload; passed to _7340 at the bottom (that parameter is never read).
var _2576 = _9264.charAt(_9264.length - 1);
// Accumulates the decoded text.
var _9238;
// Payload split into chunks on the A-F separators.
var _1967 = _9264.split(_1821);
// Function table: [0] String.fromCharCode, [1] isNaN, [2] parseInt, [3] String.
var _5737 = [String.fromCharCode, isNaN, parseInt, String];
// Replaces chunk 1 with String(parseInt(chunk 1) / 21).
_1967[1] = _5737[_2581 + 1](_5737[_2581](_1967[1]) / 21);
// _2581 is 2, not 7, so this selects eval.
var _4557 = (_2581 == 7) ? String : eval;
_9238 = '';
// Loop bound: parseInt(chunk 0) / parseInt(chunk 1). Assigned without var, so _11 is an implicit global.
_11 = _5737[_2581](_1967[0]) / _5737[_2581](_1967[1]);
// For each chunk index from 3 up to _11, derive a character code from chunks [i], [1] and [2]
// and append String.fromCharCode(code) to _9238.
for (_8336 = 3; _8336 < _11; _8336++) _9238 += (_5737[_2581 - 2]((_5737[_2581](_1967[_8336]) + _5737[_2581](_1967[2]) + _5737[_2581](_1967[1])) / _5737[_2581](_1967[1]) - _5737[_2581](_1967[2]) + _5737[_2581](_1967[1]) - 1));
// Placeholder value; overwritten with the decoded text inside _7340.
var _1392 = '_5001';
// Source text evaluated inside _7340 to copy the decoded text into _1392.
var _6523 = '_1392=_9238';
// Bootstrap: evaluates the small source strings defined around it, then the decoded text.
// The comments inside assume a classic (non-module) script where these vars are globals.
function _7340(_8036) {
// eval('_7340=_4557'): rebinds _7340 to eval.
_4557(_7420);
// _7340 is now eval, so this is eval('_7857=_7340'): rebinds _7857 to eval.
_7340(_7857);
// eval('_1392=_9238'): _1392 now holds the decoded text.
_7857(_6523);
// Evaluates the decoded text. NOTE(review): this executes the hidden code; log _1392 instead to inspect it.
_7340(_1392);
}
// Source strings consumed by the first two calls inside _7340.
var _7420 = '_7340=_4557';
var _7857 = '_7857=_7340';
// Kick off the bootstrap.
_7340(_2576);
After trying a bit to understand how it works, you can determine that _7340 is assigned to eval, _1392 is assigned to the decrypted string, and you simply have to replace _7340(_1392); by console.log(_1392); in order to see the deobfuscated code in your browser's web console.
This line is breaking code highlighting in HTML/JS syntax highlighter in Gedit editor. This is a variable declared in the <script> tag:
var HTML_FRG6 = '"/></li>';
I know something's wrong with it, I just cant figure out what!
<!DOCTYPE html>
<html>
<head>
<title>News</title>
<link rel="stylesheet" href="css-js/jquery.mobile-1.0a2.min.css" />
<script src="css-js/jquery-1.4.4.min.js"></script>
<script src="css-js/jquery.mobile-1.0a2.min.js"></script>
</head>
<body>
<script>
// constants
// Short string literals shared by the rest of the script (their call sites are not visible here).
var COMMA = ',';
var EMPTY = '';
var REFRESH = 'refresh';
// Element / tag names.
var LI = 'li';
var PAR = 'p';
var ID = 'id';
var ITEM = 'item';
var TITLE = 'title';
var CATEGORY = 'category';
var DESCR = 'description';
// Id prefix and suffixes - presumably combined to build element ids; TODO confirm against the code that uses them.
var CAT_ = 'cat_';
var _D = '_d';
var _LI = '_li';
var _A = '_a';
// Request method and data type names.
var GET = 'GET';
var XML = 'xml';
// HTML fragments - presumably concatenated with ids/text in between to build list items; TODO confirm.
var HTML_FRG1 = '<li id="';
var HTML_FRG2 = '"><h3><a id="';
var HTML_FRG3 = '" href="#">';
var HTML_FRG4 = '</a></h3><p id="';
var HTML_FRG5 = '"></p><a href="#" data-transition="slideup" id="';
// Valid JavaScript: a single-quoted string that contains one double quote followed by "/></li>".
var HTML_FRG6 = '"/></li>';
var HTML_FRG7 = '<p>';
var HTML_FRG8 = '</p><hr></hr>';
// Feed address prefix, routed through bridge.php via its "fwd" parameter.
var NEWS_URI = 'bridge.php?fwd=http://rss.news.yahoo.com/rss/';
// 2000 - presumably milliseconds, given the name; TODO confirm at the call site.
var TWO_SECONDS = 2000;
</script>
</div>
</body>
</html>
Lines after var HTML_FRG6 = '"/></li>'; are not being highlighted. I wonder why?
My lucky guess would be that the var HTML_FRG6 = '"/></li>'; line contains only one " sign which is the last one in the script, so everything after this is considered string in code highlighting. If it's true, this is just Gedit highlighting error (the syntax is fine) and the solution would be ad-hoc lucky guess. One of these could help:
put var fix = '"'; after the line
put var fix = '""'; after the line
move the line one line above
Try using it like this
var HTML_FRG6 = "\"/></li>";
It's the same thing, but the " is escaped.
/**
 * Replace every emoticon in a piece of text/HTML with an <img> tag pointing
 * at the matching GIF bundled with the extension.
 *
 * All occurrences are replaced in a single pass using one combined regular
 * expression, so text that has already been inserted (the <img> markup) is
 * never scanned again.
 *
 * @param {string} getString - Text or HTML to process. null/undefined is
 *     treated as an empty string.
 * @returns {string} The input with each emoticon replaced by
 *     "<img src='...images/NAME.gif' />".
 */
function changeText(getString) {
    // Built lazily and cached on the function so the table and regex are created only once.
    var lookup = changeText.smileyLookup || (changeText.smileyLookup = buildSmileyLookup());
    var text = getString == null ? "" : String(getString);
    return text.replace(lookup.pattern, function (smiley) {
        return "<img src='" + chrome.extension.getURL("images/" + lookup.images[smiley] + ".gif") + "' />";
    });
}

/**
 * Build the emoticon table and the global regex that matches any of its keys.
 *
 * @returns {{images: Object, pattern: RegExp}} Emoticon-to-image-name map and matcher.
 */
function buildSmileyLookup() {
    var images = {
        ":)": "ab", ":-)": "ab",
        ":(": "ac", ":-(": "ac",
        "B)": "af", "B-)": "af",
        ";(": "crygirl2", ";-(": "crygirl2",
        ":-*": "aw", ":*": "aw",
        ":D": "ag", ":-D": "ag",
        "(devil)": "aq", "(wtf)": "ai",
        "(sos)": "bc", "(boom)": "bb",
        "(rofl)": "bj", "xD": "bj",
        "(music)": "ar", "(angel)": "aa",
        "(beer)": "az", "(omg)": "bu",
        "(dance)": "bo", "(idiot)": "bm",
        "(clap)": "bi", "(gotkiss)": "as"
    };
    var alternatives = Object.keys(images)
        // Longest first, so a longer emoticon always wins over a shorter one that is its prefix.
        .sort(function (a, b) { return b.length - a.length; })
        // Emoticons are full of regex metacharacters such as ( ) * and must be escaped.
        .map(function (key) { return key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); });
    return { images: images, pattern: new RegExp(alternatives.join("|"), "g") };
}
Hi everyone, I need to optimize this code into something simpler, so I am trying to avoid the for loop.
"var replaced" is a huge html code (content of div that contains 100 lines of messages with date, username, userinfo(tooltip), message,.....)
This code is a piece from my chrome extension. so i cant do it php side.
You can use a single giant regex, of the form /:\)|:\(|.../g, then pass a callback as the replacement that looks up the match in your lookup object.
I need to find path information from a string using jquery and push the results into an array.
The string output looks like;
"var rsr = Raphael('rsr', '270', '266');
var path_a = rsr.path('M144.869,199c0,7.659-5.479,13.139-13.14,
13.139 c-7.659,0-14.598-5.479-14.598-13.139s6.209-13.139,13.869-
13.139S144.869,191.341,144.869,199z'); path_a.attr({fill: 'non .... etc"
I need to retrieve every occurrence of the path information with in rsr.path() ;
How would i go about achieving this? using $.each on my string with a regex?
thanks cam
JavaScript will let you do this with a regular expression:
// Sample input: generated Raphael code containing one rsr.path('...') call.
var s = " var rsr = Raphael('rsr', '270', '266');" +
    " var path_a = rsr.path('M144.869,199c0,7.659-5.479,13.139-13.14, 13.139 c-7.659,0-14.598-5.479-14.598-13.139s6.209-13.139,13.869- 13.139S144.869,191.341,144.869,199z');" +
    " path_a.attr({fill: 'non .... etc";
// Lazily match from the literal "M144.869" up to the first following "869,199z";
// the group captures what lies between those two literals.
var regex = /M144\.869(.+?)869,199z/g;
// First match only: index 0 is the whole match, index 1 is the captured group.
var matches = regex.exec(s);
matches[0] then contains the full match, and matches[1] contains the capture group.
i found this worked..
// Once the DOM is ready, intercept submission of the SVG conversion form:
// pass the textarea contents to process() and cancel the default submit action.
$(document).ready(function () {
    var $forms = $('form.svg-convert');
    $forms.submit(function (event) {
        var svgSource = $(this).find('textarea').val();
        process(svgSource);
        event.preventDefault();
    });
});
/**
 * Collect the argument of every `path(...)` call found in a piece of
 * generated Raphael code.
 *
 * @param {string} c - Source text, e.g. "var p = rsr.path('M1,2z');".
 *     null/undefined is treated as an empty string.
 * @returns {string[]} One entry per `path(` occurrence, in order of
 *     appearance, with surrounding whitespace and a matching pair of
 *     enclosing quotes removed. Empty when nothing is found.
 */
var process = function (c) {
    var content = c == null ? '' : String(c);
    // Everything before the first "path(" is not a path argument, so drop it.
    var segments = content.split('path(');
    segments.shift();
    var paths = [];
    for (var i = 0; i < segments.length; i++) {
        // Path data contains no ")", so the argument ends at the first one.
        var argument = segments[i].split(')')[0].trim();
        // Strip one matching pair of enclosing quotes; [\s\S] lets the data span lines.
        paths.push(argument.replace(/^(['"])([\s\S]*)\1$/, '$2'));
    }
    // Return the result so callers can actually use the collected paths.
    return paths;
};