Issues with request, and cheerio when web-scraping - javascript

I'm trying to write a code that makes a request to a website, for webscraping
So this are the steps:
Here First part of Code STARTS
The program makes the request to the mainURL
The program selects some objects from the html of the mainURL, and store them in an array of objects(advert), on of the properties of the object, is it's link, which we'll call numberURL, that the code automatically selects using a css selector, the amount of objects is something like 80-90;
The program makes requests to every numberURL(80-90 requests),
and for each of them it does set another properties to the same object, and selects another link, that we'll call accountURL
The program creates an CSV file where it writes every object in different rows
Here First part of Code ENDS
So actually the first part works pretty good, it doesn't have any issues, but the second part does
Here Second part of Code STARTS
The program makes requests to every accountURL from the previous object
The program selects some objects from the html of the accountURL, and stores them in an another array of another objects(account), also using CSS selectors
The program should console.log() all the account objects
Here Second part of Code ENDS
But the second part does have some bugs, because when console.logging the objects we see that the objects properties doesn't changed their default value.
So in debugging purposes I took one advert object and putted it's value manually from the code
post[0].link = 'https://999.md/ru/profile/denisserj'
Finally when running the code for this object it actually works correctly, so it shows the changed properties, but for the rest of them it doesn't.
I tried to set some Timeouts, thinking that the code tries to read the link, before the second request finished, but no effects
I also tried to console.log the link, to see if it exists in the array, so it actually exists there, but also no effect.
Finally here is the code:
// CLASSES
class advert {
constructor() {
this.id = 0;
this.tile = new String();
this.link = new String();
this.phone = new String();
this.account = new String();
this.accountLink = new String();
this.text = new String();
this.operator = new String();
}
show() {
console.log(this.id, this.title, this.link, this.phone, this.account, this.accountLink, this.text, this.operator);
}
}
class account {
constructor() {
this.name = 0;
this.createdAt = 0;
this.phone = [];
this.ads = [];
this.adsNumber = 0;
}
show() {
console.log(this.name, this.createdAt, this.phone, this.ads, this.adsNumber);
}
}
// HEADERS
const mainRequest = require('request');
const auxRequest = require('request');
const cheerio1 = require('cheerio');
const cheerio2 = require('cheerio');
const fs = require('fs');
const fs2 = require('fs');
const adFile = fs.createWriteStream('anunturi.csv');
const accFile = fs2.createWriteStream('conturi.csv');
// SETTINGS
const host = 'https://999.md'
const category = 'https://999.md/ru/list/transport/cars'
const timeLimit = 60; //seconds
// VARIABLES
let post = [];
let postNumber = 0;
let acc = [];
// FUNCTIONS
function deleteFromArray(j) {
post.splice(j, 1);
}
function number(i) {
let category = post[i].link;
auxRequest(category, (error, response, html) => {
if (!error && response.statusCode == 200) {
const $ = cheerio1.load(html);
let phone;
const siteTitle = $('strong').each((id, el) => {
phone = $(el).text();
});
const txt = $('.adPage__content__description').html();
const person = $('.adPage__header__stats').find('.adPage__header__stats__owner').text();
const linkToPerson = host + $('.adPage__header__stats').find('.adPage__header__stats__owner').find('a').attr('href');
post[i].phone = phone;
post[i].account = person;
post[i].accountLink = linkToPerson;
post[i].text = txt;
if (i == postNumber) {
console.log('1. Number Putting done')
writeToFileAd(accountPutter, writeToFileAccount);
}
}
});
}
function writeToFileAd() {
adFile.write('ID, Titlu, Link, Text, Cont, LinkCont, Operator\n')
for (let i = 0; i <= postNumber; i++) {
adFile.write(`${post[i].id}, ${post[i].title}, ${post[i].link}, ${post[i].phone}, ${post[i].account}, ${post[i].accountLink}, ${post[i].operator}\n`);
}
console.log('2. Write To File Ad done')
accountPutter();
}
function accountAnalyzis(i) {
let category = post[i].link;
const mainRequest = require('request');
category = category.replace('/ru/', '/ro/');
mainRequest(category, (error, response, html) => {
if (!error && response.statusCode == 200) {
const $ = cheerio2.load(html);
const name = $('.user-profile__sidebar-info__main-wrapper').find('.login-wrapper').text();
let createdAt = $('.date-registration').text();
createdAt = createdAt.replace('Pe site din ', '');
const phones = $('.user-profile__info__data').find('dd').each((id, el) => {
let phone = $(el).text();
acc[i].phone.push(phone);
});
const ads = $('.profile-ads-list-photo-item-title').find('a').each((id, el) => {
let ad = host + $(el).attr('href');
acc[i].ads.push(ad);
acc[i].adsNumber++;
});
acc[i].name = name;
acc[i].createdAt = createdAt;
console.log(name)
if (i == postNumber) {
console.log('3. Account Putting done')
writeToFileAccount();
}
}
});
}
function writeToFileAccount() {
for (let i = 0; i <= postNumber; i++) {
accFile.write(`${acc[i].name}, ${acc[i].createdAt}, ${acc[i].phone}, ${acc[i].ads}, ${acc[i].adsNumber}\n`);
}
console.log('4. Write to file Account done');
}
function numberPutter() {
for (let i = 0; i <= postNumber; i++) {
number(i);
}
}
function accountPutter() {
for (let i = 0; i <= postNumber; i++) {
accountAnalyzis(i);
}
}
// MAIN
mainRequest(category, (error, response, html) => {
let links = [];
for (let i = 0; i < 1000; i++) {
post[i] = new advert();
}
for (let i = 0; i < 1000; i++) {
acc[i] = new account();
}
if (!error && response.statusCode == 200) {
const $ = cheerio2.load(html);
const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {
const ref = host + $(el).children().attr('href');
const title = $(el).text();
post[id].id = id + 1;
post[id].title = title;
post[id].link = ref;
links[id] = ref;
postNumber = id;
});
post[0].link = 'https://999.md/ru/profile/denisserj'
numberPutter()
}
});

You have an error in line
const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {
What you actually want is .find('a').each...

Related

How to get xmlhttprequest results in a for loop

Good day all,
Please i'm trying to get the latency values of a ping using xmlhttprequest in a for loop for five consecutive latencies that will be stored in an global array in a react native application, after which when the for loop finishes, other codes can now execute, but it seems that the whole code just runs through without getting the array values from the initial for loop and the final result comes out as 0, then later, I start getting the array values due to the result of the xmlhttprequest. How can I ensure that I get the latency results first before now executing the remaining code. My code is below:
let latencies = [];
class App extends Component {
startScan = () => {
this.setState({
scanning: true,
});
this.getJitter();
}
getPing = () => {
var request = new XMLHttpRequest();
var startTime = new Date();
request.open(
'GET',
'http://dummy.restapiexample.com/api/v1/employees',
true,
);
request.send();
request.onreadystatechange = (e) => {
if (request.readyState == 4 && request.status == 200) {
var endTime = new Date();
var ping = endTime.getTime() - startTime.getTime();
this.setState({ping: ping});
latencies.push(ping);
console.log('ping:', ping);
console.log(latencies);
return ping;
}
};
};
getJitter = () => {
for(var i=0; i<5; i++){
this.getPing();
}
//Get each latency difference
var total1 = 0;
for (var i = 0; i < latencies.lenght; i++) {
if (typeof latencies[i] === 'number') {
console.log(latencies[i]);
total1 += latencies[i + 1] - latencies[i];
console.log(total1);
}
}
var jitter = total1 / (latencies.length - 1);
console.log(jitter); //this comes out as 0
latencies = [];
};
render() {
return (
...
<Button title="Scan" onPress={this.startScan} />
)
};
}
Thanks
Tim
The problem is that the XMLHttpRequest is asynchronous and will allow other code to run while it is trying to fetch the resource. The solution to this is to await each request and then move on to the next block.
I've switched the XMLHttpRequest for the Fetch API. fetch returns a Promise which you can await and wait for it to finish.
class App extends Component {
startScan = () => {
this.setState({
scanning: true,
});
this.getJitter().then(() => {
this.setState({
scanning: false,
});
});
}
getPing = async () => {
const startTime = new Date();
await fetch('http://dummy.restapiexample.com/api/v1/employees');
const endTime = new Date();
const ping = endTime.getTime() - startTime.getTime();
this.setState({ ping });
return ping;
}
getJitter = async () => {
const latencies = [];
for (let i = 0; i < 5; i++){
const ping = await this.getPing();
latencies.push(ping);
}
//Get each latency difference
let total1 = 0;
for (let i = 0; i < latencies.length; i++) {
if (typeof latencies[i] === 'number') {
console.log(latencies[i]);
total1 += latencies[i + 1] - latencies[i];
console.log(total1);
}
}
const jitter = total1 / (latencies.length - 1);
console.log(jitter); //this comes out as 0
};
render() {
return (
<Button title="Scan" onPress={this.startScan} />
)
}
}

NodeJS using a class from another file

I have a java script file that is referencing another javascript file that contains a class using
const Champion = require("./championgg_webscraper_cheerio.js");
I then try to instantiate an object of the class Champion by
var temp = new Champion("hello");
console.log(temp);
And when I do it prints this to the console indicating and undefined variable:
Champion {}
Also when i try to print out the properties of the class I get undefined, I think it might not have access to the most_frequent_completed_build variable.
console.log(temp.most_frequent_completed_build);
Here is a look at the championgg_webscraper_cheerio.js file
function Champion(champName) {
//CHEERIO webscraping
var cheerio = require('cheerio');
//REQUEST http library
var request = require('request');
//url of the champion
var url = "http://champion.gg/champion/Camille/Top?";
var most_frequent_completed_build;
var highest_win_percentage_completed_build;
request(url,
function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var final_build_items = $(".build-wrapper a");
var mfcb = [];
var hwpcb = [];
for (i = 0; i < 6; i++) {
var temp = final_build_items.get(i);
temp = temp.attribs.href;
//slices <'http://leagueoflegends.wikia.com/wiki/> off the href
temp = temp.slice(38);
mfcb.push(temp);
}
for (i = 6; i < 12; i++) {
var temp = final_build_items.get(i);
temp = temp.attribs.href;
//slices <'http://leagueoflegends.wikia.com/wiki/> off the href
temp = temp.slice(38);
hwpcb.push(temp);
}
most_frequent_completed_build = mfcb;
highest_win_percentage_completed_build = hwpcb;
} else {
console.log("Response Error: " + response.statusCode);
}
}
);
};
module.exports = Champion;
I think you want a Function constructor named Champion (a prototype or blue-print like classes in other programming languages like Java).
As an alternative I would suggest you to learn ES6 way of writing classes which is similar to that of Java.
You can achieve that by adding all the variables or methods to the this variable inside the Function Constructor so that you can access them using an object created using the 'new' keyword i.e make them Class members or methods.
In your case,
function Champion(champName) {
//Some code
this.most_frequent_completed_build = NULL;
//Rest of code
}
module.exports = Champion;
Just make sure whenever you try to access Class variables always use this.variable_name like this.most_frequent_completed_build.
So when you create a new object of this Class in main app you will be able to access all Class members and methods.
const Champion = require("./championgg_webscraper_cheerio.js");
var temp = new Champion("hello");
console.log(temp.most_frequent_completed_build);
You are exporting a function
All you have to do is call that function like
var temp = Champion();
You can read more about new keyword here and here
function Champion(champName) {
//CHEERIO webscraping
var cheerio = require('cheerio');
//REQUEST http library
var request = require('request');
//url of the champion
var url = "http://champion.gg/champion/Camille/Top?";
var most_frequent_completed_build;
var highest_win_percentage_completed_build;
request(url,
function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var final_build_items = $(".build-wrapper a");
var mfcb = [];
var hwpcb = [];
for (i = 0; i < 6; i++) {
var temp = final_build_items.get(i);
temp = temp.attribs.href;
//slices <'http://leagueoflegends.wikia.com/wiki/> off the href
temp = temp.slice(38);
mfcb.push(temp);
}
for (i = 6; i < 12; i++) {
var temp = final_build_items.get(i);
temp = temp.attribs.href;
//slices <'http://leagueoflegends.wikia.com/wiki/> off the href
temp = temp.slice(38);
hwpcb.push(temp);
}
most_frequent_completed_build = mfcb;
highest_win_percentage_completed_build = hwpcb;
} else {
console.log("Response Error: " + response.statusCode);
}
}
);
return {most_frequent_completed_build:most_frequent_completed_build};
};
module.exports = Champion;
var temp = new Champion("hello");
console.log(temp.most_frequent_completed_build);

Node is not returning the correct JSON response

I have an object here which I want to return as a response but before that I want to add a couple of fields to it. When I add fields and print the object, it prints the new object but when I send the response, I still get old object before editing.
let applications = await ApplicationHandler.getApplicationOverview(application_id); //I want to edit this one
if (applications == null) {
applications = [];
}
for (let i = 0; i < applications.length; i++) {
let cube_application_id = applications[i].id;
let application_ratings = await Application_Rating.findAll({where: {cube_application_id: cube_application_id}});
let application_ratings_personal = await Application_Rating.findAll({
where: {
cube_application_id: cube_application_id,
user_id: req.user.id
}
});
let total_user_rating = 0;
for (let rating of application_ratings) {
let average_user_rating = (rating.personality_rating + rating.qualification_rating + rating.motivation_rating) / 3;
total_user_rating = total_user_rating + average_user_rating;
}
applications[i].average_rating = total_user_rating / application_ratings.length;
applications[i].recommended = application_ratings_personal.recommended;
if (i == applications.length - 1) {
console.log(applications); // this prints with those 2 fields.
let successRes = {
status: 200,
data: applications
};
return successRes; //however, this returns the one which I had in the first line.
}
}

How to correctly extract text from a pdf using pdf.js

I'm new to ES6 and Promise. I'm trying pdf.js to extract texts from all pages of a pdf file into a string array. And when extraction is done, I want to parse the array somehow. Say pdf file(passed via typedarray correctly) has 4 pages and my code is:
let str = [];
PDFJS.getDocument(typedarray).then(function(pdf) {
for(let i = 1; i <= pdf.numPages; i++) {
pdf.getPage(i).then(function(page) {
page.getTextContent().then(function(textContent) {
for(let j = 0; j < textContent.items.length; j++) {
str.push(textContent.items[j].str);
}
parse(str);
});
});
}
});
It manages to work, but, of course, the problem is my parse function is called 4 times. I just want to call parse only after all 4-pages-extraction is done.
Similar to https://stackoverflow.com/a/40494019/1765767 -- collect page promises using Promise.all and don't forget to chain then's:
function gettext(pdfUrl){
var pdf = pdfjsLib.getDocument(pdfUrl);
return pdf.then(function(pdf) { // get all pages text
var maxPages = pdf.pdfInfo.numPages;
var countPromises = []; // collecting all page promises
for (var j = 1; j <= maxPages; j++) {
var page = pdf.getPage(j);
var txt = "";
countPromises.push(page.then(function(page) { // add page promise
var textContent = page.getTextContent();
return textContent.then(function(text){ // return content promise
return text.items.map(function (s) { return s.str; }).join(''); // value page text
});
}));
}
// Wait for all pages and join text
return Promise.all(countPromises).then(function (texts) {
return texts.join('');
});
});
}
// waiting on gettext to finish completion, or error
gettext("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf").then(function (text) {
alert('parse ' + text);
},
function (reason) {
console.error(reason);
});
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
A bit more cleaner version of #async5 and updated according to the latest version of "pdfjs-dist": "^2.0.943"
import PDFJS from "pdfjs-dist";
import PDFJSWorker from "pdfjs-dist/build/pdf.worker.js"; // add this to fit 2.3.0
PDFJS.disableTextLayer = true;
PDFJS.disableWorker = true; // not availaible anymore since 2.3.0 (see imports)
const getPageText = async (pdf: Pdf, pageNo: number) => {
const page = await pdf.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const pageText = tokenizedText.items.map(token => token.str).join("");
return pageText;
};
/* see example of a PDFSource below */
export const getPDFText = async (source: PDFSource): Promise<string> => {
Object.assign(window, {pdfjsWorker: PDFJSWorker}); // added to fit 2.3.0
const pdf: Pdf = await PDFJS.getDocument(source).promise;
const maxPages = pdf.numPages;
const pageTextPromises = [];
for (let pageNo = 1; pageNo <= maxPages; pageNo += 1) {
pageTextPromises.push(getPageText(pdf, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
return pageTexts.join(" ");
};
This is the corresponding typescript declaration file that I have used if anyone needs it.
declare module "pdfjs-dist";
type TokenText = {
str: string;
};
type PageText = {
items: TokenText[];
};
type PdfPage = {
getTextContent: () => Promise<PageText>;
};
type Pdf = {
numPages: number;
getPage: (pageNo: number) => Promise<PdfPage>;
};
type PDFSource = Buffer | string;
declare module 'pdfjs-dist/build/pdf.worker.js'; // needed in 2.3.0
Example of how to get a PDFSource from a File with Buffer (from node types) :
file.arrayBuffer().then((ab: ArrayBuffer) => {
const pdfSource: PDFSource = Buffer.from(ab);
});
Here's a shorter (not necessarily better) version:
async function getPdfText(data) {
let doc = await pdfjsLib.getDocument({data}).promise;
let pageTexts = Array.from({length: doc.numPages}, async (v,i) => {
return (await (await doc.getPage(i+1)).getTextContent()).items.map(token => token.str).join('');
});
return (await Promise.all(pageTexts)).join('');
}
Here, data is a string or buffer (or you could change it to take the url, etc., instead).
Here's another Typescript version with await and Promise.all based on the other answers:
import { getDocument } from "pdfjs-dist";
import {
DocumentInitParameters,
PDFDataRangeTransport,
TypedArray,
} from "pdfjs-dist/types/display/api";
export const getPdfText = async (
src: string | TypedArray | DocumentInitParameters | PDFDataRangeTransport
): Promise<string> => {
const pdf = await getDocument(src).promise;
const pageList = await Promise.all(Array.from({ length: pdf.numPages }, (_, i) => pdf.getPage(i + 1)));
const textList = await Promise.all(pageList.map((p) => p.getTextContent()));
return textList
.map(({ items }) => items.map(({ str }) => str).join(""))
.join("");
};
If you use the PDFViewer component, here is my solution that doesn't involve any promise or asynchrony:
function getDocumentText(viewer) {
let text = '';
for (let i = 0; i < viewer.pagesCount; i++) {
const { textContentItemsStr } = viewer.getPageView(i).textLayer;
for (let item of textContentItemsStr)
text += item;
}
return text;
}
I wouldn't know how to do it either, but thanks to async5 I did it. I copied his code and updated it to the new version of pdf.js.
I made minimal corrections and also took the liberty of not grouping all the pages into a single string. In addition, I used a regular expression that removes many of the empty spaces that PDF unfortunately ends up creating (it does not solve all cases, but the vast majority).
The way I did it should be the way that most will feel comfortable working, however, feel free to remove the regex or make any other changes.
// pdf-to-text.js v1, require pdf.js ( https://mozilla.github.io/pdf.js/getting_started/#download )
// load pdf.js and pdf.worker.js
function pdfToText(url, separator = ' ') {
let pdf = pdfjsLib.getDocument(url);
return pdf.promise.then(function(pdf) { // get all pages text
let maxPages = pdf._pdfInfo.numPages;
let countPromises = []; // collecting all page promises
for (let i = 1; i <= maxPages; i++) {
let page = pdf.getPage(i);
countPromises.push(page.then(function(page) { // add page promise
let textContent = page.getTextContent();
return textContent.then(function(text) { // return content promise
return text.items.map(function(obj) {
return obj.str;
}).join(separator); // value page text
});
}));
};
// wait for all pages and join text
return Promise.all(countPromises).then(function(texts) {
for(let i = 0; i < texts.length; i++){
texts[i] = texts[i].replace(/\s+/g, ' ').trim();
};
return texts;
});
});
};
// example of use:
// waiting on pdfToText to finish completion, or error
pdfToText('files/pdf-name.pdf').then(function(pdfTexts) {
console.log(pdfTexts);
// RESULT: ['TEXT-OF-PAGE-1', 'TEXT-OF-PAGE-2', ...]
}, function(reason) {
console.error(reason);
});

Can't get list filled from a promise and use it in html - aurelia

I can't get the finalList filled to use in my html file, it wil run the code to fill it before the promise all code. I need to use this array in my html document so it has to be a this.variable I am using Aurelia.
activate() {
var repoList = [];
var repos = this.http.fetch({Link to github api})
.then(response => response.json())
.then(repos => this.repos = repos);
var trello = new Trello;
trello.getBoards().then(boardList => this.boards = boardList);
var boards = trello.getBoards();
//add github repo to the associated trello board (works)
Promise.all([boards, repos]).then(function(values) {
var count = 0;
for (var i in values[0]) {
for (var a in values[1]) {
if (values[1][a].hasOwnProperty("name")) {
var repo = values[1][a].name.toLowerCase();
var board = values[0][i]['name'].toLowerCase();
repoList[count] = repo;
count++;
if (repo == board) {
console.log(repo + " " + board)
}
}
}
}
});
//this list is always empty (The problem)
this.finalList = repoList;
this.title = "Trello Boards";
}
Something like this should do it. Hard to decipher what's going on in the for loops.
activate() {
let reposPromise = this.http.fetch({Link to github api})
.then(response => response.json());
let boardsPromise = new Trello().getBoards();
return Promise.all([boardsPromise, reposPromise])
.then(([boards, repos]) => {
this.boards = boards;
this.repos = repos;
this.finalList = [];
for (var i in boards) {
for (var a in repos) {
if (values[1][a].hasOwnProperty("name")) {
var repo = values[1][a].name.toLowerCase();
var board = values[0][i]['name'].toLowerCase();
this.finalList.push(repo);
if (repo == board)
{
console.log(repo + " " + board)
}
}
}
}
});
this.title = "Trello Boards";
}
I believe Your finalList should be set inside the promise handler. Like this.
activate() {
var repoList = [];
//I always use this, and I am not sure what do you mean
//by this.finalList, but still I assume you know what you are doing
//And hence I use this!
var that = this;
var repos = this.http.fetch({Link to github api})
.then(response => response.json())
.then(repos => this.repos = repos);
var trello = new Trello;
trello.getBoards().then(boardList => this.boards = boardList);
var boards = trello.getBoards();
//add github repo to the associated trello board (works)
Promise.all([boards, repos]).then(function(values) {
var count = 0;
for (var i in values[0]) {
for (var a in values[1]) {
if (values[1][a].hasOwnProperty("name"))
{
var repo = values[1][a].name.toLowerCase();
var board = values[0][i]['name'].toLowerCase();
repoList[count] = repo;
count++;
if (repo == board)
{
console.log(repo + " " + board)
};
}
};
};
//I believe when promise resolves. You should set the repoList.
that.finalList = repoList;
that.title = "Trello Boards";
});
}
My question is, do you really wanna set title and finalList to this? Just asking.
Hope this helps!

Categories

Resources