I'm web-scraping a site and managed to extract the data I need inside a for loop.
However, I don't know how to save it to my MongoDB database, as I'm receiving the error ReferenceError: nameElement is not defined.
How can I save the results from my for loop as an object to my database?
const kclResults = [];

async function scrapeInfiniteScrollItems(page, scrollDelay = 10000) {
  try {
    const html = await page.content();
    const $ = cheerio.load(html);
    await page.evaluate(() => {
      let elements = $("[role='listitem']")
        .find("._2DX0iPG8PDF3Si_o5PlzIj")
        .toArray();
      for (i = 0; i < elements.length; i++) {
        $(elements[i]).click();
        const nameElement = $("[data-log-name='PersonName']").text();
        const emailElement = $("[data-log-name='Email']").text();
        const allElements = $("[aria-label='Contact information']").text();
        const officeLocation = $("[data-log-name='OfficeLocation']").text();
        const position = $("[data-log-name='Company']").text();
        const jobTitle = $("[data-log-name='JobTitle']").text();
        const departament = $("[data-log-name='Department']").text();
        console.log(
          `email: ${emailElement} name: ${nameElement} allElements: ${allElements} \n office location: ${officeLocation} \n position: ${position} \n jobTitle: ${jobTitle} \n departament: ${departament}`
        );
      }
    });
    let kclResult = new KingsDB({
      nameElement,
      emailElement,
      allElements,
      officeLocation,
      position,
      jobTitle,
      departament,
    });
    kclResults.push(kclResult);
    console.log(kclResults);
    kclResult.save();
    return kclResults;
  } catch (error) {
    console.log(error);
  }
}
You are declaring nameElement (and the other variables) inside the for-loop scope and then trying to access them outside that scope.
Instead, build an array of result objects inside the loop and iterate over it when you write to your DB. The code below should work:
const kclResults = [];

async function scrapeInfiniteScrollItems(page, scrollDelay = 10000) {
  try {
    const html = await page.content();
    const $ = cheerio.load(html);
    const resultArr = await page.evaluate(() => {
      let elements = $("[role='listitem']")
        .find("._2DX0iPG8PDF3Si_o5PlzIj")
        .toArray();
      const resultArr = [];
      for (let i = 0; i < elements.length; i++) {
        $(elements[i]).click();
        const nameElement = $("[data-log-name='PersonName']").text();
        const emailElement = $("[data-log-name='Email']").text();
        const allElements = $("[aria-label='Contact information']").text();
        const officeLocation = $("[data-log-name='OfficeLocation']").text();
        const position = $("[data-log-name='Company']").text();
        const jobTitle = $("[data-log-name='JobTitle']").text();
        const departament = $("[data-log-name='Department']").text();
        resultArr.push({
          nameElement,
          emailElement,
          allElements,
          officeLocation,
          position,
          jobTitle,
          departament
        });
        console.log(
          `email: ${emailElement} name: ${nameElement} allElements: ${allElements} \n office location: ${officeLocation} \n position: ${position} \n jobTitle: ${jobTitle} \n departament: ${departament}`
        );
      }
      return resultArr;
    });

    const kclResults = [];
    for (let result of resultArr) {
      const {
        nameElement,
        emailElement,
        allElements,
        officeLocation,
        position,
        jobTitle,
        departament
      } = result;
      let kclResult = new KingsDB({
        nameElement,
        emailElement,
        allElements,
        officeLocation,
        position,
        jobTitle,
        departament,
      });
      kclResults.push(kclResult);
      console.log(kclResults);
      await kclResult.save();
    }
    return kclResults;
  } catch (error) {
    console.log(error);
  }
}
PS: The function passed to page.evaluate runs in the browser context and thus doesn't have access to your Node variables unless they are explicitly passed as arguments.
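For example, a minimal sketch of passing a Node-side value into page.evaluate (the selector here is just a placeholder):

const selector = "[role='listitem']"; // defined in Node
const count = await page.evaluate((sel) => {
  // runs in the browser and only sees what is passed in as an argument
  return document.querySelectorAll(sel).length;
}, selector);
console.log(count);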
Related
How can I retrieve the verification code? Here I try to apply a regex and then trim the match, but an error message appears: "TypeError: Cannot read properties of undefined (reading 'trim')".
I just want to fetch the verification code, like in the image.
My code:
const checkInboxUrl = 'https://getnada.com/api/v1/inboxes/';
const getMessageUrl = 'https://getnada.com/api/v1/messages/html/';
const refreshMailboxUrl = 'https://getnada.com/api/v1/u/';

/* eslint-disable no-unused-vars */
class Getnada {
  constructor() {
    this.email = '';
    this.verificationCode = '';
  }

  async getEmail(email = 'urmxhbwrz#getnada.com') {
    this.email = email;
    return this;
  }

  async getMailbox(pattern, sleepTime = 5000) {
    await sleep(sleepTime);
    const timestamp = Math.floor(new Date().getTime() / 1000);
    const refreshMailboxResponse = await fetch(refreshMailboxUrl + this.email + '/' + timestamp);
    const checkInboxResponse = await fetch(checkInboxUrl + this.email);
    const checkInboxJson = await checkInboxResponse.json();
    const getMessageResponse = await fetch(getMessageUrl + checkInboxJson.msgs[0].uid);
    const readInbox = await getMessageResponse.text();
    const regex = new RegExp(pattern);
    const verificationCodeMatch = regex.exec(readInbox);
    this.verificationCode = verificationCodeMatch[1].trim();
    console.log(verificationCodeMatch)
    return this;
  }
}

const getnada = new Getnada();

async function main() {
  console.log((await getnada.getEmail()))
  console.log((await getnada.getMailbox()))
}
main();
https://getnada.com/api/v1/messages/html/8lra5CwOQcHvja3mpQZgO7G5RPTS3W
To retrieve the verification code, you can try to change these lines:
const regex = new RegExp(pattern);
const verificationCodeMatch = regex.exec(readInbox);
this.verificationCode = verificationCodeMatch[1].trim();
to:
const verificationCodeMatch = pattern.exec(readInbox);
this.verificationCode = verificationCodeMatch[0].trim();
And change this line too:
console.log((await getnada.getMailbox()))
to:
console.log((await getnada.getMailbox(/\b\d{6,6}\b/)));
The regex /\b\d{6,6}\b/ matches a string of exactly 6 digits, which is the verification code.
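For example, with a made-up message body, exec returns the whole match at index 0:

const readInbox = 'Your verification code is 123456. It expires soon.'; // hypothetical body
const verificationCodeMatch = /\b\d{6,6}\b/.exec(readInbox);
console.log(verificationCodeMatch[0].trim()); // "123456"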
I am trying to transfer SPL tokens and am getting the following error from the function
mintToken.getOrCreateAssociatedAccountInfo(wallet.publicKey);
Error: Invalid seeds, address must fall off the curve
My wallet variable is an AnchorWallet. toWallet is obtained via:
var toWallet = anchor.web3.Keypair.fromSecretKey(DEMO_TO_WALLET);
try {
  if (wallet) {
    const mintPublicKey = new anchor.web3.PublicKey("Token address");
    const mintToken = new Token(
      props.connection,
      mintPublicKey,
      TOKEN_PROGRAM_ID,
      toWallet
    );
    const fromTokenAccount = await mintToken.getOrCreateAssociatedAccountInfo(
      wallet.publicKey
    );
    const destPublicKey = new anchor.web3.PublicKey(toWallet);

    // Get the derived address of the destination wallet which will hold the custom token
    const associatedDestinationTokenAddr = await Token.getAssociatedTokenAddress(
      mintToken.associatedProgramId,
      mintToken.programId,
      mintPublicKey,
      destPublicKey
    );
    const receiverAccount = await props.connection.getAccountInfo(associatedDestinationTokenAddr);
    const instructions: anchor.web3.TransactionInstruction[] = [];
    if (receiverAccount === null) {
      instructions.push(
        Token.createAssociatedTokenAccountInstruction(
          mintToken.associatedProgramId,
          mintToken.programId,
          mintPublicKey,
          associatedDestinationTokenAddr,
          destPublicKey,
          wallet.publicKey
        )
      )
    }
    instructions.push(
      Token.createTransferInstruction(
        TOKEN_PROGRAM_ID,
        fromTokenAccount.address,
        associatedDestinationTokenAddr,
        wallet.publicKey,
        [],
        1
      )
    );
    const transaction = new anchor.web3.Transaction().add(...instructions);
    transaction.feePayer = wallet.publicKey;
    transaction.recentBlockhash = (await props.connection.getRecentBlockhash()).blockhash;
    const transactionSignature = await props.connection.sendRawTransaction(
      transaction.serialize(),
      { skipPreflight: true }
    );
    await props.connection.confirmTransaction(transactionSignature);
Please ensure that wallet.publicKey contains a valid value.
console.log(wallet.publicKey); // I think this might be an empty string.
const fromTokenAccount = await mintToken.getOrCreateAssociatedAccountInfo(
  wallet.publicKey
);
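A minimal sketch of that kind of guard, assuming the same surrounding code (the error message text is just an example):

if (!wallet || !wallet.publicKey) {
  // Bail out before deriving the associated token account if the wallet isn't connected yet
  throw new Error('Wallet is not connected or has no public key');
}
const fromTokenAccount = await mintToken.getOrCreateAssociatedAccountInfo(
  wallet.publicKey
);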
I want to find the biggest change rate in the prices in my coins data.
For example, if the SDT old price is 1.00 and the new price is 1.02, that is 2%.
If it is the biggest change rate among the coins, my script should print it.
But the script doesn't work; it only keeps giving me the same coin.
const math = require('mathjs');
const fetch = require('node-fetch');

get()

async function get() {
  const response = await fetch("https://trade.kucoin.com/_api/currency/prices?base=USD&targets=&lang=en_US");
  const coin1 = await response.json();
  const olddata = coin1.data
  const tokens = Object.keys(olddata)
  const oldprice = Object.values(olddata)

  get1()

  async function get1() {
    const response = await fetch("https://trade.kucoin.com/_api/currency/prices?base=USD&targets=&lang=en_US");
    const coin2 = await response.json();
    const newdata = coin2.data
    const tokens = Object.keys(newdata)
    const newprice = Object.values(newdata)

    function findLargestDifference() {
      var large = null;
      var index = 0;
      for (var i = 0; i < oldprice.length; i++) {
        var change = tokens[i].newprice / oldprice[i].oldprice;
        if (change > large) {
          large = change;
          index = i;
        }
      }
      console.log(tokens[index])
      return tokens[index];
    }
    findLargestDifference()
  }
}
Here is how the data looks: https://prnt.sc/19syjjg
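For reference, a minimal sketch of the change-rate calculation described above, assuming olddata and newdata are plain objects mapping each token symbol to a price (findLargestChange is a hypothetical helper, not part of the original script):

// Hypothetical helper: assumes olddata/newdata look like { SDT: 1.00, BTC: 60000, ... }
function findLargestChange(olddata, newdata) {
  let best = null;
  for (const token of Object.keys(olddata)) {
    const oldPrice = Number(olddata[token]);
    const newPrice = Number(newdata[token]);
    if (!oldPrice || !newPrice) continue; // skip tokens with missing or zero prices
    const changePct = (newPrice / oldPrice - 1) * 100; // 1.00 -> 1.02 gives 2
    if (best === null || changePct > best.changePct) {
      best = { token, changePct };
    }
  }
  return best;
}

// e.g. findLargestChange({ SDT: 1.00, ABC: 2.00 }, { SDT: 1.02, ABC: 2.00 })
// -> SDT with a change of about 2%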
How would I scrape content from multiple URLs using Puppeteer?
I've created a loop, but I'm only seeing the results for the first URL.
I suspect it's something to do with where I declare the results variable, but I've had no luck so far. Does anyone know how to do this?
const puppeteer = require('puppeteer');

function run() {
  return new Promise(async (resolve, reject) => {
    try {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];

      for (let i = 0; i < urls.length; i++) {
        const url = urls[i];
        await page.goto(url);

        let products = await page.evaluate(() => {
          let product = document.querySelector('h1[itemprop=name]').innerText;
          let results = [];
          let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
          items.forEach((element) => {
            let size = element.getAttribute('for');
            let stockLevel = "";
            let nearest_td = element.closest('td');
            if (nearest_td.classList.contains('low-stock')) {
              stockLevel = "Low stock"
            } else if (nearest_td.classList.contains('out-of-stock')) {
              stockLevel = "Out of stock"
            } else {
              stockLevel = "In stock"
            }
            results.push({
              product: product,
              size: size,
              stock: stockLevel
            })
          });
          return results
        })

        browser.close();
        return resolve(products);
      }
    } catch (e) {
      return reject(e);
    }
  })
}

run().then(console.log).catch(console.error);
These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return from the function. You should move this out of your for loop and store the products in an array, like this:
const urls = /* ... */;
const productsList = [];

for (let i = 0; i < urls.length; i++) {
  const url = urls[i];
  await page.goto(url);
  let products = await page.evaluate(/* ... */);
  productsList.push(products);
}

browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).
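For what it's worth, a minimal puppeteer-cluster sketch of that parallel approach (the URLs and the extraction body here are placeholders, not the code from the question):

const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT, // one browser context per worker
    maxConcurrency: 2,
  });

  const results = [];
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url);
    // run the same kind of page.evaluate extraction here and collect the result
    const products = await page.evaluate(() => document.title); // placeholder extraction
    results.push({ url, products });
  });

  cluster.queue('https://example.com/product-1'); // placeholder URLs
  cluster.queue('https://example.com/product-2');

  await cluster.idle();  // wait until all queued jobs are done
  await cluster.close();
  console.log(results);
})();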
Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is that when implementing the for loop, only the first iteration of j is returned.
My assumption is that the return statement is exiting the loop, but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');

const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';

async function getEpisodes(searchTerm) {
  //const imdbID = await getID(searchTerm);
  //const numSeasons = await getSeasons(imdbID);
  const imdbID = 'tt0903747';
  const numSeasons = 5;
  const episodes = [];

  for (let j = 1; j <= numSeasons; j++) {
    return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
      .then(response => response.text())
      .then(body => {
        const $ = cheerio.load(body);
        $('div[itemProp="episodes"]').each(function (i, element) {
          const airdate = $(element).find('.airdate').text().trim();
          const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
          const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
          const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
          episode = {
            season: j,
            episodeTitle,
            airdate,
            votes,
            rating
          };
          episodes.push(episode);
        });
        return episodes; // Only season 1 is returned.
      });
  }
}
Let's rewrite the function in async/await style. This way we make sure we fire fetch numSeasons times, await all of the requests, and then process the responses one by one.
async function processResponse(response, season) {
  const body = await response.text();
  const $ = cheerio.load(body);
  let episodes = [];
  $('div[itemProp="episodes"]').each(function (i, element) {
    const airdate = $(element).find('.airdate').text().trim();
    const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
    const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
    const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
    const episode = {
      season,
      episodeTitle,
      airdate,
      votes,
      rating
    };
    episodes.push(episode);
  });
  return episodes;
}

async function getEpisodes(searchTerm) {
  //const imdbID = await getID(searchTerm);
  //const numSeasons = await getSeasons(imdbID);
  const imdbID = 'tt0903747';
  const numSeasons = 5;

  let promises = [];
  for (let j = 1; j <= numSeasons; j++) {
    promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
  }
  const responses = await Promise.all(promises);

  // reduce can't await inside a synchronous callback, so collect the results with a plain loop
  let episodes = [];
  for (let index = 0; index < responses.length; index++) {
    episodes = episodes.concat(await processResponse(responses[index], index + 1));
  }
  return episodes;
}