How can I put the crawled result values into the `cafecrawlling` array?
// Give the cafe page (and its inner iframe) time to finish loading.
await page.waitForTimeout(5000); // pass milliseconds as a number, not a string
console.log('crawlling start');

// Scrape the post list that is rendered inside the #cafe_main iframe and
// return one object per post: { postname, postlink, nickname, postdate }.
const cafecrawlling = await page.evaluate(() => {
  // The board lives inside an iframe; resolve its document once instead of
  // re-querying `#cafe_main` for every column.
  const frameDoc = document.querySelector('#cafe_main').contentDocument;

  const articles = Array.from(
    frameDoc.querySelectorAll('tr > td:first-of-type .board-list .inner_list .article')
  );
  const postNames = articles.map((v) => v.textContent.replace(/\s+/g, '')); // title
  const postLinks = articles.map((v) => v.href); // link
  const nicknames = Array.from(
    frameDoc.querySelectorAll('tr > td:nth-of-type(2) .p-nick .m-tcol-c')
  ).map((v) => v.textContent); // nickname
  const dates = Array.from(
    frameDoc.querySelectorAll('tr > td:nth-of-type(3)')
  ).map((v) => v.textContent); // date

  // Zip the four parallel lists into the result array that gets returned
  // to Node (the old unused `const cafecrawlling = []` inside evaluate is
  // gone — the return value is what populates the outer constant).
  return postNames.map((postname, i) => ({
    postname,
    postlink: postLinks[i],
    nickname: nicknames[i],
    postdate: dates[i],
  }));
});
The code that scrapes each post's contents afterwards is:
// Visit each scraped post in a dedicated tab and attach its body text.
const articlePage = await browser.newPage();
for (const article of cafecrawlling) {
  // Fixed: the scraped objects store the URL under `postlink`, not `href`,
  // so `article.href` was always undefined.
  await articlePage.goto(article.postlink);
  const content = await articlePage.$eval('#postContent', (element) => element.innerText);
  article.content = content;
}
Related
I've created a script using puppeteer to scrape the links of different authors from a webpage traversing multiple pages triggering click on the next page button. The script appears to be working in the right way.
Although the content of this site is static, I intentionally used puppeteer within the following script only to learn as to how I can parse content from inner pages.
Given that I wish to go one layer deep to scrape description from such pages. How can I achieve that?
const puppeteer = require('puppeteer');
/**
 * Scrapes author links and quote texts from quotes.toscrape.com, paging
 * through listing pages via the "next" button.
 *
 * @param {number} [pagesToScrape=1] - How many listing pages to visit.
 * @returns {Promise<Array<{authorUrl: string, title: string}>>}
 */
async function run(pagesToScrape = 1) {
  // An async function already returns a promise — wrapping the body in
  // `new Promise(async (resolve, reject) => ...)` was the explicit-
  // construction anti-pattern and has been removed.
  const browser = await puppeteer.launch({ headless: false });
  try {
    const [page] = await browser.pages();
    await page.goto("https://quotes.toscrape.com/");
    let urls = [];
    for (let currentPage = 1; currentPage <= pagesToScrape; currentPage++) {
      const newUrls = await page.evaluate(() => {
        const results = [];
        document.querySelectorAll('[class="quote"]').forEach((item) => {
          results.push({
            authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
            title: item.querySelector("span.text").innerText,
          });
        });
        return results;
      });
      urls = urls.concat(newUrls);
      if (currentPage < pagesToScrape) {
        // These steps are inherently sequential (find link, click, wait
        // for the new page's quotes) — the original wrapped already-
        // awaited values in Promise.all, which did nothing.
        await page.waitForSelector('li.next > a');
        await page.click('li.next > a');
        await page.waitForSelector('[class="quote"]');
      }
    }
    return urls;
  } finally {
    // Close the browser even when scraping throws (the original leaked
    // the browser on error, and never awaited close()).
    await browser.close();
  }
}
run(3).then(console.log).catch(console.error);
I would go this way:
const puppeteer = require('puppeteer');
// Kept in outer scope so the trailing .finally() can close the browser
// even when an error aborts main() part-way through.
let browser;
(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  // Reuse the tab puppeteer opens for the quote listing, and open a second
  // tab that is only ever used for the author "about" pages, so the
  // listing tab never loses its pagination state.
  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.
  const pagesToScrape = 3;
  await pageQuotes.goto('https://quotes.toscrape.com/');
  let currentPage = 1;
  // quotes: author name -> list of quote texts; abouts: author title -> bio.
  const data = { quotes: {}, abouts: {} };
  // Authors repeat across pages; remember which bios were already fetched.
  const visitedAbouts = new Set();
  while (currentPage <= pagesToScrape) {
    await pageQuotes.waitForSelector('.quote');
    // Collect the quote texts and the "about" links in one round-trip
    // into the page context.
    const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
      quotes: Array.from(
        document.querySelectorAll('.quote'),
        quote => [quote.querySelector('small.author').innerText, quote.innerText],
      ),
      aboutURLs: Array.from(
        document.querySelectorAll('.quote small.author + a[href]'),
        quote => quote.href,
      ),
    }));
    for (const [author, quote] of quotes) {
      if (data.quotes[author] === undefined) data.quotes[author] = [];
      data.quotes[author].push(quote);
    }
    for (const aboutURL of aboutURLs) {
      if (!visitedAbouts.has(aboutURL)) {
        visitedAbouts.add(aboutURL);
        // Fetch the bio in the second tab; the listing tab keeps its place.
        await pageAbout.goto(aboutURL);
        await pageAbout.waitForSelector('div.author-details');
        const { title, about } = await pageAbout.evaluate(() => ({
          title: document.querySelector('div.author-details h3.author-title').innerText,
          about: document.querySelector('div.author-details').innerText,
        }));
        data.abouts[title] = about;
      }
    }
    if (currentPage < pagesToScrape) {
      // Start the click and the navigation wait together so the
      // navigation event cannot be missed between the two calls.
      const nextLink = await pageQuotes.waitForSelector('li.next > a');
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
    currentPage++;
  }
  console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });
I am stuck creating a new trigger that sends only one email the next day, after an order's status changes to 'DELIVERED' — that is, a query in the sendStatusEmails trigger that retrieves orders by status and id. Can anyone help with this problem or give an example?
Here's the cron job, dailyEmails.js:
module.exports = functions.pubsub
.schedule("0 0 12 * *")
.timeZone("")
.onRun(async () => {
await checkUnsentOrders();
await gameMailUnsentOrders();
await sendStatusEmails();
]);
});
Here is an example of the trigger, gameMailUnsentOrders.js:
// Sends one status-reminder email for a single gamer's batch of orders.
// Returns early (undefined) when the gamer has no reminder email on file.
const processGamerStatusReminderEmail = async (gamerOrders: OrderData[]) => {
  const firstOrder = _.head(gamerOrders);
  // Fixed: the original read `firstGameOrder` for the id, a name that is
  // not defined anywhere in this module — it must be `firstOrder`.
  const gamerName = firstOrder?.gamerName ?? "";
  const gamerId = firstOrder?.gamerId ?? "";
  // .data() is synchronous; only the .get() needs awaiting (the original
  // double-awaited for no effect).
  const gamerData = (await gamer.doc(gamerId).get()).data();
  const gamerReminderEmail = gamerData?.reminderEmail;
  if (gamerReminderEmail === "" || !gamerReminderEmail) {
    console.log(` ${gamerName} email is not finded`);
    return;
  }
  let rows = gamerOrders.map((doc: OrderData) => `...`);
  // FIXME(review): `status` is not defined in this scope — it presumably
  // needs to come from the order (e.g. firstOrder.gameStatus); confirm.
  let view = `<strong>Status: </strong> ${status}(
`;
  return await sentGameHtmlEmail(gamerReminderEmail);
};
// Finds today's game orders that are not yet in a finalized status and
// emails each gamer once with all of their outstanding orders.
module.exports = async () => {
  const startDate = startOfDate();
  const endDate = endOfDate();
  const dateBefore = dateBeforeGameDay();
  // For games happening on the day before, DELIVERED also counts as final.
  const finalizedStatusDayBefore = [
    ...finalisedOrdersStatusIds,
    constants.DELIVERED.id,
  ];
  const listGameOrders = await getGameOrdersMailing(startDate, endDate);
  const ordersStatus = listGameOrders.filter((gameOrder) => {
    const gameDate = DateTime.fromJSDate(gameOrder.gamePDate).toJSDate();
    if (dateBefore.getDay() === gameDate.getDay()) {
      return !finalizedStatusDayBefore.includes(gameOrder.gameStatus.id);
    }
    return !finalisedOrdersStatusIds.includes(gameOrder.gameStatus.id);
  });
  // Fixed: the original only sorted by gamerId and then mapped over
  // individual orders, passing a single OrderData to a callback typed
  // OrderData[]. Group the orders per gamer instead, so each gamer gets
  // exactly one email covering all of their orders.
  const ordersByGamer = _.groupBy(ordersStatus, "gamerId");
  return await Promise.all(
    _.map(ordersByGamer, (gameOrders: OrderData[]) =>
      // Fixed: was `processGamerReminderEmail`, which does not exist; the
      // helper defined above is `processGamerStatusReminderEmail`.
      processGamerStatusReminderEmail(gameOrders)
    )
  );
};
I am trying to read an array from an API.
tried using the code
const names_2 = await page.evaluate(() => Array.from(document.querySelectorAll('.mainDiv > Departure'), Departure => Departure.innerText));
But with no luck
Here is my input
const puppeteer = require('puppeteer');
// Launch a headless browser, load the JSON endpoint, and parse the raw
// response text that the browser renders inside a <pre> element.
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://xmlopen.rejseplanen.dk/bin/rest.exe/multiDepartureBoard?id1=8600646&format=json');
  const result = await page.evaluate(() => {
    const rawBody = document.getElementsByTagName("pre")[0].innerText;
    return { temperature: JSON.parse(rawBody) };
  });
  console.log(result);
})();
This is my output
{
temperature: {
MultiDepartureBoard: {
noNamespaceSchemaLocation: 'http://xmlopen.rejseplanen.dk/xml/rest/hafasRestMultiDepartureBoard.xsd',
Departure: [Array]
}
}
}
What you are doing here doesn't make sense. Simply request the data directly:
const rp = require('request-promise');
// Fetch the departure board as plain JSON over HTTP — no browser needed.
rp.get({
  uri: 'http://xmlopen.rejseplanen.dk/bin/rest.exe/multiDepartureBoard?id1=8600646&format=json',
  json: true, // parse the response body as JSON automatically
})
  .then(res => res.MultiDepartureBoard.Departure)
  // Fixed: `.map` directly on the promise only works because
  // request-promise returns a Bluebird promise; iterate the resolved
  // array instead so this also works with native promises.
  .then(departures => departures.forEach(e => console.log(e)))
  .catch(console.error); // never leave the rejection unhandled
I'm working on a crawler using Node.js and Puppeteer. My goal is to get the data from two columns in a table (date and description). The code works fine until the block that gets the data from the columns...
Full code below, include the url for the page i'm crawling:
const fs = require('fs');
const puppeteer = require('puppeteer');
const urlConsulta = "http://www.tre-pr.jus.br/";
const numeroProcessoSeq = "000000889";
const numeroProcessoAno = "2014";
const numeroProcessoDigito = "6160047";
// Promise-based sleep helper (milliseconds). The rejection callback was
// never used, so it has been dropped.
const wait = ms => new Promise(resolve => setTimeout(resolve, ms));

void (async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false
    });
    const page = await browser.newPage();
    await page.goto(urlConsulta);
    await page.select('#acao', 'pesquisarNumUnico');
    // Fill the three parts of the case number directly in the page.
    await page.evaluate((numeroProcessoSeq, numeroProcessoAno, numeroProcessoDigito) => {
      document.getElementById('numUnicoSequencial').value = numeroProcessoSeq;
      document.getElementById('numUnicoAno').value = numeroProcessoAno;
      document.getElementById('numUnicoOrigem').value = numeroProcessoDigito;
    }, numeroProcessoSeq, numeroProcessoAno, numeroProcessoDigito);
    await page.$eval('form[action*="http://www.tre-pr.jus.br/##processrequest"]', form => form.submit());
    await page.waitForNavigation();
    // page.frames() is synchronous, so no `await` is needed on the lookup.
    // (A second, discarded frames().find call after the click did nothing
    // and has been removed.)
    const frame = page.frames().find(f => f.name() === 'ifr_servicos');
    await frame.click('a[href*="ExibirDadosProcesso"]');
    await wait(10000);
    await frame.click('[name*="todos"]');
    await frame.$eval('[name*="ExibirPartesProcessoZona"]', form => form.submit());
    await wait(10000);
    const string = await buscaFases(frame);
    fs.writeFile("teste.txt", string, function(err) {
      if (err) {
        return console.log(err);
      }
      console.log("The file was saved!");
    });
    console.log(string);
    await wait(10000);
    await browser.close();
  } catch (error) {
    console.log(error);
  }
})();
/**
 * Extracts (date, description) pairs from the case-history table inside
 * the services frame.
 *
 * @param frame Puppeteer frame containing the results page.
 * @returns {Promise<string>} JSON string of [{ data, descricao }, ...].
 */
async function buscaFases(frame) {
  return await frame.evaluate(() => {
    let div = document.querySelector('div[id*="conteudo"]');
    let rowns = Array.from(div.children[4].children[0].children);
    // Some rows (e.g. ones whose td spans both columns) lack the two
    // expected cells; filter them out first so querySelector never
    // returns null in the map below.
    let movimentosInfo = rowns
      .filter(row =>
        row.querySelector("tr td:first-child") &&
        row.querySelector("tr td:first-child + td"))
      .map(row => {
        let data = row.querySelector("tr td:first-child").textContent;
        let descricao = row.querySelector("tr td:first-child + td").textContent;
        return { data, descricao };
      });
    return JSON.stringify(movimentosInfo);
  });
};
The specific lines that get the data:
let data = row.querySelector("tr td:first-child").textContent;
let descricao = row.querySelector("tr td:first-child + td").textContent;
The problem is that not all `tr` elements have the child elements you are expecting. This might be because of a `td` tag with a `colspan`. So you should first filter your array to remove those other elements.
Code
Change your lines including your map function beginning from let movimentosInfo = ... to this:
// Keep only rows that actually contain both cells, then project each
// surviving row into a { data, descricao } record.
let movimentosInfo = rowns
  .filter((row) => {
    const dateCell = row.querySelector("tr td:first-child");
    const descCell = row.querySelector("tr td:first-child + td");
    return dateCell && descCell;
  })
  .map((row) => ({
    data: row.querySelector("tr td:first-child").textContent,
    descricao: row.querySelector("tr td:first-child + td").textContent,
  }));
This adds a filter function which tests whether the desired elements do exist before mapping their content.
I am trying to scrape the website below and I am not getting the value of the attribute 'data-link'.
http://www.apptrace.com/itunes/charts/FRA/topfreeapplications/36/2018-12-27
Could someone help me?
//attempt #1 (error)
const puppeteer = require('puppeteer')
// NOTE(review): this attempt throws. document.getElementById expects a
// bare id ("current_storefront_list", no leading '#') and so returns
// null here; and getAttribute takes an attribute NAME and returns a
// string (or null), not a list of elements, so .forEach is not a
// function either way.
let scrape = async () => {
  const browser = await puppeteer.launch({headless: true})
  const page = await browser.newPage()
  await page.goto('http://www.apptrace.com/itunes/charts/USA/topfreeapplications/36')
  await page.waitFor(1000)
  const countryCharts = await page.evaluate(() => {
    const abbrAppsCountry = []
    document.getElementById('#current_storefront_list')
      .getAttribute('li > a[data-link]')
      .forEach(app => abbrAppsCountry.push(app.value))
    return abbrAppsCountry
  })
  browser.close()
  return countryCharts
}
scrape().then((value) => {
  console.log(value)
})
//attempt #2 (array of nulls)
const puppeteer = require('puppeteer')
// NOTE(review): the selector is fine here, but anchor elements have no
// `value` property, so `app.value` is empty for every match — hence the
// array of nulls. Read `app.dataset.link` or
// `app.getAttribute('data-link')` instead, as the answer below shows.
let scrape = async () => {
  const browser = await puppeteer.launch({headless: true})
  const page = await browser.newPage()
  await page.goto('http://www.apptrace.com/itunes/charts/USA/topfreeapplications/36')
  await page.waitFor(1000)
  const countryCharts = await page.evaluate(() => {
    const abbrAppsCountry = []
    document.querySelectorAll('#current_storefront_list > li > a[data-link]')
      .forEach(app => abbrAppsCountry.push(app.value))
    return abbrAppsCountry
  })
  browser.close()
  return countryCharts
}
scrape().then((value) => {
  console.log(value)
})
I would like to get the abbreviation of country names.
You can use dataset or getAttribute APIs:
// Read the data-* attribute through the element's dataset map
// (the attribute data-link is exposed as dataset.link).
document.querySelectorAll('#current_storefront_list > li > a')
  .forEach(app => abbrAppsCountry.push(app.dataset.link))
Or:
// Or read the raw attribute string directly with getAttribute.
document.querySelectorAll('#current_storefront_list > li > a')
  .forEach(app => abbrAppsCountry.push(app.getAttribute('data-link')))