how to recognize two images using tesseract.js? - javascript

Here is my code. I can recognize only one image using tesseract.js (i.e. filepath1), but I want to recognize both images (filepath1 and filepath2). How can I achieve that?
app.post("/", (req, res) => {
  const form = formidable({ multiples: true });
  form.parse(req, (err, fields, files) => {
    const filepath1 = files.file1.filepath;
    const filepath2 = files.file2.filepath;
    const worker = createWorker({
      logger: m => console.log(m)
    });
    (async () => {
      await worker.load();
      await worker.loadLanguage('eng');
      await worker.initialize('eng');
      const { data: { text } } = await worker.recognize(filepath1);
      const replacedtext1 = text.replace(/[^a-zA-Z ]/g, "");
      const wordextract1 = replacedtext1.match(/\w+/g);
      fileextract1.push(wordextract1);
      console.log(wordextract1);
      await worker.terminate();
    })();
  });
});
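A minimal sketch of one way to handle both uploads, assuming the tesseract.js v2 worker API used above: a worker can recognize several images before it is terminated, so you can call worker.recognize once per file path.
(async () => {
  await worker.load();
  await worker.loadLanguage('eng');
  await worker.initialize('eng');

  const results = [];
  // Recognize each uploaded file with the same worker.
  for (const filepath of [filepath1, filepath2]) {
    const { data: { text } } = await worker.recognize(filepath);
    // Same cleanup as the original code, applied per image.
    const words = text.replace(/[^a-zA-Z ]/g, "").match(/\w+/g);
    results.push(words);
    console.log(words);
  }

  await worker.terminate();
})();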

Related

Puppeteer - how to not repeat if statements to check for missing selectors

I've managed to get Puppeteer working to scrape data off a number of different web pages. However, I'm repeating the same if statement for each bit of data. I'm really new to JavaScript, so I'm pretty sure I'm overlooking something simple that would avoid the repetition.
I've searched online quite a bit and tried a few different things, but I can't get it working.
For the code example below, I've taken out a lot of the different query selectors so it's easier to read and just an example, but in the actual code there are 12 of them, all with exactly the same code except the querySelector.
const puppeteer = require('puppeteer');
// This gets the URLs I want to scrape, stored in another file
let urls = require('./assets/links.js').urls;

(async () => {
  // Initiate the browser
  const browser = await puppeteer.launch();
  // Create a new page with the default browser context
  const page = await browser.newPage();

  for (let i = 0; i < urls.length; i++) {
    // Go to the target website
    await page.goto(urls[i]);
    let url = urls[i];

    const title = await page.evaluate(() => {
      let element = document.querySelector('h1');
      if (element) {
        return element.innerText;
      }
      return null;
    });

    const reviews = await page.evaluate(() => {
      let element = document.querySelector('.example-class');
      if (element) {
        return element.innerText;
      }
      return null;
    });

    const description = await page.evaluate(() => {
      let element = document.querySelector('#example-id');
      if (element) {
        return element.innerText;
      }
      return null;
    });

    console.log({ url, title, reviews, description });
  }

  // Closes the browser and all of its pages
  await browser.close();
})();
I've tried creating a function but it wouldn't let me use it with await.
You can write a simple function that grabs text from a selector and returns null if the element doesn't exist:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const text = sel =>
    page.$eval(sel, el => el.textContent).catch(() => null);

  for (const url of urls) {
    await page.goto(url);
    console.log({
      url,
      title: await text("h1"),
      reviews: await text(".example-class"),
      description: await text("#example-id"),
    });
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
If there are 12 of them, you might want to add a map of selectors and a loop:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const text = sel =>
    page.$eval(sel, el => el.textContent).catch(() => null);
  const selectors = {
    title: "h1",
    reviews: ".example-class",
    description: "#example-id",
    // ...
  };

  for (const url of urls) {
    await page.goto(url);
    const textProms = Object.entries(selectors).map(
      async ([k, sel]) => [k, await text(sel)]
    );
    console.log({
      url,
      ...Object.fromEntries(await Promise.all(textProms)),
    });
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

Unable to implement any logic to scrape content from innermost pages using puppeteer

I've created a script using puppeteer to scrape the links of different authors from a webpage, traversing multiple pages by triggering clicks on the next-page button. The script appears to be working correctly.
Although the content of this site is static, I intentionally used puppeteer in the following script to learn how to parse content from inner pages.
I wish to go one layer deep and scrape the description from those author pages. How can I achieve that?
const puppeteer = require('puppeteer');

function run(pagesToScrape) {
  return new Promise(async (resolve, reject) => {
    try {
      if (!pagesToScrape) {
        pagesToScrape = 1;
      }
      const browser = await puppeteer.launch({ headless: false });
      const [page] = await browser.pages();
      await page.goto("https://quotes.toscrape.com/");

      let currentPage = 1;
      let urls = [];

      while (currentPage <= pagesToScrape) {
        let newUrls = await page.evaluate(() => {
          let results = [];
          let items = document.querySelectorAll('[class="quote"]');
          items.forEach((item) => {
            results.push({
              authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
              title: item.querySelector("span.text").innerText
            });
          });
          return results;
        });
        urls = urls.concat(newUrls);

        if (currentPage < pagesToScrape) {
          await Promise.all([
            await page.waitForSelector('li.next > a'),
            await page.click('li.next > a'),
            await page.waitForSelector('[class="quote"]')
          ]);
        }
        currentPage++;
      }

      browser.close();
      return resolve(urls);
    } catch (e) {
      return reject(e);
    }
  });
}

run(3).then(console.log).catch(console.error);
I would go this way:
const puppeteer = require('puppeteer');

let browser;
(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.

  const pagesToScrape = 3;
  await pageQuotes.goto('https://quotes.toscrape.com/');
  let currentPage = 1;

  const data = { quotes: {}, abouts: {} };
  const visitedAbouts = new Set();

  while (currentPage <= pagesToScrape) {
    await pageQuotes.waitForSelector('.quote');

    const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
      quotes: Array.from(
        document.querySelectorAll('.quote'),
        quote => [quote.querySelector('small.author').innerText, quote.innerText],
      ),
      aboutURLs: Array.from(
        document.querySelectorAll('.quote small.author + a[href]'),
        quote => quote.href,
      ),
    }));

    for (const [author, quote] of quotes) {
      if (data.quotes[author] === undefined) data.quotes[author] = [];
      data.quotes[author].push(quote);
    }

    for (const aboutURL of aboutURLs) {
      if (!visitedAbouts.has(aboutURL)) {
        visitedAbouts.add(aboutURL);
        await pageAbout.goto(aboutURL);
        await pageAbout.waitForSelector('div.author-details');
        const { title, about } = await pageAbout.evaluate(() => ({
          title: document.querySelector('div.author-details h3.author-title').innerText,
          about: document.querySelector('div.author-details').innerText,
        }));
        data.abouts[title] = about;
      }
    }

    if (currentPage < pagesToScrape) {
      const nextLink = await pageQuotes.waitForSelector('li.next > a');
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
    currentPage++;
  }

  console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });

Clicking on internal javascript links and returning urls using puppeteer

My goal is to click on each link (called a footnote) on this page and then return the footnote link, its text, and all of the URLs that appear in the sidebar. I'm stuck on accessing the sidebar values when they appear, and after a few weeks of failure I'm looking for pointers on what I'm doing wrong (I'm very new to both JavaScript and puppeteer).
const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

(async function () {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto(url);

  const footnotes = await page.$$eval(selector, nodes => {
    return nodes.map(node => {
      const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
      const txt = node.text;
      return {
        ref,
        txt
      };
    });
  });

  for (const a of footnotes) {
    page.click(a.ref);
    const links = await page.$$eval('.scripture-ref', nodes => {
      return nodes.map(node => {
        return node.href;
      });
    });
  }

  console.log(footnotes);
  console.log(links);

  // const fs = require('fs');
  // fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);

  await browser.close();
})();
Maybe something like this:
const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

(async function main() {
  const browser = await puppeteer.launch({ headless: true });
  const [page] = await browser.pages();
  await page.goto(url);

  const data = {};

  for (const footnote of await page.$$(selector)) {
    const [href, text] = await page.evaluate(
      (a) => {
        a.click();
        return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
      },
      footnote
    );
    data[href] = { text };

    const header = await page.waitForXPath(`//aside/div/header/span[text()="${href} ${text}"]`);
    data[href].links = await page.evaluate(
      (span) => {
        const aside = span.closest('aside');
        return [...aside.querySelectorAll('a[href]')].map(
          a => ({ [a.innerText]: a.href })
        );
      },
      header
    );

    console.log(`Done: ${href} ${text}`);
  }

  console.log(JSON.stringify(data, null, 2));
  await browser.close();
})();
Part of the output:
{
  "1a": {
    "text": "pondering",
    "links": [
      {
        "D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
      },
      {
        "TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      },
      {
        "Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
      },
      {
        "Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      }
    ]
  },
}

Opening array in an API (puppeteer)

I am trying to open an array from an API.
I tried using this code:
const names_2 = await page.evaluate(() => Array.from(document.querySelectorAll('.mainDiv > Departure'), Departure => Departure.innerText));
but with no luck.
Here is my input:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://xmlopen.rejseplanen.dk/bin/rest.exe/multiDepartureBoard?id1=8600646&format=json');

  const result = await page.evaluate(() => {
    let temperature = document.getElementsByTagName("pre")[0].innerText;
    temperature = JSON.parse(temperature);
    return {
      temperature
    };
  });

  console.log(result);
})()
This is my output
{
  temperature: {
    MultiDepartureBoard: {
      noNamespaceSchemaLocation: 'http://xmlopen.rejseplanen.dk/xml/rest/hafasRestMultiDepartureBoard.xsd',
      Departure: [Array]
    }
  }
}
What you are doing here doesn't make sense. Simply request the data directly:
const rp = require('request-promise');

rp.get({
  uri: 'http://xmlopen.rejseplanen.dk/bin/rest.exe/multiDepartureBoard?id1=8600646&format=json',
  json: true
})
  .then(res => res.MultiDepartureBoard.Departure)
  .map(e => console.log(e))
;
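Note that request-promise (and the underlying request package) has since been deprecated. If you don't need a browser at all, a minimal alternative sketch using the fetch that is built into Node 18+ (same endpoint, no extra dependency) would be:
// Sketch: fetch the JSON directly; fetch is global in Node 18+, no require needed.
const url = 'http://xmlopen.rejseplanen.dk/bin/rest.exe/multiDepartureBoard?id1=8600646&format=json';

(async () => {
  const res = await fetch(url);
  const body = await res.json();
  // Departure is the array the question is trying to reach.
  for (const departure of body.MultiDepartureBoard.Departure) {
    console.log(departure);
  }
})().catch(console.error);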

Puppeteer: setDefaultNavigationTimeout to 0 still times out

Every time I run this script it times out.
Does setDefaultNavigationTimeout actually prevent timeouts?
There are about 26 URLs I'm going through, and each page has a large number of images. I can't imagine Puppeteer can't handle these pages just because of heavy images?
const puppeteer = require('puppeteer');
const _ = require('lodash');
// prodConfig, hash and findAndReturnSelectorText come from elsewhere in the script (not shown).

const url = 'test.com';
const jsonReturn = [];

async function runScraper() {
  const browser = await puppeteer.launch(prodConfig);
  const page = await browser.newPage({
    timeout: 0
  });
  page.setDefaultNavigationTimeout(0);

  await page.goto(url, { waitUntil: 'domcontentloaded' });
  await page.waitForSelector('.featured-shows-featured-show');

  let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
    return links.map(link => {
      return link.href;
    });
  });

  featuredShowsURLs = _.uniq(featuredShowsURLs);

  for (const featuredShowsURL of featuredShowsURLs) {
    const page = await browser.newPage({
      timeout: 0
    });
    try {
      await page.goto(featuredShowsURL);
      await page.waitForSelector('.show-title');
    } catch (e) {
      featuredShowsURL;
      debugger;
    }

    const showTitle = await findAndReturnSelectorText('.show-title', page);
    const showDates = await findAndReturnSelectorText('.show-dates', page);
    const showLocation = await findAndReturnSelectorText('.show-location', page);
    const showGallery = await findAndReturnSelectorText('.entity-link', page);
    const showDetail = await findAndReturnSelectorText('.show-press-release', page);

    const newItem = {
      showTitle,
      showDates,
      showLocation,
      showGallery,
      showDetail,
    };

    const id = hash(newItem);
    jsonReturn.push({
      ...newItem,
      id
    });
  }

  await browser.close();
}

runScraper();
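One thing worth checking that the question doesn't cover: page.setDefaultNavigationTimeout only affects navigation calls such as goto, reload and waitForNavigation; waitForSelector uses the separate default timeout, so it can still give up after the default 30 seconds. Also, browser.newPage takes no timeout option. A minimal sketch of how the page setup inside runScraper could disable both timeouts, assuming the waits rather than the navigations are what actually time out:
// Sketch: replaces the page setup inside runScraper above.
const page = await browser.newPage();  // newPage has no timeout option
page.setDefaultNavigationTimeout(0);   // goto, reload, waitForNavigation, ...
page.setDefaultTimeout(0);             // waitForSelector and other waits

await page.goto(url, { waitUntil: 'domcontentloaded' });
// A per-call override also works if you only want to relax one wait:
await page.waitForSelector('.featured-shows-featured-show', { timeout: 0 });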
