Need help scraping image from craigslist - javascript

I've tried everything I can think of. I'm able to get postUrl, date, title, price and location. If you go to https://sandiego.craigslist.org/search/sss?query=surfboards and paste the code snippet below into the console it returns all the images. But when I try to access in my code it's returning undefined. Any help on this would be greatly appreciated!
// Log the src of the first <img> found beneath each search result's children.
$('#search-results > li').each((_, listing) => {
  const src = $(listing).children().find('img').attr('src');
  console.log(src);
});
import axios from 'axios'
import request from 'request-promise'
import cheerio from 'cheerio'
import express from 'express'
import path from 'path'
// Base directory used to locate the views folder.
const __dirname = path.resolve();
// Use PORT from the environment when set, otherwise 8000.
const PORT = process.env.PORT || 8000;
const app = express();

// Root route: send the static index page.
app.get('', (req, res) => {
  const indexPage = __dirname + '/views/index.html';
  res.sendFile(indexPage);
});
// Records collected by the scrape below; filled in once the request resolves.
const surfboards = [];

// Fetch the search page once and collect one record per listing.
axios("https://sandiego.craigslist.org/search/sss?query=surfboards")
  .then(({ data }) => {
    const $ = cheerio.load(data);
    $('#search-results > li').each((_, listing) => {
      const row = $(listing);
      const info = row.children('.result-info');
      const meta = info.children('.result-meta');
      // This is the lookup in question: it comes back undefined for the fetched markup.
      const img = row.children().find('img').attr('src');
      surfboards.push({
        title: info.children('.result-heading').text().trim(),
        postUrl: row.children('a').attr('href'),
        date: info.children('.result-date').text(),
        price: meta.children('.result-price').text(),
        location: meta.children(".result-hood").text().trim(),
        img,
      });
    });
    return surfboards;
  })
  .catch((err) => console.error(err));
// Respond with whatever has been collected in `surfboards` so far.
app.get('/api/surfboards', (req, res) => res.status(200).json({ results: surfboards }));

// Make App listen
app.listen(PORT, () => {
  console.log(`Server is listening to port ${PORT}`);
});

Looks like the page sets the images with JavaScript. Thus axios gets the HTML without actual links to images.
But there seems to be a workaround here. You can generate links to the images by concatenating https://images.craigslist.org with the data-ids values from the parent a tag.
You can get the data-ids like this:
// Read the data-ids attribute from the listing's direct <a> child (undefined when the attribute is absent).
var data_ids = $(element).children('a').attr('data-ids')
Then split it into an array on commas, strip the leading 3: prefix (its first two characters) from each id, and build the URL like this:
`${img_base_url}/${ids}_${resolution_and_extension}`
But if you only need the URL of the first image, there is no need to create a new array each time. Use substring instead (note that sometimes an li has no image at all):
/**
 * Extracts the id of the first image from a listing's data-ids attribute.
 *
 * The original snippet computed the substring in both branches but discarded
 * the result; this helper returns it so the caller can build the image URL.
 *
 * @param {string|undefined} data_ids - comma-separated ids, each prefixed with "3:".
 * @returns {string|undefined} the first id without its "3:" prefix, or
 *   undefined when the listing carries no data-ids (it has no image).
 */
function firstImageId(data_ids) {
  if (!data_ids) {
    return undefined;
  }
  const start = data_ids.indexOf('3:') + 2;
  const comma = data_ids.indexOf(',');
  // With a single id there is no comma, so read through to the end of the string.
  const end = comma === -1 ? data_ids.length : comma;
  return data_ids.substring(start, end);
}

Related

Looking for a solution to stream dynamic changes in JSON data on to browser without refreshing

I have a node.js / next.js api built that essentially does a bunch of stuff after the user submits text into a form on the front end. One of the things it does is write stage completion messages periodically to a JSON file to signify the completion of certain stages.
my api looks something like this
import dbConnect from '../../../lib/dbConnect'
import Demo from '../../../models/Demo'
import fs from 'fs'
import shell from 'shelljs';
/**
 * API route for demos.
 *
 * GET  - responds 200 with every Demo document.
 * POST - runs the two check scripts in order, records each completed stage in
 *        success-stage.json, then creates a Demo from the request body (201).
 * Any failure, and any other HTTP method, yields 400 with { success: false }.
 *
 * @param req - incoming request; `method` and `body` are read.
 * @param res - response object used to send the JSON reply.
 */
export default async function handler(req, res) {
  const { method } = req;

  await dbConnect();

  switch (method) {
    case 'GET':
      try {
        const demos = await Demo.find({});
        res.status(200).json({ success: true, data: demos });
      } catch (error) {
        // Log instead of silently discarding the cause of the failure.
        console.error(error);
        res.status(400).json({ success: false });
      }
      break;
    case 'POST':
      try {
        const timestamp = Date.now();
        runStage('./initial_checks.sh', 'Stage 1 complete', timestamp);
        runStage('./secondary_checks.sh', 'Stage 2 complete', timestamp);
        const demo = await Demo.create(req.body);
        res.status(201).json({ success: true, data: demo });
      } catch (error) {
        console.error(error);
        res.status(400).json({ success: false });
      }
      break;
    default:
      res.status(400).json({ success: false });
      break;
  }
}

/**
 * Runs one check script and, when it succeeds, overwrites success-stage.json
 * with the JSON array [label, timestamp].
 *
 * A failing script throws rather than calling shell.exit(): exiting would end
 * the server process for every client, whereas throwing lets the caller
 * answer this one request with an error.
 *
 * @param {string} script - path of the shell script to execute.
 * @param {string} label - completion message stored for this stage.
 * @param {number} timestamp - value stored alongside the label.
 * @throws {Error} when the script exits with a non-zero code.
 */
function runStage(script, label, timestamp) {
  if (shell.exec(script).code !== 0) {
    shell.echo('Sorry stage failed');
    throw new Error(`${script} exited with a non-zero code`);
  }
  fs.writeFileSync('success-stage.json', JSON.stringify([label, timestamp]));
}
I am using socket.io, my server.js file is
server.js
// Custom server: Express handles HTTP, socket.io shares the same http server,
// and every route is delegated to Next.js.
const app = require("express")();
const server = require("http").Server(app);
const io = require("socket.io")(server);
const next = require("next");
const fs = require('fs');

const port = 3000;
const dev = process.env.NODE_ENV !== "production";
const nextApp = next({ dev });
const nextHandler = nextApp.getRequestHandler();

// Read a single time while this module loads; every client receives this same snapshot.
const data = fs.readFileSync('success-stage.json', 'utf8');

// Send the snapshot to each client as soon as its socket connects.
io.on("connect", (socket) => {
  socket.emit("now", { message: data });
});

nextApp.prepare().then(() => {
  app.all("*", (req, res) => nextHandler(req, res));

  server.listen(port, (err) => {
    if (err) throw err;
    console.log("> Ready on port: " + port);
  });
});
and here is the pages/index.js file
import { useEffect, useRef, useState } from "react";
import io from "socket.io-client";
export default function IndexPage() {
const socket = useRef();
const [hello, setHello] = useState();
useEffect(() => {
socket.current = io();
socket.current.on("now", (data) => {
setHello(data.message);
});
}, []);
return <h1>{hello}</h1>;
}
so at this point we are seeing the 2nd message from my JSON file match what is rendered on the frontend when I build my application. It looks like this
["Stage 2 complete",1664289144513]
I am wondering how I can stream this data onto the front end for clients without having to refresh the page? I need it to show the current stage's success message... There are 5 total stages, so i guess i am looking for a way to either stream data or maybe to revalidate the browser window like every second without having to refresh... is this possible?
Any help would be greatly appreciated... Thanks in advance for your time everyone...
You've already got a solution implemented that can handle this. What you're describing is exactly what sockets are for -- bidirectional communication between the client and server without refreshing the page.
Just create a new socket listener on the frontend for a new topic, maybe "stageStatus", and then emit messages to that topic on the backend at various stages in the process. That's it!

Node.js Cheerio Live Update/Instant Refresh page on every HTML body change

I am using Axios and cheerio to scrape the cricket score website and convert its data to JSON format.
But the problem is that scores and other information are not updating instantly on my API response.
I want to have kind of useEffect (I know, it's React Hook and don't work in Express) functionality in my express server so that whenever the score changes on the main website, my server re-scrapes the page and show updated data.
// Fetch the page, turn each scorecard into { id, title, url } and reply with the list.
axios(link)
  .then(({ data: html }) => {
    const $ = cheerio.load(html);
    const score = [];
    $('.scorecard-container', html).each((position, card) => {
      const entry = $(card);
      score.push({
        // Ids are 1-based and follow the order the cards appear in.
        id: position + 1,
        title: entry.text(),
        url: link + entry.find('a').attr('href'),
      });
    });
    res.json(score);
  })
  .catch((err) => {
    res.send('Something went wrong');
    console.log(err);
  });
Thanks in Advance :)
example express endpoint
// GET /stats — scrape the target page on every request and return its <title> text.
// Registered on the Express app (the bare `get(...)` call had no receiver).
app.get("/stats", async (req, res) => {
  try {
    const { data } = await axios.get(someUrl);
    const $ = cheerio.load(data);
    res.json({
      title: $('title').text(),
    });
  } catch (err) {
    // Without this, a failed request rejects inside the async handler and the
    // client never receives a response.
    console.log(err);
    res.status(500).send('Something went wrong');
  }
});
in react
// Load the stats once, after the first render.
useEffect(() => {
  fetch("/stats")
    .then((r) => r.json())
    .then((data) => setStats(data))
    // Surface network/parse failures instead of leaving the rejection unhandled.
    .catch((err) => console.error(err));
}, []);

javascript cannot convert undefined or null to object question

I am trying javascript for the first time and I am having this trouble with the example:
https://www.twilio.com/blog/web-scraping-and-parsing-html-with-node-js-and-cheerio
It is a web scraper example that uses got and cheerio, both of which I have installed. But when I run the sample code it gives me a 'cannot convert undefined or null to object' error.
Why is that? I didn't change anything from the example at all.
the code in question:
const $ = cheerio.load(response.body);
$('a').each((i, link) => {
const href = link.attribs.href;
console.log(href);
});
}).catch(err => {
console.log(err);
});
What does your index.js file look like? I did the tutorial and my code is working. Maybe you mistyped the URL?
Here is my index.js
const fs = require("fs");
const cheerio = require("cheerio");
const got = require("got");
const vgmUrl = "https://www.vgmusic.com/music/console/nintendo/nes";

// Download the page, then print the href attribute of every anchor on it.
got(vgmUrl)
  .then(({ body }) => {
    const $ = cheerio.load(body);
    $("a").each((_, anchor) => {
      console.log(anchor.attribs.href);
    });
  })
  .catch((err) => {
    console.log(err);
  });

How to use dynamic query for the rest api

I am beginner of javascript and I am trying to create a simple rest api using node.js. This is so far, I have it.
I have a database called testDb and table called testMeasurement in influxdb. testMeasurement table contains DateOfBirth,ID,FirstName,LastName
(ID is tag in my testMeasurement table)
var express = require('express');
const Influx = require('influx')
const app = express();
// In a connection URL the credentials are separated from the host by '@';
// with '#' the rest of the string is parsed as a fragment and the host is lost.
const influx = new Influx.InfluxDB('http://user:password@localhost:8086/testDb');
app.listen(3000, 'localhost');

// GET /myapi — responds with every row of testMeasurement as JSON,
// or with status 500 and the error stack when the query fails.
app.get('/myapi', function (req, res) {
  influx.query('select * from testMeasurement')
    .then((result) => {
      res.json(result);
    })
    .catch((err) => {
      res.status(500).send(err.stack);
    });
});
Now, Above gives me all the data which I have in testMeasurement table from database "testDb".
How do I define my query in a dynamic way so that I can filter my result?
for eg. if I type localhost/myapi/ID={someValue}, this should give me the relatedData of that ID.
Any advice would be so helpful.
There are many ways to achieve what you want. The best way to do it is using wildcards. Example:
// GET /myapi/:userId — rows of testMeasurement whose ID tag equals :userId.
app.get('/myapi/:userId', (req, res) => {
  // The value comes straight from the URL, so it is emitted as an escaped,
  // single-quoted string literal instead of being concatenated raw (which
  // allowed query injection). The measurement's tag is described as ID, and
  // identifiers are case-sensitive, hence the quoted "ID".
  const idLiteral = Influx.escape.stringLit(String(req.params.userId));
  const query_str = `select * from testMeasurement where "ID" = ${idLiteral}`;

  influx.query(query_str)
    .then((result) => {
      res.json(result);
    })
    .catch((err) => {
      res.status(500).send(err.stack);
    });
});
That implies that you must have a structured API to consume, having nodes for each item. If you just want to test a little bit, one basic example is to test for GET params like:
// GET /myapi[?id=value] — every row of testMeasurement, optionally narrowed
// to the rows whose ID tag equals the `id` query parameter.
app.get('/myapi', function (req, res) {
  let query_str = 'select * from testMeasurement';
  if (req.query.id != null) {
    // Escape and single-quote the user-supplied value; raw concatenation
    // allowed query injection. "ID" matches the tag name as described for the
    // measurement (identifiers are case-sensitive).
    const idLiteral = Influx.escape.stringLit(String(req.query.id));
    query_str += ` where "ID" = ${idLiteral}`;
  }

  influx.query(query_str)
    .then((result) => {
      res.json(result);
    })
    .catch((err) => {
      res.status(500).send(err.stack);
    });
});
Hope it helps!

How do I call two different REST api endpoints simultaneously and display the data from both on one endpoint of my app?

I am building a simple application for my portfolio using The Movie Database api. In my GET /movie route, I want to get and display the data about the movie, and the names and photos of the cast members, however the data for the movie and the data for the cast members belong two separate endpoints of the api, and I am at a complete loss as to how to access both response data sets under a single endpoint in my app.
I cannot call axios.get() on both endpoints under the /movie route because I will get a "Headers already sent" error, and I have tried to write a function that uses axios.get for 1 endpoint that returns the response and gets called in my GET /movie route, but that causes the entire GET route to return undefined.
Here is my current code for my /movie route that is incorrect, but closely conveys what I am trying to accomplish
const express = require('express');
const router = express.Router();
const axios = require('axios');
const api_key = require('../config/keys').api_key;
const imgURL = "http://image.tmdb.org/t/p/";
const dateFormat = require('../config/dateFormat');
// Requests the credits for one movie from the TMDB API.
// NOTE(review): assigned without const/let/var, so in non-strict code this creates an implicit global.
// NOTE(review): the axios promise is not returned from the function body, so calling
// getActors(...) evaluates to undefined; the `return res.data` only resolves the inner chain.
getActors = movie_id => {
axios.get(`https://api.themoviedb.org/3/movie/${movie_id}/credits?api_key=${api_key}&language=en-US`)
.then(res => {
return res.data;
}).catch(err => console.log(err.message));
}
// GET /:id — fetches the movie record and renders the 'movie' view with it.
router.get('/:id', (req, res) => {
const id = req.params.id
axios.get(`https://api.themoviedb.org/3/movie/${id}?api_key=${api_key}&language=en-US`)
// NOTE(review): the callback parameter is also named `res`, so inside this callback it hides
// the Express response declared by the route handler.
.then(res => {
const movieData = res.data;
const actors = getActors(id); //calling above function here, but returns undefined and hangs the application
res.render('movie', {movieInfo: movieData, imgURL: imgURL, releaseDate: dateFormat(movieData.release_date), actors: actors})
})
// Only logs; no response is sent from this branch.
.catch(err => console.log(err.status_message))
});
module.exports = router;
any help is greatly appreciated.
You can use axios.all to combine several promises and execute them in parallel. Then, once all of the requests have finished, you can handle their results together in the then callback. For instance, in your code:
const express = require('express');
const router = express.Router();
const axios = require('axios');
const api_key = require('../config/keys').api_key;
const imgURL = "http://image.tmdb.org/t/p/";
const dateFormat = require('../config/dateFormat');
// GET /:id — request the credits and the movie details in parallel and render
// the 'movie' view once both responses have arrived.
router.get('/:id', (req, res) => {
  // Both requests use the id from the route; outside a handler no id exists.
  const id = req.params.id;

  axios.all([
    axios.get(`https://api.themoviedb.org/3/movie/${id}/credits?api_key=${api_key}&language=en-US`),
    axios.get(`https://api.themoviedb.org/3/movie/${id}?api_key=${api_key}&language=en-US`)
  ])
    .then(axios.spread((actorsRes, moviesRes) => {
      // Your logic with each response
      const movieData = moviesRes.data;
      res.render('movie', {movieInfo: movieData, imgURL: imgURL, releaseDate: dateFormat(movieData.release_date), actors: actorsRes.data});
    }))
    .catch(err => {
      // Always answer the request, otherwise a failed call leaves it hanging.
      console.log(err.message);
      res.status(500).send('Unable to load movie');
    });
});
module.exports = router;
I see that getActors is currently returning undefined, because the axios promise is never returned. Change it to...
/**
 * Requests the credits for one movie.
 *
 * @param movie_id - id interpolated into the credits URL.
 * @returns a promise for the response body; when the request fails the error
 *   message is logged and the promise resolves to undefined.
 */
const getActors = async movie_id => {
  try {
    const response = await axios.get(`https://api.themoviedb.org/3/movie/${movie_id}/credits?api_key=${api_key}&language=en-US`);
    return response.data;
  } catch (err) {
    console.log(err.message);
  }
};
Another problem is calling of getActors function. It's a function that contains asynchronous function axios.get(). Change that to ..
// GET /:id — load the movie, then its actors, and render the 'movie' view.
router.get('/:id', (req, res) => {
  const id = req.params.id;
  // Kept outside the chain so the render step can see it.
  let movieData;

  axios.get(`https://api.themoviedb.org/3/movie/${id}?api_key=${api_key}&language=en-US`)
    // Named movieRes so the Express `res` is not shadowed inside the chain.
    .then(movieRes => {
      movieData = movieRes.data;
      return getActors(id);
    })
    .then(actors => {
      res.render('movie', {movieInfo: movieData, imgURL: imgURL, releaseDate: dateFormat(movieData.release_date), actors: actors});
    })
    .catch(err => {
      // Log the error's own message and answer the request; logging alone
      // left the client waiting with no response.
      console.log(err.message);
      res.status(500).send('Unable to load movie');
    });
});

Categories

Resources