Node.js Scraping Data Click Event - javascript

I have a repetitive task that I have to do at regular intervals. Basically, I need to enter the website, get some values from different tables then write them on spreadsheet. By using these values, make some calculation, prepare a report etc.
I would like to create a helper bot because this is a straightforward task.
I can basically get information by opening up console (while I am on the related page) and by using DOM or Jquery I am fetching data easily.
I would like to take it a step further and create an application on Node.js (without entering related website, I will send my bot to related page and do same actions that I do on console.)
I started to write something with cheerio. However, at some point my bot needs to click a button (in order to change table). I searched but couldn't find the way.
My question is: is it possible to click a button on the server side (to change the table) and then fetch data from that table?
If you know a better way to create this kind of bot, please suggest it.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
/**
 * GET /scrape
 *
 * Downloads the IMDb title page, reads the title and release text from the
 * `.header` element, writes the result to output.json and answers the client.
 */
app.get('/scrape', (req, res) => {
  const url = 'http://www.imdb.com/title/tt1229340/';

  request(url, (error, response, html) => {
    // Created before the error check so a failed request still writes a
    // well-formed (empty) record instead of `undefined`.
    const json = { title: '', release: '' };

    if (!error) {
      const $ = cheerio.load(html);

      // The element is taken from the callback argument: inside an arrow
      // function `this` is not bound to the current element.
      $('.header').each((index, element) => {
        const data = $(element);
        json.title = data.children().first().text();
        json.release = data.children().last().children().text();
      });

      // This is not possible: cheerio only parses static markup, so there is
      // no click handling to hook into.
      // $( "#target" ).click(function() {
      //   alert( "Handler for .click() called." );
      // });
    } else {
      console.error(`Request to ${url} failed:`, error);
    }

    fs.writeFile('output.json', JSON.stringify(json, null, 4), (err) => {
      if (err) {
        console.error('Could not write output.json:', err);
        return;
      }
      console.log('File successfully written!');
    });

    res.send('Check your console!');
  });
});

app.listen('8080');
edit: The Answer of this question is "Use Zombie"
Now I have another question related to this one.
I am trying to learn & use zombie. I could
connect to website
go to necessary table
print console all tds
However, with this method I only get one messy string: the text of all tds is printed together without any whitespace, so there is no way to clean it up. Basically, I want to put the text of each td into an array. How can I do that?
// Once the page has loaded, log the text of every matching table cell.
browser.visit(url, function onVisited() {
  const cellText = browser.text('table > tbody.bodyName td');
  console.log(cellText);
});

I'd suggest you try using a headless browser such as Phantom.js or Zombie for this purpose. What you're trying to do above is assign a click handler to an element in Cheerio, this won't work!
You should be able to click a button based on the element selector in Zombie.js.
There's a browser.pressButton command in Zombie.js for this purpose.
Here's some sample code using zombie.js, in this case clicking a link..
const Browser = require('zombie');
const url = 'http://www.imdb.com/title/tt1229340/';
const browser = new Browser();

// Visit the page, follow the "FULL CAST AND CREW" link, then dump the browser state.
browser.visit(url)
  .then(() => {
    console.log(`Visited ${url}..`);
    // Returned so that a failed click is reported by the catch below
    // instead of becoming an unhandled rejection.
    return browser.clickLink("FULL CAST AND CREW");
  })
  .then(() => {
    console.log('Clicked link..');
    browser.dump();
  })
  .catch((error) => {
    // Include the error itself; the message alone hides the cause.
    console.error(`Error occurred visiting ${url}`, error);
  });
As for the next part of the question, we can select elements using zombie.js and get an array of their text content:
const Browser = require('zombie');
const url = 'http://www.imdb.com/title/tt1229340/';
const browser = new Browser();

// Visit the page and collect the trimmed text of every cast-list cell into an array.
browser.visit(url)
  .then(() => {
    console.log(`Visited ${url}..`);
    const cells = browser.queryAll('.cast_list td');
    const cellTextArray = cells
      .map((cell) => cell.textContent.trim())
      // Keep only cells with more than 3 characters of text (drops empty cells and separators).
      .filter((text) => text.length > 3);
    console.log(cellTextArray);
  })
  .catch((error) => {
    // Include the error itself; the message alone hides the cause.
    console.error(`Error occurred visiting ${url}`, error);
  });

Related

How to handle two slashes in a row in a router using express?

Problem
Hi devs,
I am having trouble passing an id that has a '/' at the beginning.
This is the log
GET /api/v1/ GetAnimeInfo//anime/5226/tokyo-ghoul/Tokyo% 20Ghoul 404 0.466 ms - 1310
As you can see, it cannot recognize the two slashes after GetAnimeInfo (GetAnimeInfo//).
Isn't there a way expressjs allows me that pattern?
//id = '/anime/5226/tokyo-ghoul/'
//title = 'Tokyo Ghoul'
/**
 * GET /GetAnimeInfo/:id/:title
 *
 * Looks up the anime via api.getAnimeInfo and responds with `{ info }`.
 * Failures are logged and passed to Express error handling.
 */
router.get('/GetAnimeInfo/:id([^/]+/[^/]+[^/]+/[^/])/:title' , (req , res , next) => {
  const id = req.params.id;
  // The route declares :title as a path parameter, so fall back to it when
  // no ?title= query value was supplied.
  const title = req.query.title || req.params.title;

  api.getAnimeInfo(id , title)
    .then((info) => {
      res.status(200).json({
        info
      });
    })
    .catch((err) => {
      console.log(err);
      // Without this the request would never receive a response.
      next(err);
    });
});
I would highly advise against doing this.
If a client is sending an erroneous double slash there is a functional bug creating that issue and you should fix the bug, not provide a weird workaround on the server - that way you end up with more robust, predictable and maintainable code in the future.
If you're trying to manipulate the server to accept double slash as part of routing, there will be no guarantee that clients will respect the behavior so you will run into situations where one browser will work and another will not.
If you have shows which begin in a slash, eg '/ShowName', that you need to account for, you should be escaping the show name with URL encoding - https://en.wikipedia.org/wiki/Percent-encoding
Yeah, that's unlikely to work, given that Express would have no idea where the :id ends and where the rest of the URL pattern match begins.
Can't you just parse the URL manually? Doesn't seem like it would be all that difficult e.g.
// Capture everything after /GetAnimeInfo/ as one parameter and split it by hand.
// NOTE(review): a plain `:idAndTitle` segment may not match values that contain "/" — confirm against the Express routing documentation.
router.get('/GetAnimeInfo/:idAndTitle', (req, res, next) => {
const { idAndTitle } = req.params;
// Position just past the last slash: the id is everything up to and including that slash, the title is the remainder.
const idx = idAndTitle.lastIndexOf("/") + 1;
const id = idAndTitle.substring(0, idx);
const title = idAndTitle.substring(idx, idAndTitle.length);
...
});
Demo
// Demo: split a combined "id + title" path at its last slash.
const idAndTitle = '/anime/5226/tokyo-ghoul/Tokyo Ghoul';
const splitAt = idAndTitle.lastIndexOf("/") + 1;
// The id keeps its trailing slash; the title is whatever follows it.
const id = idAndTitle.slice(0, splitAt);
const title = idAndTitle.slice(splitAt);
for (const line of [`ID=${id}`, `Title=${title}`]) {
  console.log(line);
}

Telegram Bot run commands from outside

I'm quite new to working with telegram bots, but I managed well so far with some basic bot. Now I want to improve a bit things and let my site "feed" the bot.
This is the scenario
I have a Google spreadsheet that make some calculation and then sends a message to the bot with the classic URL. Something like this...
// Request options for the two messages; only the first one mutes HTTP exceptions.
var optionsUG = { method: 'post', payload: formDataUG, muteHttpExceptions: true };
var optionsLG = { method: 'post', payload: formDataLG };
//SpreadsheetApp.getUi().alert('UrlFetchApp options ['+options+"]");
//UrlFetchApp.fetch('https://api.telegram.org/bot'+token+'/sendMessage?chat_id='+channelNumber+'&text='+text);
// Both posts go to the same sendMessage endpoint, with a five second pause in between.
var sendMessageUrl = 'https://api.telegram.org/bot' + token + '/sendMessage';
var result = UrlFetchApp.fetch(sendMessageUrl, optionsUG);
Utilities.sleep(5 * 1000);
result = UrlFetchApp.fetch(sendMessageUrl, optionsLG);
Now I would like to do something similar but, instead of calling sendMessage, I would like to call a method of my bot.
I use the JavaScript Telegraf framework at the moment, but changing it is not a problem.
I want to achieve something like:
// Desired (hypothetical) call: the same POST as before, but addressed to a bot command ("register") instead of sendMessage.
var result = UrlFetchApp.fetch('https://api.telegram.org/bot'+token+'/register',optionsUG);
here is the bot currently configured
const fs = require('fs');
const Telegraf = require('telegraf');

// Root folder of the bot installation; modules, config and help text are loaded relative to it.
const serverPath = "/home/bots/PlatoonAdvisor/telegram";
const commands = require(serverPath+'/package/modules/commands.js');
const config = require(serverPath+'/config.json');
// help.txt is plain text: require() would try to load it as JavaScript,
// so it is read from disk as a UTF-8 string instead.
const helpText = fs.readFileSync(serverPath+'/package/help.txt', 'utf8');
const token = config.TELEGRAM_BOT_SECRET;
const bot = new Telegraf(token);

// Command names handled by the bot.
const REGISTER_COM = 'register';
const HELP_COM = 'help';
const REQUIREMENTS_COM = 'requirements';
const CAHT_ID_COM = 'chatid';
/**
 * Extracts the parameter text that follows a bot command.
 *
 * The message must start with "/<command>", optionally suffixed with
 * "@BotName" as in group chats, followed by whitespace and the parameters.
 *
 * @param {string} text - Full message text, e.g. "/register 123,Bob".
 * @param {string} command - Command name without the leading slash.
 * @returns {string|null} The trimmed parameter text, or null when the message
 *   is not this command or carries no parameters. Callers split this value,
 *   so a string is returned rather than a match array.
 */
const getCommandParameters = function (text, command) {
  // Escape the command so regex metacharacters in it are matched literally.
  const escapedCommand = String(command).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // A real RegExp is required: a string such as "/\/cmd (.*)/g" would treat
  // the slashes and the "g" flag as literal characters to match.
  const pattern = new RegExp('^/' + escapedCommand + '(?:@\\w+)?\\s+(.*)');
  const match = pattern.exec(text == null ? '' : String(text));
  if (match === null) {
    return null;
  }
  const parameters = match[1].trim();
  return parameters === '' ? null : parameters;
};
/*
bot.on('text', message=> {
return message.reply('I am Grooth');
})
*/
// Answer the help command with the loaded help text.
bot.command(HELP_COM, (ctx) => ctx.reply(helpText));
/**
 * /register <allycode>,<player name>
 *
 * Registers the player for the current chat and confirms it to the user.
 * When parameters are missing, or registration throws, the user gets an
 * explanatory reply instead of an empty one.
 */
bot.command(REGISTER_COM, (ctx) => {
  const missingParametersMsg = "Missing parameters, try /help if you need help :)";

  const param = getCommandParameters(ctx.message.text, REGISTER_COM);
  // Anything other than a parameter string (null, no match) means nothing usable was supplied.
  if (typeof param !== 'string') {
    return ctx.reply(missingParametersMsg);
  }

  const params = param.split(",").map((part) => part.trim());
  if (params.length < 2 || params[0] === '' || params[1] === '') {
    return ctx.reply(missingParametersMsg);
  }

  // Parameter order is allycode first, then player name.
  const allycode = params[0];
  const player_name = params[1];
  const channel = ctx.chat.id;

  try {
    commands.registerTPlayer(player_name, allycode, channel);
  } catch (ex) {
    console.log(ex);
    return ctx.reply(`Could not register player ${player_name}, please try again later.`);
  }

  // Template literal: inside double quotes ${...} is sent verbatim instead of being substituted.
  return ctx.reply(`Successfully registered player ${player_name} with allycode ${allycode}!`);
});
/**
 * /requirements
 *
 * Replies with the id of the current chat. The command parameters were parsed
 * but never used, and the parsing crashed when none were supplied, so it has
 * been removed.
 */
bot.command(REQUIREMENTS_COM, (ctx) => {
  const chatId = ctx.chat.id;
  return ctx.reply(chatId);
});
/**
 * /chatid
 *
 * Replies with the id of the chat the command was sent from.
 */
bot.command(CAHT_ID_COM, (ctx) => {
  const id = ctx.chat.id;
  // Template literal: inside double quotes ${id} is sent verbatim instead of being substituted.
  const msg = `The chat id requested is ${id}`;
  return ctx.reply(msg);
});

// Start receiving updates once all commands are registered.
bot.startPolling();
is that even possible? I'm looking over the internet for a while now and was not able to find any clue about.
EDIT: Doing some more digging I found webhooks to send content to a web server when something happens in the bot but not vice versa. I'm getting frustrated.
My goal is to update the local database with information the spreadsheet have but the bot still don't so users can later ask to the bot to retrieve those information.
I mean I could make an ajax call if it were a real web server, but it is just a spreadsheet which doesn't act as a server.
Ok I forgot to answer this question with the solution I found.
There is indeed no way to call a specific function of the bot from the outside, because a command is not a real function: it is a string that a user types and that the bot parses and interprets as a command.
So I had to be creative and expose a REST server from the bot itself (the Node.js Express library did the trick), which I was then able to call from the script.
Here is a handy guide for Express.js.
This is my solution which is working great now.

express post & delete responses concepts not understood

I have no clue why, but when I try to access an attribute of an element from the JSON file, I am told that it is null.
Still got npm audit problems :(, what do you think about that?
Here is the EDITED code that I did so far:
// Tickets are loaded once from file.json; require() returns the parsed JSON content.
export const data = require('./file.json');
export let DATA = data as Type[];
// `temp` refers to the same array object as DATA (no copy is made).
let temp = DATA;
// POST /api/tickets — tries to append the submitted ticket to file.json and return the combined data.
app.post('/api/tickets', (req, res) => {
// serialise the in-memory tickets into a JSON string
const past_data = JSON.stringify(temp);
// serialise the new ticket into a JSON string
// NOTE(review): this route declares no ":formData" path segment, so req.params.formData is presumably undefined — confirm whether req.body was intended.
const new_element = JSON.stringify(req.params.formData)
// NOTE(review): JSON.stringify(undefined) yields undefined rather than "", so this check also passes when no data was sent.
if (new_element !== "")
{
// NOTE(review): an array holding two JSON strings is passed as the file content, not one merged JSON document.
fs.writeFile("./file.json",[past_data,new_element],(err) => {
if (err) throw err;
});
}
// send it back as response to request
// Array#toString joins the two strings with a comma before they are parsed again.
const new_data = JSON.parse([past_data,new_element].toString());
res.send(new_data);
});
// DELETE /api/tickets/:id — removes the ticket with the given id, rewrites file.json and returns the remaining tickets.
app.delete('/api/tickets/:id', (req,res) => {
// find the requested ticket by id in the global temp array
const ticket = temp.find(t => t.id === (req.params.id));
if (typeof ticket !== 'undefined') {
const index = temp.indexOf(ticket);
// remove it from global temp (in place)
temp.splice(index, 1)
}
// create json string out of global temp
const data_after_delete = JSON.stringify(temp);
// write it straight into fs; the file is rewritten even when no ticket matched
fs.writeFile("./file.json",data_after_delete,(err) => {
if (err) throw err;
});
// send it back to requester (parsed back from the string that was just produced)
const new_data = JSON.parse(data_after_delete);
res.send(new_data);
});
One object from the json file before I write into it:
[
{
"id": "81a885d6-8f68-5bc0-bbbc-1c7b32e4b4e4",
"title": "Need a Little Help with Your Site? Hire a Corvid Web Developer",
"content": "Here at Wix we strive to support you with this community forum, API references, articles, videos and code examples. But sometimes you might need a little extra help to get your site exactly the way you want it. \nHire a developer from the Wix Arena, an online marketplace with top Corvid web developers from around the world. Submit your project details here, and we’ll find the right professional for you.",
"userEmail": "jug#nesetal.af",
"creationTime": 1542111235544,
"labels": ["Corvid", "Api"]
},
One object from the json file after I write into it:
["[\"[\\\"[\\\\\\\"[{\\\\\\\\\\\\\\\"id\\\\\\\\\\\\\\\":\\\\\\\\\\\\\\\"81a885d6-8f68-5bc0-bbbc-1c7b32e4b4e4\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\"title\\\\\\\\\\\\\\\":\\\\\\\\\\\\\\\"Need a Little Help with Your Site? Hire a Corvid Web Developer\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\"content\\\\\\\\\\\\\\\":\\\\\\\\\\\\\\\"Here at Wix we strive to support you with this community forum, API references, articles, videos and code examples. But sometimes you might need a little extra help to get your site exactly the way you want it. \\\\\\\\\\\\\\\\nHire a developer from the Wix Arena, an online marketplace with top Corvid web developers from around the world. Submit your project details here, and we’ll find the right professional for you.\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\"userEmail\\\\\\\\\\\\\\\":\\\\\\\\\\\\\\\"jug#nesetal.af\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\"creationTime\\\\\\\\\\\\\\\":1542111235544,\\\\\\\\\\\\\\\"labels\\\\\\\\\\\\\\\":[\\\\\\\\\\\\\\\"Corvid\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\"Api\\\\\\\\\\\\\\\"]},
You should only use JSON.stringify when writing to a file, and JSON.parse when reading from a file (if you don't use require which does the parsing implicitly). Manipulate your data as plain objects and arrays, not as JSON strings - that will only damage the structure as you noticed.
export let DATA: Type[] = require('./file.json');
/**
 * Persists the in-memory DATA array to ./file.json.
 * This is the only place where the data is turned into a JSON string.
 */
function save() {
  fs.writeFile("./file.json", JSON.stringify(DATA), (writeError) => {
    if (writeError) throw writeError;
  });
}
/**
 * POST /api/tickets
 *
 * Appends the submitted ticket to DATA, persists it, and responds with the
 * full ticket list.
 */
app.post('/api/tickets', (req, res) => {
  // This route has no ":formData" path segment, so req.params.formData is
  // never populated. The payload is read from the request body, which needs a
  // body parser such as express.json(). The old lookup is kept as a fallback.
  const new_element = (req.body && req.body.formData) || req.params.formData; // might need a JSON.parse() if it's a json string
  if (new_element) {
    // add to global data by array manipulation
    DATA.push(new_element);
    save();
  }
  // send it back as response to request
  res.send(DATA);
});
/**
 * DELETE /api/tickets/:id
 *
 * Removes the ticket with the given id (if present), persists the change, and
 * responds with the remaining tickets.
 */
app.delete('/api/tickets/:id', (req, res) => {
  // findIndex returns a position, so the result is named `index` and used for
  // the splice below; previously `index` was never declared.
  const index = DATA.findIndex((t) => t.id === req.params.id);
  if (index !== -1) {
    // remove it from the global data
    DATA.splice(index, 1);
    save();
  }
  // send it back to requester
  res.send(DATA);
});

Event Listener on particular document Id, not on whole database

How can I continuously listen on particular document of a database in couchdb? If anything changes only in that document, then and only then I want to console that document, otherwise not. How can I achieve that?
my db entry in couchdb:
{
"_id": "my-doc",
"_rev": "13-7cf9b1373542d93da7b484774856429d",
"awesome": "my-doc"
}
my code:
// Connect to the local CouchDB server and select the "lookup" database.
const util = require('util');
const couchdb = require('felix-couchdb');
const client = couchdb.createClient(5984, 'localhost');
const db = client.db('lookup');

// Request a changes stream for "my-doc"; only successful results are logged.
db.changesStream({ id: "my-doc" }, function onChange(err, success) {
  if (err) {
    return;
  }
  console.log(success);
});
this code generates an error
stream = new process.EventEmitter(),
^
TypeError: process.EventEmitter is not a constructor
at exports.Db.Db.changesStream (/home/xyz/Projects/practice/node_modules/felix-couchdb/lib/couchdb.js:676:14)
other than I have tried using libraries like :- couchdb-api, couchdb-change-events..
To listen to changes on a single document, you must supply doc_ids=["$id"] (together with filter=_doc_ids), where $id is the id of the document you want to track.
It seems like EventEmitter is not available. Perhaps you're trying to run nano in the browser?
You can use db.changes like this:
const nano = require('nano')('http://localhost:5984')
const db = nano.use('foo');
// Stream changes for the listed documents to stdout, starting from now.
const req = db.changesAsStream({
  since: "now",
  feed: 'continuous',
  // doc_ids is only honoured together with the built-in _doc_ids filter.
  filter: '_doc_ids',
  doc_ids: JSON.stringify(["my-doc", "doc_2"])
}).pipe(process.stdout);
I recommend using the db.follow API. It has more features and, I think, it is more stable.
const nano = require('nano')('http://localhost:5984')
const db = nano.use('foo');
// Follow only the listed documents, starting from the current sequence.
const watchedIds = JSON.stringify(["my-doc", "doc_2"]);
const feed = db.follow({
  since: "now",
  filter: "_doc_ids",
  doc_ids: watchedIds
});
// Log every change event, then start the feed.
feed.on('change', function onChange(change) {
  console.log("change: ", change);
});
feed.follow();

Is there a way to access data from another table during table.read - Azure Mobile App

I am trying to get data from another database before reading data in the table. However, I can't seem to find a way to access it properly.
The best I've got so far is based on some other examples both on Microsoft's documentation and on StackOverflow but they all seem to fail.
// Custom read handler: looks up a value in "table2" and uses it to filter the incoming query.
table.read(function (context) {
// NOTE(review): read() presumably returns a promise rather than the rows themselves; if so, results[0] below is undefined — confirm against the azure-mobile-apps documentation.
var results = context.tables("table2").read();
var text = results[0].column;
// Restrict the incoming query to rows whose columnName equals the looked-up value.
context.query.where({ columnName: text });
return context.execute();
});
I get an error when doing this saying that column doesn't exist.
As per your description, if I do not misunderstand, you want to query table2 in table1 operations in EasyTables scripts.
We can use "use()" to register custom middleware that is executed for every request against the table, as described in the azure-mobile-apps SDK documentation.
E.G.
var queries = require('azure-mobile-apps/src/query');
/**
 * Express-style middleware run before an insert: reads matching rows from
 * "table2" and stores the derived value on the request for the next step.
 *
 * @param {object} req - Request; req.body.testproperty is used as the lookup value.
 * @param {object} res - Response; receives "no data" when the lookup yields nothing.
 * @param {Function} next - Continues the chain, or receives the error when the lookup fails.
 */
var insertMiddleware = function(req,res,next){
  var table = req.azureMobile.tables('table2');
  var query = queries.create('table2')
    .where({ TestProperty : req.body.testproperty });

  table.read(query)
    .then(function(results) {
      if(results){
        req.someStoreData = somehander(results); //some hander operations here to get what you want to store and will use in next step
        next();
      }else{
        res.send("no data");
      }
    })
    // Forward lookup failures; without this a rejected read leaves the
    // request without any response.
    .catch(next);
};
// Register insertMiddleware ahead of table.operation for insert requests.
table.insert.use(insertMiddleware, table.operation);
// Insert handler: log what the middleware stored on the request, then execute the insert.
table.insert((context) => {
  console.log(context.req.someStoreData);
  return context.execute();
});
More example:
// Restricts the incoming query to rows whose categoryId belongs to a category of an allowed domain.
// NOTE(review): the function-style where() callbacks are presumably interpreted by the query library rather than run as ordinary JavaScript — keep their shape unchanged unless confirmed otherwise.
async function filterByAllowedDomain(context) {
// 1) Load the domains flagged as allowed.
var domains = await context.tables('domains')
.where({ allowed: true })
.read();
// 2) Load the categories whose domainId is one of those domain ids (the ids are passed as the callback argument).
var categories = await context.tables('categories')
.where(function (ids) {
return this.domainId in ids;
}, domains.map(d => d.id))
.read();
// 3) Limit the current query to rows whose categoryId is one of those category ids.
context.query.where(function (ids) {
return this.categoryId in ids;
}, categories.map(c => c.id));
return context.execute(); }
The tables module in azure-mobile-apps-node sdk contains functionality for adding tables to an Azure Mobile App. It returns a router that can be attached to an express app with some additional functions for registering tables. Which actually leverage Azure SQL (SQL Server database service on Azure).
Hope it helps.

Categories

Resources