I'm looking for an efficient way to take a raw SQL file and execute it synchronously against a Postgres database, akin to running it through psql.
I have an SQL file which creates all databases, imports data, etc. I need to execute this using node.js but cannot find any module which does this automatically. For the node.js application itself, we use node-postgres ('pg'), knex.js and bookshelf.js. I assume though that pg is best suited for this.
One alternative I can think of is to read the full file, split it by semicolons, replace newlines with spaces, trim any duplicate space, then feed the statements into pg one by one so that they're executed sequentially, not concurrently. I'm a little surprised if this is truly the most efficient way, and also that no libraries seem to exist yet to solve this. I'm hesitant to jump into it, seeing as SQL syntax can be a little challenging in itself and I might accidentally mangle it.
Some clarifications in advance:
psql cannot be used as it's not installed on the target machine
I've chosen to develop and source-control the SQL statements in their native SQL form, because it's a lot easier for a DBA to use and manipulate them
You can just separate consecutive queries with a semicolon in the string passed to client.query.
That works:
var pg = require('pg');

pg.connect('postgres://test:test@localhost/test', function(err, client, done){
  client.query('CREATE TABLE test (test VARCHAR(255)); INSERT INTO test VALUES(\'test\') ');
  done();
});
And consequently, that works too:
var pg = require('pg');
var fs = require('fs');

var sql = fs.readFileSync('init_database.sql').toString();

pg.connect('postgres://test:test@localhost/test', function(err, client, done){
  if(err){
    console.log('error: ', err);
    process.exit(1);
  }
  client.query(sql, function(err, result){
    done();
    if(err){
      console.log('error: ', err);
      process.exit(1);
    }
    process.exit(0);
  });
});
I've written the following function which works for my case. It would have been much simpler if it weren't for:
Using batch to manage concurrency
Having the tricky PostgreSQL COPY case to consider
Code snippet:
// Note: assumes `client` (a connected pg client) and `batch` (a task queue used
// to run these functions sequentially) are defined in the surrounding scope.
function processSQLFile(fileName) {
  // Extract SQL queries from files. Assumes no ';' in the fileNames
  var queries = fs.readFileSync(fileName).toString()
    .replace(/(\r\n|\n|\r)/gm," ") // remove newlines
    .replace(/\s+/g, ' ')          // excess white space
    .split(";")                    // split into all statements
    .map(Function.prototype.call, String.prototype.trim)
    .filter(function(el) { return el.length != 0; }); // remove any empty ones

  // Execute each SQL query sequentially
  queries.forEach(function(query) {
    batch.push(function(done) {
      if (query.indexOf("COPY") === 0) { // COPY - needs special treatment
        var regexp = /COPY\ (.*)\ FROM\ (.*)\ DELIMITERS/gmi;
        var matches = regexp.exec(query);
        var table = matches[1];
        var fileName = matches[2];
        var copyString = "COPY " + table + " FROM STDIN DELIMITERS ',' CSV HEADER";
        var stream = client.copyFrom(copyString);
        stream.on('close', function () {
          done();
        });
        var csvFile = __dirname + '/' + fileName;
        var str = fs.readFileSync(csvFile);
        stream.write(str);
        stream.end();
      } else { // Other queries don't need special treatment
        client.query(query, function(err, result) {
          done();
        });
      }
    });
  });
}
Beware that this would fail if you used semicolons anywhere except to terminate SQL statements.
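For example, a statement whose body legitimately contains semicolons, such as this hypothetical plpgsql function, gets chopped into fragments by the naive split:
var tricky = "CREATE FUNCTION bump() RETURNS trigger AS $$ BEGIN NEW.n := NEW.n + 1; RETURN NEW; END; $$ LANGUAGE plpgsql";
console.log(tricky.split(";").length); // 4 fragments instead of 1 statement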
The @databases/pg client supports running SQL files out of the box:
const createPool = require('@databases/pg');
const {sql} = require('@databases/pg');

const db = createPool();

db.query(sql.file('my-file.sql')).catch(ex => {
  console.error(ex);
  process.exitCode = 1;
}).then(() => db.dispose());
It also supports having multiple statements in a single call to db.query:
const createPool = require('@databases/pg');
const {sql} = require('@databases/pg');

const db = createPool();

db.query(sql`
  INSERT INTO users (name) VALUES (${'Forbes'});
  SELECT * FROM users;
`).then(
  results => console.log(results)
).catch(ex => {
  console.error(ex);
  process.exitCode = 1;
}).then(() => db.dispose());
In this example, each statement is run in sequence, and the result of the last statement is returned.
The following, which just reads a file into a string and runs it through query, seems to work for me:
const fs = require("fs");
const { Pool } = require("pg");

const pool = new Pool({ host, port, user, password, database });
const dbClient = await pool.connect();

const sql = fs.readFileSync("/path/to/file.sql", "utf8");
await dbClient.query(sql);
In case it also helps, here is further code to run all "*.sql" files in a directory in alphabetical order:
const pathWithSqlFiles = "/path/to/sqldir";
const filenames = fs
  .readdirSync(pathWithSqlFiles, { withFileTypes: true })
  .filter((item) => !item.isDirectory() && item.name.toLowerCase().endsWith(".sql"))
  .map((item) => item.name);

for (const filename of filenames) {
  const sql = fs.readFileSync(`${pathWithSqlFiles}/${filename}`, "utf8");
  await dbClient.query(sql);
}
(Don't forget to release the client back to the pool at some point after this with dbClient.release(), and to shut the pool down with await pool.end() when you're done with it.)
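Putting those pieces together, here is a minimal sketch of the whole flow with cleanup. It assumes host, port, user, password and database are defined as in the snippets above, and wraps everything in an async function since top-level await isn't always available:
const fs = require("fs");
const { Pool } = require("pg");

async function runSqlDir(pathWithSqlFiles) {
  const pool = new Pool({ host, port, user, password, database });
  const dbClient = await pool.connect();
  try {
    const filenames = fs
      .readdirSync(pathWithSqlFiles, { withFileTypes: true })
      .filter((item) => !item.isDirectory() && item.name.toLowerCase().endsWith(".sql"))
      .map((item) => item.name)
      .sort(); // run in alphabetical order
    for (const filename of filenames) {
      const sql = fs.readFileSync(`${pathWithSqlFiles}/${filename}`, "utf8");
      await dbClient.query(sql);
    }
  } finally {
    dbClient.release(); // return the client to the pool
    await pool.end();   // close all pooled connections
  }
}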
There are many ways to import a database from an SQL file. The simplest and fastest way is to just run this command in your cmd, in the directory where your file is saved:
psql -h localhost -U postgres -d myDataBase -a -f myFile.sql
Or you can read and parse the file through node.js and run it yourself, for example with a function like the processSQLFile shown earlier, but it would take more time.
I am unable to execute SQL when using a global database connection in node.js.
I have followed the steps in the Azure documentation: https://learn.microsoft.com/en-us/azure/mysql/connect-nodejs and am able to display the output on the console. But I want to put all of my Azure SQL database connection code in a separate file, and now the select query is not printing its output on the console.
DatabaseManager.js
var Connection = require('tedious').Connection;
var Request = require('tedious').Request;

var sqlConnection = function sqlConnection() {
  // Create connection to database
  var config = {
    userName: 'uname',
    password: 'password',
    server: 'dbserver.database.windows.net',
    options: {
      database: 'mydatabase',
      encrypt: true
    }
  };

  var connection = new Connection(config);

  // Attempt to connect and execute queries if connection goes through
  connection.on('connect', function(err) {
    if (err) {
      console.log(err);
    } else {
      console.log('CONNECTED TO DATABASE');
    }
  });
};

module.exports = sqlConnection;
app.js
var restify = require('restify');
var builder = require('botbuilder');
var botbuilder_azure = require("botbuilder-azure");
var azure = require('azure-storage');
var dbconnection = require('./DatabaseManager');

bot.dialog('profileDialog',
  (session) => {
    session.send('You reached the profile intent. You said \'%s\'.', session.message.text);
    console.log('Reading rows from the Table...');
    dbconnection("select FNAME from StudentProfile where ID=1"),
      function (err, result, fields) {
        if (err) throw err;
        console.log(result);
      }
    session.endDialog();
  }
Console Output:
Reading rows from the Table...
CONNECTED TO DATABASE
I was expecting the output of FNAME, but nothing is printing on the console. Is there anything I am missing?
Thank you.
There's a couple of problems here. First off, you should only ever import a module once per file. This is just a performance consideration and won't actually break your code.
Next, pay attention to what you're exporting from your DatabaseManager module. Right now, you're exporting a function that creates the connection and then doesn't do anything with it. We can fix this by using a pattern called a "callback" which lets us provide a function that will then be called with the connection as an argument.
I added a ton of comments to the code explaining things. This code won't run as-is; there are a couple of places where I show "do this or this", and you'll have to choose one.
var Tedious = require('tedious'); // Only require a library once per file
var Connection = Tedious.Connection;
var Request = Tedious.Request;

// Or using destructuring assignment
var { Connection, Request } = require('tedious');

// You called this `sqlConnection`. I'm going to use a verb since it's a
// function and not a variable containing the connection. I'm also going
// to change the declaration syntax to be clearer.
function connect(cb) { // cb is short for callback. It should be a function.
  var config = {
    userName: 'uname',
    password: 'password',
    server: 'dbserver.database.windows.net',
    options: {
      database: 'mydatabase',
      encrypt: true
    }
  }; // Put a semi-colon on your variable assignments

  var connection = new Connection(config);

  // Attempt to connect and execute queries if connection goes through
  connection.on('connect', function(err) {
    if (err) {
      console.log(err);
      return; // Stop executing the function if it failed
    }
    // We don't need an "else" because of the return statement above
    console.log('CONNECTED TO DATABASE');

    // We have a connection, now let's do something with it. Call the
    // callback and pass it the connection.
    cb(connection);
  });
}

module.exports = connect; // This exports a function that creates the connection
Then back in your main file, you can use it like so.
var restify = require('restify');
var builder = require('botbuilder');
var botbuilder_azure = require('botbuilder-azure');
var azure = require('azure-storage');
var connect = require('./DatabaseManager'); // renamed to be a verb since it's a function.

bot.dialog('profileDialog', (session) => { // Hey, this is a callback too!
  session.send('You reached the profile intent. You said \'%s\'.', session.message.text);
  console.log('Creating a connection');

  connect((connection) => {
  // or with the traditional function notation
  connect(function(connection) {
    console.log('Reading rows from the Table...');
    // Execute your queries here using your connection. This code is
    // taken from
    // https://github.com/tediousjs/tedious/blob/master/examples/minimal.js
    var request = new Request("select FNAME from StudentProfile where ID=1", function(err, rowCount) { // Look another callback!
      if (err) {
        console.log(err);
      } else {
        console.log(rowCount + ' rows');
      }
      connection.close();
    });

    request.on('row', function(columns) { // Iterate through the rows using a callback
      columns.forEach(function(column) {
        if (column.value === null) {
          console.log('NULL');
        } else {
          console.log(column.value);
        }
      });
    });

    connection.execSql(request);
  });
I have this code:
#!/usr/bin/env node
'use strict';

const request = require('superagent');
const querystring = require('querystring');
const path = require('path');
const fs = require('fs');

const timestamp = Math.floor(Date.now() / 1000).toString();
const logdir = path.join(__dirname, '../web/public/data/');
const logfile = path.join(logdir, timestamp + '.json');

// newsapi.org api key
const NEWSAPI_KEY = process.env.NEWSAPI_KEY;
// sources endpoint
const SOURCES = 'https://newsapi.org/v1/sources?language=en';
// articles endpoint
const ARTICLE_ENDPOINT = 'https://newsapi.org/v1/articles?';

function getLatest(src){
  let sb = ['top','latest','popular'];
  for (let i in sb){
    let qs = querystring.stringify({source: src, sortBy: sb[i], apiKey: NEWSAPI_KEY});
    request
      .get(ARTICLE_ENDPOINT + qs)
      .end((err, res) => {
        if (!err){
          fs.appendFile(logfile, JSON.stringify(res.body.articles), (err) => {
          });
        }
      });
  }
}

request
  .get(SOURCES)
  .end((err, res) => {
    if (!err){
      for ( var i in res.body.sources){
        getLatest(res.body.sources[i].id);
      }
    }
  });
The file that this code writes ends up looking like this:
[{...},{...},{...}][{...},{...},{...}]...
This is invalid JSON. How would I make sure that it's valid, like this?
[{...},{...},{...},{...},{...},{...}]
I've tried so many things to make it work, including applying a regex to the entire file after the fact, wrapping superagent in a function that returns a promise, and editing the file after the fact in the code that relies on this script to run. All to no avail. I know there must be a better (proper?) way to do this.
Use regex to find pattern "][" and replace it with ","
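A minimal sketch of that post-processing idea, reusing the logfile path from the question (note this is fragile if "][" ever appears inside the article text itself):
const fs = require('fs');
const raw = fs.readFileSync(logfile, 'utf8');
fs.writeFileSync(logfile, raw.replace(/\]\[/g, ',')); // join the concatenated arrays into one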
Rather than appending to the file with fs.appendFile, create an array before the loop, append to that and then serialise it.
function getLatest(src){
  let sb = ['top','latest','popular'];
  let reqs = sb.length;
  let allArticles = [];
  for (let i in sb){
    let qs = querystring.stringify({source: src, sortBy: sb[i], apiKey: NEWSAPI_KEY});
    request
      .get(ARTICLE_ENDPOINT + qs)
      .end((err, res) => {
        if (!err){
          allArticles.push(res.body.articles);
        }
        else{
          // error happened, decrement the number of requests needed for success.
          reqs--;
        }
        // if the number of responses is as expected, flatten the per-request
        // arrays and write the json all at once.
        if (allArticles.length == reqs)
          fs.writeFile(logfile, JSON.stringify([].concat(...allArticles)), (err) => {});
      });
  }
}
The problem is that you were serialising each response individually and appending it to the file, rather than collecting the data, serialising it in one go and then writing the file once.
I have a script that I want to run on a scheduled basis in node. The script is not terminating and exiting. I suspect that this is because my database client is still open.
var client = new pg.Client(conString);
client.connect();

function registerBundle (innerHash, outterHash) {
  // some stuff here
}

var query = client.query("SELECT id, chain FROM mytable where \
  state_ready = true and transaction_id='' ");

query.on('row', function(row) {
  var chain = row['chain'];
  var pg_record = row['id'];
  console.log(pg_record);

  var innerHash = "something";
  var outerHash = "something else";
  var registrar = registerBundle(innerHash, outerHash);

  var update = client.query('UPDATE mytable SET transaction_id = $1::text \
    where id=$2::int', [transactionHash, pg_record]);

  console.log(chain);
});
If I include the following, the client connection closes before the updates have had time to fire.
query.on('end', function() {
  client.end();
});
I cannot use setTimeout or any other such mechanism because I don't know how long to wait for the registerBundle function to complete. Also, I think query.on('end') will fire when the update is completed; I'm not sure how to test this.
My question: I need things to fire in order:
Query DB
Process each row (query.on('row'))
Update each row with value returned from registerBundle
Close db client/connection when all rows have been processed.
Terminate script and exit node
Seems pretty straightforward from a python/php world but falls apart in my javascript world.
A promise-based interface like pg-promise is the way to go:
var bluebird = require('bluebird');
var pgp = require('pg-promise')({
  promiseLib: bluebird
});

var db = pgp(/*connection details*/);

db.tx(t => {
    // BEGIN executed
    return t.map('SELECT id, chain FROM mytable where state_ready = $1 and transaction_id = $2', [true, 123], data => {
      var chain = data.chain;
      var pg_record = data.id;
      return t.none('UPDATE mytable SET transaction_id = $1::text where id=$2::int', [transactionHash, pg_record]);
    }).then(t.batch); // settling all internal queries
  })
  .then(data => {
    // success, COMMIT executed
  })
  .catch(error => {
    // error, ROLLBACK executed
  })
  .finally(pgp.end); // shuts down the connection pool
The example above does exactly what you asked for, plus it uses a transaction. But in reality you're gonna want to do it all in one query, for performance reasons ;)
See more examples.
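For illustration only, a hypothetical single-query version, under the strong assumption that one transaction id value applies to every matching row (in the original code the value comes from registerBundle per row, so this is not a drop-in replacement):
// Hypothetical: collapse the per-row loop into one UPDATE.
db.none(
  "UPDATE mytable SET transaction_id = $1::text WHERE state_ready = true AND transaction_id = ''",
  [newTransactionId] // assumed: a single value valid for all rows
)
.finally(pgp.end);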
I have about a million JSON files saved across many sub-directories of the directory "D:/njs/nodetest1/imports/source1/" and I want to import them into the collection "users" in my mongoDB database.
The following code correctly traverses the file system. As you can see, it reads each item in the directory and, if that item is a directory, it reads each item in it. For each item that is not a directory, it performs some operations on it before passing a variable holding an object to a function.
function traverseFS (path){
  var files = fs.readdirSync(path);
  for (var i in files){
    var currentFile = path + '/' + files[i];
    var stats = fs.statSync(currentFile);
    if (stats.isFile())
      runOnFile(currentFile);
    else
      traverseFS(currentFile);
  }
}

traverseFS("D:/njs/nodetest1/imports/source1/")
Next, I run a few operations on each file (see below). This reads the file, parses it into a JSON object, reads two attributes of that object into variables, creates an object in the variable "entry" and passes that variable to another function.
function runOnFile(currentFile){
  var fileText = fs.readFileSync(currentFile,'utf8');
  var generatedJSON = JSON.parse(fileText);
  var recordID = generatedJSON.recordID;
  var recordText = generatedJSON.recordTexts;
  var entry = {recordID: recordID, recordText: recordText};
  insertRecord(entry);
}
The final function should then insert the data into mongoDB. I think that this is where things go wrong.
function insertRecord(entry){
  var MongoClient = mongodb.MongoClient;
  var MongoURL = 'mongodb://localhost:27017/my_database_name';
  MongoClient.connect(MongoURL, function (err, db) {
    var collection = db.collection('users');
    collection.insert([entry], function (err, result) {
      db.close();
    });
  });
}
I expected this to run through the file structure, reading the JSON files into objects and then inserting those objects into my mongoDB. Instead it reads the first file into the database and then stops/hangs.
Notes:
I don't want to use mongoimport because I don't want to insert all the data from these files into my MongoDB database. I however am not tied to any aspect of this approach. If some other solution exists I am open to it.
This connects to the database just fine. For each item in the directory this successfully creates an "entry" object and passes it to the insertRecord function. In other words, the problem must be occurring in the insertRecord section. But it obviously could be caused by something earlier in the process.
If I add error handling, no errors are produced. I have left the error handling out of this post because it clutters the readability of the code snippets.
As per the mongodb 2.2 driver (current latest) documentation, insert is deprecated:
DEPRECATED
Use insertOne, insertMany or bulkWrite
So the short answer is probably to change collection.insert([entry], ...) to collection.insertOne(entry, ...) and you're done.
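Applied to the insertRecord function from the question, that minimal change might look like this (same connection handling as the original, only the insert call swapped):
function insertRecord(entry){
  var MongoClient = mongodb.MongoClient;
  var MongoURL = 'mongodb://localhost:27017/my_database_name';
  MongoClient.connect(MongoURL, function (err, db) {
    var collection = db.collection('users');
    // insertOne takes a single document instead of an array
    collection.insertOne(entry, function (err, result) {
      db.close();
    });
  });
}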
Then for the long answer: you say "about a million JSON files", which typically deserves a fully async approach with the least amount of overhead.
There are two (potential) bottlenecks in the sample code:
fs.readFileSync, this is a blocking operation
the connecting, inserting a record and closing the database connection
Both are executed about a million times. Granted, an import is not usually done over and over again, and (hopefully) not on a machine which needs its performance for other important tasks. Still, the sample code can easily be made more robust.
Consider using the glob module to obtain the list of JSON files.
glob('imports/**/*.json', function(error, files) {...})
This provides you with the full list of files easily in an async fashion.
Then consider connecting to the database just once, insert everything and close once.
Maintaining more or less the same steps you have in the sample, I'd suggest something like:
var glob = require('glob'),
  mongodb = require('mongodb'),
  fs = require('fs'),
  MongoClient = mongodb.MongoClient,
  mongoDSN = 'mongodb://localhost:27017/my_database_name',
  collection; // moved this to the "global" scope so we can do it only once

function insertRecord(json, done) {
  var recordID = json.recordID || null,
    recordText = json.recordText || null;

  // the question implies some kind of validation/sanitation/preparation..
  if (recordID && recordText) {
    // NOTE: insert was changed to insertOne
    return collection.insertOne({recordID: recordID, recordText: recordText}, done);
  }

  done('No recordID and/or recordText');
}

function runOnFile(file, done) {
  // moved to be async
  fs.readFile(file, function(error, data) {
    if (error) {
      return done(error);
    }

    var json = JSON.parse(data);
    if (!json) {
      return done('Unable to parse JSON: ' + file);
    }

    insertRecord(json, done);
  });
}

function processFiles(files, done) {
  var next = files.length ? files.shift() : null;

  if (next) {
    return runOnFile(next, function(error) {
      if (error) {
        console.error(error);
        // you may or may not want to stop here by throwing an Error
      }

      processFiles(files, done);
    });
  }

  done();
}

MongoClient.connect(mongoDSN, function(error, db) {
  if (error) {
    throw new Error(error);
  }

  collection = db.collection('users');

  glob('imports/**/*.json', function(error, files) {
    if (error) {
      throw new Error(error);
    }

    processFiles(files, function() {
      console.log('all done');
      db.close();
    });
  });
});
NOTE: You can collect multiple "entry"-records to leverage the performance gain of multiple inserts using insertMany, though I have the feeling the inserted records are more complicated than described and it might give some memory issues if not handled correctly.
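A rough sketch of that batching idea, assuming the same global collection variable; the queueRecord/flushRecords helpers and the chunk size are hypothetical, not part of the original answer:
// Hypothetical batching helpers: queue entries and flush them with insertMany.
var BATCH_SIZE = 1000;
var pending = [];

function queueRecord(entry, done) {
  pending.push(entry);
  if (pending.length >= BATCH_SIZE) {
    // drain the queue and insert the whole chunk in one round trip
    return collection.insertMany(pending.splice(0, pending.length), done);
  }
  done();
}

function flushRecords(done) {
  if (!pending.length) return done();
  collection.insertMany(pending.splice(0, pending.length), done);
}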
Just structure your data into one big array of objects, then run db.collection.insertMany.
I suggest doing this using Promises:
const Bluebird = require('bluebird');
const glob = Bluebird.promisify(require('glob'));
const mongodb = require('mongodb');
const fs = Bluebird.promisifyAll(require('fs'));
const Path = require('path');
const MongoClient = mongodb.MongoClient;

const insertMillionsFromPath = Bluebird.coroutine(function *(path, mongoConnString) {
  const db = yield MongoClient.connect(mongoConnString);
  try {
    const collection = db.collection('users');
    const files = yield glob(Path.join(path, "*.json"));
    yield Bluebird.map(
      files,
      Bluebird.coroutine(function *(filename) {
        console.log("reading", filename);
        const fileContent = yield fs.readFileAsync(filename);
        const obj = JSON.parse(fileContent);
        console.log("inserting", filename);
        yield collection.insertOne(obj);
      }),
      {concurrency: 10} // You can increase concurrency here
    );
  } finally {
    yield db.close();
  }
});

insertMillionsFromPath("./myFiles", "mongodb://localhost:27017/database")
  .then(() => console.log("OK"))
  .catch((err) => console.log("ERROR", err));
In order to work, you will need to install the following packages:
npm install --save mongodb bluebird glob
and you will need to use node.js version 6 or greater, otherwise you will need to transpile your javascript (due to function *() generators usage).
I have a situation where I need to create n subprocesses. For each subprocess I need to provide stdin data and an expected output; the subprocess counts as a success if the output it produces matches the expected output. If all such subprocesses succeed, then a status needs to be sent to the user. How can I do this in node.js in a non-blocking way?
Promises!
I personally use Bluebird, and here is an example that uses it too.
I hope you understand it; feel free to ask if you do not :-)
var Promise = require('bluebird')
var exec = require('child_process').exec

// Array with input/output pairs
var data = [
  ['input1', 'output1'],
  ['input2', 'output2'],
  ...
]

var PROGRAM = 'cat'

Promise.some(data.map(function(v) {
  var input = v[0]
  var output = v[1]

  // Return the promise so Promise.some can track it
  return new Promise(function(yell, cry) {
    // Yes it is ugly, but exec just saves many lines here
    exec('echo "' + input + '" | ' + PROGRAM, function(err, stdout) {
      if(err) return cry(err)
      yell(stdout)
    })
  }).then(function(out) {
    if(out !== output) throw new Error('Output did not match!')
  })
}), data.length) // Require them all to succeed
.then(function() {
  // Send success to the user
}).catch(function() {
  // Send failure to the user
})