Bulk convert XML to JSON with particular output - javascript

How do I convert approximately 3000 XML files into JSON files using node?
I've been able to get the script below to convert a single XML file to a JSON file in the format that I want, and I've been attempting to promisify the script using Bluebird, but I keep getting errors. I've been able to get the script to list the filenames, but then I get the error "Unhandled rejection Error: ENOENT: no such file or directory, open 'journal-article-10.2307_357359.xml'"
var Promise = require('bluebird');
var fs = require('fs');
var convert = require('xml-js');

fs.readdirAsync = function (dirname) {
  return new Promise(function (resolve, reject) {
    fs.readdir(dirname, function (err, filenames) {
      if (err)
        reject(err);
      else
        resolve(filenames);
    });
  });
};

fs.readFileAsync = function (filename, enc) {
  return new Promise(function (resolve, reject) {
    fs.readFile(filename, enc, function (err, data) {
      if (err)
        reject(err);
      else
        resolve(data);
    });
  });
};

function getFile(filename) {
  return fs.readFileAsync(filename, 'utf8');
}
fs.readdirAsync('./metadata/').then(function (filenames) {
  console.log(filenames);
  return Promise.all(filenames.map(getFile));
}).then(function (files) {
  files.forEach(function (files) {
    function nativeType(value) {
      var nValue = Number(value);
      if (!isNaN(nValue)) {
        return nValue;
      }
      var bValue = value.toLowerCase();
      if (bValue === 'true') {
        return true;
      } else if (bValue === 'false') {
        return false;
      }
      return value;
    }
    var removeJsonTextAttribute = function (value, parentElement) {
      try {
        var keyNo = Object.keys(parentElement._parent).length;
        var keyName = Object.keys(parentElement._parent)[keyNo - 1];
        parentElement._parent[keyName] = nativeType(value);
      } catch (e) {}
    };
    var options = {
      compact: true,
      trim: true,
      ignoreDeclaration: true,
      ignoreInstruction: true,
      ignoreAttributes: true,
      ignoreComment: true,
      ignoreCdata: true,
      ignoreDoctype: true,
      textFn: removeJsonTextAttribute,
      spaces: 2
    };
    fs.writeFile("./json/" + fileaname + ".json", convert.xml2json(options));
  });
});
I would like to be able to convert the entire folder of XML files to JSON (to upload to couchDB).

ENOENT is a standard POSIX error code that means the path to the file is not found. You've tried to open the name of a file or directory that doesn't exist. In this case, fs.readdir returns names that are not fully qualified file names, so you'll need to prefix them with the path that you gave it, specifically: './metadata/'. The error message that you're seeing tells you the file that was opened: journal-article-10.2307_357359.xml, but in this case, you probably want to open ./metadata/journal-article-10.2307_357359.xml.
You can see this with the following simple example:
# Create a dummy directory named `garbage` that contains only 3 entries
$ mkdir -p garbage/{foo,bar,baz}
# Run `node` interactively
$ node
> const fs = require('fs');
undefined
> fs.readdirSync('./garbage')
[ 'bar', 'baz', 'foo' ]
It doesn't make sense to open 'bar' since that doesn't exist. You'll need to open './garbage/bar' for it to work correctly.
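Applied to the question's script, one fix is to join the directory onto each name before reading it. A minimal sketch, changing only the mapping over filenames:
var path = require('path');

fs.readdirAsync('./metadata/').then(function (filenames) {
  // Prefix each bare filename with the directory it was listed from.
  return Promise.all(filenames.map(function (name) {
    return getFile(path.join('./metadata/', name));
  }));
})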

Related

How could I check if a ZIP file is corrupted in NodeJS?

I would like to check whether a ZIP file is corrupted in Node.js, using as little CPU and memory as possible.
How to corrupt a ZIP file:
Download a ZIP file
Open the ZIP file using a text editor like Notepad++
Rewrite the header. Only put random characters.
I am trying to reach this goal using the NPM library "node-stream-zip":
private async assertZipFileIntegrity(path: string) {
  try {
    const zip = new StreamZip.async({ file: path });
    const stm = await zip.stream(path);
    stm.pipe(process.stdout);
    stm.on('end', () => zip.close());
  } catch (error) {
    throw new Error();
  }
}
However, when I run the unit tests I receive an error inside an array:
Rejected to value: [Error]
import zip from 'yauzl';
import path from 'path';

const invalidZipPath = path.resolve('invalid.zip');
const validZipPath = path.resolve('valid.zip');

const isValidZipFile = (filePath) => {
  return zip.open(filePath, { lazyEntries: true }, (err, stream) => {
    if (err) {
      console.log('fail to read ', filePath);
      return false;
    }
    console.log('success read ', filePath);
    return true;
  });
}

isValidZipFile(validZipPath);
isValidZipFile(invalidZipPath);
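Note that the return values inside the yauzl callback are discarded, so isValidZipFile never actually reports a result to its caller. A minimal sketch that wraps the same yauzl.open call in a Promise so the check can be awaited (a successful open only validates the archive's central directory, not every entry's data):
import yauzl from 'yauzl';

const isValidZipFile = (filePath) =>
  new Promise((resolve) => {
    yauzl.open(filePath, { lazyEntries: true }, (err, zipfile) => {
      if (err) {
        // The central directory could not be parsed: treat the archive as corrupted.
        resolve(false);
        return;
      }
      zipfile.close();
      resolve(true);
    });
  });

// Usage: const ok = await isValidZipFile(validZipPath);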

Using promises in Node.js

I'm trying to use a promise with this code:
//Listando arquivos (listing files)
app.post('/readList', function(req, res) {
  var cleared = false
  var readList = new Promise(function(resolve, reject){
    fs.readdir(req.body.path, (err, files) => {
      files.forEach(file => {
        console.log(file)
        var fileDetail = {
          name: '',
          local: true,
          filetype: 'fas fa-folder-open',
          filepath: '',
          isFile: false
        }
        if(!cleared){
          listedFiles = []
          cleared = true
        }
        fileDetail.name = file
        fileDetail.filepath = req.body.path + file
        fs.stat(req.body.path + file, function(err, stats) {
          fileDetail.isFile = stats.isFile()
          if(stats.isFile()) fileDetail.filetype = 'far fa-file-alt'
          else fileDetail.filetype = 'fas fa-folder-open'
        })
        listedFiles.push(fileDetail)
      })
    })
  })
  readList.then(
    console.log('vorta'),
    res.end(JSON.stringify(listedFiles))
  )
})
I've put in this line to show the items being listed:
console.log(file)
And added this line to execute after the promise:
readList.then(
  console.log('vorta'),
  res.end(JSON.stringify(listedFiles))
)
I don't know where the mistake is, but the console is showing 'vorta' before the file names.
What am I doing wrong?
Here you're passing two params:
readList.then(
  // #1 Here you're executing the log function immediately, which is why the message gets printed right away.
  console.log('vorta'),
  res.end(JSON.stringify(listedFiles)) // #2
)
So, you need to pass a function
readList.then(function() {
  console.log('vorta');
  res.end(JSON.stringify(listedFiles));
})
Further, you need to call the resolve function within the async logic.
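Applied to the code in the question, that means resolving inside the fs.readdir callback once listedFiles has been built. A minimal sketch (the fs.stat calls in the original are still asynchronous, which the next answer deals with):
var readList = new Promise(function (resolve, reject) {
  fs.readdir(req.body.path, function (err, files) {
    if (err) return reject(err);
    // ... build listedFiles from files as in the original code ...
    resolve(listedFiles);
  });
});

readList.then(function (listedFiles) {
  console.log('vorta');
  res.end(JSON.stringify(listedFiles));
});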
As I said in my earlier comment, there are at least four problems here:
You aren't calling resolve(listedFiles) to resolve the promise so its .then() handler is never called
You need to pass a single function to .then()
You have no error handling for your async operations
You seem to be assuming that fs.stat() is synchronous when it is not
The best way to attack this problem is to promisify all your asynchronous functions and then use promises for controlling the flow and the error handling. Here's a way to fix all of these issues:
const util = require('util');
const fs = require('fs');
const readdirAsync = util.promisify(fs.readdir);
const statAsync = util.promisify(fs.stat);

//Listando arquivos (listing files)
app.post('/readList', function(req, res) {
  // add code here to sanitize req.body.path so it can only
  // point to a specific sub-directory that is intended for public consumption
  readdirAsync(req.body.path).then(files => {
    return Promise.all(files.map(file => {
      let fileDetail = {
        name: file,
        local: true,
        filepath: req.body.path + file
      };
      return statAsync(fileDetail.filepath).then(stats => {
        fileDetail.isFile = stats.isFile();
        fileDetail.filetype = fileDetail.isFile ? 'far fa-file-alt' : 'fas fa-folder-open';
        return fileDetail;
      });
    }));
  }).then(listedFiles => {
    res.json(listedFiles);
  }).catch(err => {
    console.log(err);
    res.sendStatus(500);
  });
});
FYI, this is kind of a dangerous implementation because it lists files on ANY path that the user passes in, so any outsider can see the entire file listing on your server's hard drive. It could even list network-attached drives.
You should be limiting the scope of req.body.path to a specific file hierarchy that is intended for public consumption.
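One way to do that, sketched below with the hypothetical names PUBLIC_ROOT and resolvePublicPath, is to resolve the requested path against a fixed base directory and reject anything that escapes it:
const path = require('path');

// Hypothetical guard: only allow paths inside this directory.
const PUBLIC_ROOT = path.resolve('./public-files');

function resolvePublicPath(requestedPath) {
  const resolved = path.resolve(PUBLIC_ROOT, requestedPath);
  // Reject anything that escapes PUBLIC_ROOT (e.g. via "..").
  if (resolved !== PUBLIC_ROOT && !resolved.startsWith(PUBLIC_ROOT + path.sep)) {
    throw new Error('Path outside of the public directory');
  }
  return resolved;
}

// Usage inside the route handler: readdirAsync(resolvePublicPath(req.body.path))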
You need to pass a function to then.
As it stands, you are calling log and end immediately and passing their return values.
Here is a working copy of your code. I have made a few changes that you can omit, since they are only there to give you working code:
var express = require('express');
var fs = require('fs');
var app = express();

app.post('/readList', function(req, res) {
  // Assuming sample data coming in req.body
  // Remove this when you run it on your side
  req.body = {
    path: 'temp_dir'
  };
  var cleared = false;
  var listedFiles = [];

  // Promisifying readList :)
  function readList() {
    return new Promise(function (resolve, reject) {
      // Suppose req.body.path is 'temp_dir' and it has 2 files
      fs.readdir(req.body.path, (err, files) => {
        console.log(files);
        // in the following commented code you can do what you need
        /*files.forEach(file => {
          console.log(file);
          var fileDetail = {
            name: '',
            local: true,
            filetype: 'fas fa-folder-open',
            filepath: '',
            isFile: false
          }
          if (!cleared) {
            listedFiles = [];
            cleared = true;
          }
          // really? you can think of it later!!
          fileDetail.name = file;
          fileDetail.filepath = req.body.path + file
          // I changed this below to avoid surprises for you in the data!
          const stats = fs.statSync(req.body.path + file);
          fileDetail.isFile = stats.isFile();
          if (stats.isFile())
            fileDetail.filetype = 'far fa-file-alt';
          else
            fileDetail.filetype = 'fas fa-folder-open'
          listedFiles.push(fileDetail);
        });*/
        resolve(listedFiles);
      });
    });
  }

  readList().then((data) => {
    console.log('vorta');
    // data here contains the same data as listedFiles, so choose your way of doing it; I would recommend going with data
    res.send(JSON.stringify(listedFiles)); // fine
    // res.send(JSON.stringify(data)); // better and recommended
  });
})

app.listen(process.env.PORT || 3000);
console.log('Listening on port 3000');
console.log('Listening on port 3000');

Node.js read a file in a zip without unzipping it

I have a zip file (actually it's an epub file). I need to loop through the files in it and read them without unzipping them to disk.
I tried to use a Node.js library called JSZip, but the content of each file is stored in memory in a Buffer, and whenever I try to decode the buffer content to a string, the returned content is unreadable.
Here's the code I tried:
const zip = new JSZip();

// read a zip file
fs.readFile(epubFile, function (err, data) {
  if (err) throw err;
  zip.loadAsync(data).then(function (zip) {
    async.eachOf(zip.files, function (content, fileName, callback) {
      if (fileName.match(/json/)) {
        var buf = content._data.compressedContent;
        console.log(fileName);
        console.log((new Buffer(buf)).toString('utf-8'));
      }
      callback();
    }, function (err) {
      if (err) {
        console.log(err);
      }
    });
  });
});
Since unzip seems to be abandoned, I used node-stream-zip with pretty good success.
npm install node-stream-zip
Reading files be all like:
const StreamZip = require('node-stream-zip');
const zip = new StreamZip({
  file: 'archive.zip',
  storeEntries: true
});

zip.on('ready', () => {
  // Take a look at the files
  console.log('Entries read: ' + zip.entriesCount);
  for (const entry of Object.values(zip.entries())) {
    const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
    console.log(`Entry ${entry.name}: ${desc}`);
  }

  // Read a file in memory
  let zipDotTxtContents = zip.entryDataSync('path/inside/zip.txt').toString('utf8');
  console.log("The content of path/inside/zip.txt is: " + zipDotTxtContents);

  // Do not forget to close the file once you're done
  zip.close();
});
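If an entry is too large to read into memory with entryDataSync, the same library can stream it instead. A sketch (also to be run once the 'ready' event has fired, using the entry path from the example above):
// Stream a single entry instead of buffering it; close the archive when done.
zip.stream('path/inside/zip.txt', (err, stm) => {
  if (err) throw err;
  stm.pipe(process.stdout);
  stm.on('end', () => zip.close());
});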
npm install unzip
https://www.npmjs.com/package/unzip
fs.createReadStream('path/to/archive.zip')
  .pipe(unzip.Parse())
  .on('entry', function (entry) {
    var fileName = entry.path;
    var type = entry.type; // 'Directory' or 'File'
    var size = entry.size;
    if (fileName === "this IS the file I'm looking for") {
      entry.pipe(fs.createWriteStream('output/path'));
    } else {
      entry.autodrain();
    }
  });
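If the goal is to read the entry's contents without writing to disk (as in the question), each entry is a readable stream, so its chunks can be collected into a buffer instead of piping to a file. A sketch using the same unzip.Parse() pipeline shown above:
fs.createReadStream('path/to/archive.zip')
  .pipe(unzip.Parse())
  .on('entry', function (entry) {
    if (entry.path === "this IS the file I'm looking for") {
      // Collect the entry's chunks in memory instead of writing them to disk.
      var chunks = [];
      entry.on('data', function (chunk) { chunks.push(chunk); });
      entry.on('end', function () {
        console.log(Buffer.concat(chunks).toString('utf8'));
      });
    } else {
      entry.autodrain();
    }
  });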

Looping through files in a folder Node.JS

I am trying to loop through and pick up the files in a directory, but I'm having some trouble implementing it. How do I pull in multiple files and then move them to another folder?
var dirname = 'C:/FolderwithFiles';

console.log("Going to get file info!");
fs.stat(dirname, function (err, stats) {
  if (err) {
    return console.error(err);
  }
  console.log(stats);
  console.log("Got file info successfully!");

  // Check file type
  console.log("isFile ? " + stats.isFile());
  console.log("isDirectory ? " + stats.isDirectory());
});
Older answer with callbacks
You want to use the fs.readdir function to get the directory contents and the fs.rename function to actually do the renaming. Both of these functions have synchronous versions if you need to wait for them to finish before running the code afterwards.
I wrote a quick script that does what you described.
var fs = require('fs');
var path = require('path');
// In newer Node.js versions where process is already global this isn't necessary.
var process = require("process");

var moveFrom = "/home/mike/dev/node/sonar/moveme";
var moveTo = "/home/mike/dev/node/sonar/tome"

// Loop through all the files in the temp directory
fs.readdir(moveFrom, function (err, files) {
  if (err) {
    console.error("Could not list the directory.", err);
    process.exit(1);
  }

  files.forEach(function (file, index) {
    // Make one pass and make the file complete
    var fromPath = path.join(moveFrom, file);
    var toPath = path.join(moveTo, file);

    fs.stat(fromPath, function (error, stat) {
      if (error) {
        console.error("Error stating file.", error);
        return;
      }

      if (stat.isFile())
        console.log("'%s' is a file.", fromPath);
      else if (stat.isDirectory())
        console.log("'%s' is a directory.", fromPath);

      fs.rename(fromPath, toPath, function (error) {
        if (error) {
          console.error("File moving error.", error);
        } else {
          console.log("Moved file '%s' to '%s'.", fromPath, toPath);
        }
      });
    });
  });
});
Tested on my local machine.
node testme.js
'/home/mike/dev/node/sonar/moveme/hello' is a file.
'/home/mike/dev/node/sonar/moveme/test' is a directory.
'/home/mike/dev/node/sonar/moveme/test2' is a directory.
'/home/mike/dev/node/sonar/moveme/test23' is a directory.
'/home/mike/dev/node/sonar/moveme/test234' is a directory.
Moved file '/home/mike/dev/node/sonar/moveme/hello' to '/home/mike/dev/node/sonar/tome/hello'.
Moved file '/home/mike/dev/node/sonar/moveme/test' to '/home/mike/dev/node/sonar/tome/test'.
Moved file '/home/mike/dev/node/sonar/moveme/test2' to '/home/mike/dev/node/sonar/tome/test2'.
Moved file '/home/mike/dev/node/sonar/moveme/test23' to '/home/mike/dev/node/sonar/tome/test23'.
Moved file '/home/mike/dev/node/sonar/moveme/test234' to '/home/mike/dev/node/sonar/tome/test234'.
Update: fs.promises functions with async/await
Inspired by ma11hew28's answer (shown here), here is the same thing as above but with the async functions in fs.promises. As noted by ma11hew28, this may have memory limitations versus fs.promises.opendir added in v12.12.0.
Quick code below.
//jshint esversion:8
//jshint node:true
const fs = require( 'fs' );
const path = require( 'path' );

const moveFrom = "/tmp/movefrom";
const moveTo = "/tmp/moveto";

// Make an async function that gets executed immediately
(async () => {
  // Our starting point
  try {
    // Get the files as an array
    const files = await fs.promises.readdir( moveFrom );

    // Loop them all with the new for...of
    for( const file of files ) {
      // Get the full paths
      const fromPath = path.join( moveFrom, file );
      const toPath = path.join( moveTo, file );

      // Stat the file to see if we have a file or dir
      const stat = await fs.promises.stat( fromPath );

      if( stat.isFile() )
        console.log( "'%s' is a file.", fromPath );
      else if( stat.isDirectory() )
        console.log( "'%s' is a directory.", fromPath );

      // Now move async
      await fs.promises.rename( fromPath, toPath );

      // Log because we're crazy
      console.log( "Moved '%s'->'%s'", fromPath, toPath );
    } // End for...of
  }
  catch( e ) {
    // Catch anything bad that happens
    console.error( "We've thrown! Whoops!", e );
  }
})(); // Wrap in parenthesis and call now
fs.readdir(path[, options], callback) (which Mikey A. Leonetti used in his answer) and its variants (fsPromises.readdir(path[, options]) and fs.readdirSync(path[, options])) each reads all of a directory's entries into memory at once. That's good for most cases, but if the directory has very many entries and/or you want to lower your application's memory footprint, you could instead iterate over the directory's entries one at a time.
Asynchronously
Directories are async iterable, so you could do something like this:
const fs = require('fs')

async function ls(path) {
  const dir = await fs.promises.opendir(path)
  for await (const dirent of dir) {
    console.log(dirent.name)
  }
}

ls('.').catch(console.error)
Or, you could use dir.read() and/or dir.read(callback) directly.
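For example, a sketch of the same listing using dir.read() directly (unlike the async iterator, the directory handle has to be closed explicitly here):
const fs = require('fs')

async function ls(path) {
  const dir = await fs.promises.opendir(path)
  try {
    let dirent
    // dir.read() resolves with the next Dirent, or null once the directory is exhausted.
    while ((dirent = await dir.read()) !== null) {
      console.log(dirent.name)
    }
  } finally {
    await dir.close()
  }
}

ls('.').catch(console.error)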
Synchronously
Directories aren't sync iterable, but you could use dir.readSync() directly. For example:
const fs = require('fs')

const dir = fs.opendirSync('.')
let dirent
while ((dirent = dir.readSync()) !== null) {
  console.log(dirent.name)
}
dir.closeSync()
Or, you could make directories sync iterable. For example:
const fs = require('fs')

function makeDirectoriesSyncIterable() {
  const p = fs.Dir.prototype
  if (p.hasOwnProperty(Symbol.iterator)) { return }
  const entriesSync = function* () {
    try {
      let dirent
      while ((dirent = this.readSync()) !== null) { yield dirent }
    } finally { this.closeSync() }
  }
  if (!p.hasOwnProperty(entriesSync)) { p.entriesSync = entriesSync }
  Object.defineProperty(p, Symbol.iterator, {
    configurable: true,
    enumerable: false,
    value: entriesSync,
    writable: true
  })
}
makeDirectoriesSyncIterable()
And then, you could do something like this:
const dir = fs.opendirSync('.')
for (const dirent of dir) {
  console.log(dirent.name)
}
Note: "In busy processes, use the asynchronous versions of these calls. The synchronous versions will block the entire process until they complete, halting all connections."
References:
Node.js Documentation: File System: Class fs.Dir
Node.js source code: fs.Dir
GitHub: nodejs/node: Issues: streaming / iterative fs.readdir #583
Read all folders in a directory
const fs = require("fs");

const readAllFolder = (dirMain) => {
  const readDirMain = fs.readdirSync(dirMain);
  console.log(dirMain);
  console.log(readDirMain);
  readDirMain.forEach((dirNext) => {
    console.log(dirNext, fs.lstatSync(dirMain + "/" + dirNext).isDirectory());
    if (fs.lstatSync(dirMain + "/" + dirNext).isDirectory()) {
      readAllFolder(dirMain + "/" + dirNext);
    }
  });
};
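For example, a call like the following (the directory name here is just a placeholder) recursively logs every subfolder under the given root:
readAllFolder("./projects");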
The answers provided are for a single folder. Here is an asynchronous implementation for multiple folders, where all the folders are processed simultaneously but the smaller folders or files get completed first.
Please comment if you have any feedback.
Asynchronously Multiple Folders
const fs = require('fs')
const util = require('util')
const path = require('path')

// Multiple folders list
const in_dir_list = [
  'Folder 1 Large',
  'Folder 2 Small', // small folder and files will complete first
  'Folder 3 Extra Large'
]

// BEST PRACTICES:
// (1) The faster folder-list for loop has to be outside the async capture-callback functions for async to make sense
// (2) Slower read/write or I/O processes are best contained in async capture-callback functions, because these processes are slower than the for loop events, and faster completed items get called back first
for (i = 0; i < in_dir_list.length; i++) {
  var in_dir = in_dir_list[i]
  // a function is created (see below) so each folder is processed asynchronously for the readFile_async that follows
  readdir_async_capture(in_dir, function(files_path) {
    console.log("Processing folders asynchronously ...")
    for (j = 0; j < files_path.length; j++) {
      file_path = files_path[j]
      file = file_path.substr(file_path.lastIndexOf("/") + 1, file_path.length)
      // a function is created (see below) so all files are read simultaneously, but the smallest file will be completed first and get called back first
      readFile_async_capture(file_path, file, function(file_string) {
        try {
          console.log(file_path)
          console.log(file_string)
        } catch (error) {
          console.log(error)
          console.log("System exiting first to catch the error; if not, async will continue...")
          process.exit()
        }
      })
    }
  })
}

// fs.readdir async capture function to deal with the asynchronous code above
function readdir_async_capture(in_dir, callback) {
  fs.readdir(in_dir, function(error, files) {
    if (error) { return console.log(error) }
    files_path = files.map(function(x) { return path.join(in_dir, x) })
    callback(files_path)
  })
}

// fs.readFile async capture function to deal with the asynchronous code above
function readFile_async_capture(file_path, file, callback) {
  fs.readFile(file_path, function(error, data) {
    if (error) { return console.log(error) }
    file_string = data.toString()
    callback(file_string)
  })
}

Why is my else statement run in this piece of node code?

I'm doing the learnyounode exercises. In this function, I list a directory and filter files by extension. As I understand it, if I provide an invalid directory name, it should go into the if clause and exit. But instead, it goes into the if clause and then complains about something in the else clause.
Module:
module.exports = function (path, ext, callb) {
  var fs = require('fs');
  var elems = [];
  var r = new RegExp("\\." + ext);
  fs.readdir(path, function (err, list) {
    console.log(err); // this runs, it's an object like { [Error: ENOENT, scandir '/Users/JohnnyLee/baobabf/
    console.log(list); // this also runs, it's undefined
    if (err) {
      console.log("ERR!!!"); // this ran!
      return callb(err);
    } else {
      console.log("NO ERRR!!!"); // this didn't run :/
      list.forEach(function (i) { // this crashes?
        if (i.match(r)) {
          elems.push(i);
        }
      });
      return callb(err, elems);
    }
    console.log(list);
  });
};
Executable:
var mymod = require('./p06-1');
var filename = process.argv[2]
var extension = process.argv[3]

if (filename && extension) {
  mymod(filename, extension, function (err, list) {
    list.forEach(function (i) {
      console.log(i);
    });
  })
}
The output:
triton:learnnode JohnnyLee$ node p06-2.js doesntexist txt
{ [Error: ENOENT, scandir 'doesntexist'] errno: -2, code: 'ENOENT', path: 'doesntexist' }
undefined
ERR!!!
/Users/JohnnyLee/learnnode/p06-2.js:7
list.forEach(function (i) {
^
TypeError: Cannot read property 'forEach' of undefined
at /Users/JohnnyLee/learnnode/p06-2.js:7:13
at /Users/JohnnyLee/learnnode/p06-1.js:11:20
at FSReqWrap.oncomplete (fs.js:95:15)
triton:learnnode JohnnyLee$
What did I miss :(?
You call it with:
mymod(filename, extension, function (err, list) {
  list.forEach(function (i) {
    console.log(i);
  });
})
It is the list.forEach in there that errors. Look at the filename/line number for the error. This is because you call callb(err) in the module, but you don't check for an error before assuming there's a list to work with.
Please check the error stack properly, it's loud and clear :)
As for your question of what you've missed:
Add an error check to handle the error.
Modify the mymod call as shown below:
if (filename && extension) {
  mymod(filename, extension, function (err, list) {
    if (err) {
      // do something with the error
    } else {
      list.forEach(function (i) {
        console.log(i);
      });
    }
  })
}
