Node.js: Downloading multiple files asynchronously - javascript

In trying to get the hang of node.js's asynchronous coding style, I decided to write a program that reads a text file containing a bunch of URLs and downloads each file. I started out writing a function to download just one file (which works fine), but I'm having trouble extending the logic to download multiple files.
Here's the code:
var http = require("http"),
fs = require("fs"),
input = process.argv[2],
folder = "C:/Users/Wiz/Downloads/",
regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
urls = null,
url = "",
filename = "";
fs.readFile(input, "utf8", function(e, data) {
console.log("Reading file: " + input);
if (e) console.log("Got error:" + e.message);
urls = data.split("\n");
for (var i = urls.length; i--;) {
url = urls[i];
if (!url.match(regex)) continue;
filename = folder + url.substring(url.lastIndexOf('/') + 1);
downloadQueue.addItem(url, filename);
}
});
var downloadQueue = {
    queue: [],
    addItem: function(p_sSrc, p_sDest) {
        this.queue.push({
            src: p_sSrc,
            dest: p_sDest
        });
        if (this.queue.length === 1) {
            this.getNext();
        }
    },
    getNext: function() {
        var l_oItem = this.queue[0];
        http.get(l_oItem.src, function(response) {
            console.log("Downloading: " + l_oItem.dest);
            var file = fs.createWriteStream(l_oItem.dest);
            response.on("end", function() {
                file.end();
                console.log("Download complete.");
                downloadQueue.removeItem();
            }).on("error", function(error) {
                console.log("Error: " + error.message);
                fs.unlink(l_oItem.dest);
            });
            response.pipe(file);
        });
    },
    removeItem: function() {
        this.queue.splice(0, 1);
        if (this.queue.length != 0) {
            this.getNext();
        } else {
            console.log("All items downloaded");
        }
    }
};
How do I structure the code so that the completion of the first download signals the initiation of the next one? Please note that this exercise is just for learning purposes, to understand how asynchronous coding works. In practice, I'm sure there are much better tools out there for downloading multiple files.

Try something simple at first; it looks like you copy-pasted code without quite understanding what it does.
Do a simple loop that gets each URL and prints something.
var http = require('http');
URL = require('url').parse('http://www.timeapi.org/utc/now?format=%25F%20%25T%20-%20%25N')
URL['headers'] = {'User-Agent': 'Hello World'}
// launch 20 queries asynchronously
for (var i = 0; i < 20; i++) {
    (function(i) {
        console.log('Query ' + i + ' started');
        var req = http.request(URL, function(res) {
            console.log('Query ' + i + ' status: ' + res.statusCode + ' - ' + res.statusMessage);
            res.on('data', function(content) {
                console.log('Query ' + i + ' ended - ' + content);
            });
        });
        req.on('error', function(err) {
            console.log('Query ' + i + ' return error: ' + err.message);
        });
        req.end();
    })(i);
}
All the URLs will be fetched asynchronously. You can observe that the responses do not arrive in order, but they are still processed correctly.
The difficulty with async is not doing things in parallel, because you just write the code as a single task and execute it multiple times. It becomes complicated when, for instance, you need to wait for all tasks to finish before continuing. For that, have a look at promises.
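For the queue question itself, here is a rough sketch of chaining the downloads with promises rather than the manual queue (downloadOne is a hypothetical helper; urls, folder, http and fs are the variables and modules from the question):
var http = require("http"),
    fs = require("fs");

// Hypothetical helper: download a single url to dest, resolving once the file is fully written
function downloadOne(url, dest) {
    return new Promise(function (resolve, reject) {
        http.get(url, function (response) {
            var file = fs.createWriteStream(dest);
            response.pipe(file);
            file.on("finish", resolve); // wait for the write stream, not just the response
            file.on("error", reject);
        }).on("error", reject);
    });
}

// Chain them: each download starts only after the previous one has finished
urls.reduce(function (chain, url) {
    var dest = folder + url.substring(url.lastIndexOf("/") + 1);
    return chain.then(function () { return downloadOne(url, dest); });
}, Promise.resolve()).then(function () {
    console.log("All items downloaded");
});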

Here is what I started out with. I figured that since each download was invoked asynchronously, they would all be independent of each other.
var http = require("http"),
fs = require("fs"),
input = process.argv[2],
folder = "C:/Users/Wiz/Downloads/",
regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
urls = null,
url = "",
filename = "";
fs.readFile(input, "utf8",
function(e, data) {
console.log("Reading file: " + input);
if (e) console.log("Got error:" + e.message);
urls = data.split("\n");
for (var i = urls.length; i--;) {
url = urls[i];
if (!url.match(regex)) continue;
filename = folder + url.substring(url.lastIndexOf('/') + 1);
http.get(url, function(response) {
var file = fs.createWriteStream(filename);
response.on("end", function() {
file.end();
});
response.pipe(file);
})
}
});
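One thing worth flagging in this attempt: filename is a single shared variable, so by the time each http.get callback runs it typically holds the value from the last loop iteration, and every response ends up piped to the same file. Capturing the values per iteration (the same IIFE trick used in the answer above) is a minimal sketch of the fix:
for (var i = urls.length; i--;) {
    (function (url) {
        if (!url.match(regex)) return;
        var filename = folder + url.substring(url.lastIndexOf('/') + 1);
        http.get(url, function (response) {
            response.pipe(fs.createWriteStream(filename));
        });
    })(urls[i]);
}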

Related

Javascript for loop returning same value

I have this JS function which I'm using to upload files from an HTML multi-file uploader to Dropbox using the Javascript SDK. It's working well. Now I'm trying to add the names of the files that failed to upload (when the catch block executes) to a string named "failed", but it's adding the name of the first file for all the failed files. What am I doing wrong here?
function uploadFile() {
    var count1 = 0, count2 = 0, loop = 0, failed = '';
    const UPLOAD_FILE_SIZE_LIMIT = 150 * 1024 * 1024;
    var ACCESS_TOKEN = 'SomeAccessToken';
    var dbx = new Dropbox.Dropbox({
        accessToken: ACCESS_TOKEN
    });
    var fileInput = document.getElementById('file-upload');
    var formp = document.getElementById('formp');
    for (var i = 0; i < fileInput.files.length; i++) {
        formp.innerHTML = '<p> Uploading ' + fileInput.files.length + ' files </p>';
        var file = fileInput.files[i];
        var filename = fileInput.files[i].name;
        if (file.size < UPLOAD_FILE_SIZE_LIMIT) { // File is smaller than 150 MB - use filesUpload API
            dbx.filesUpload({path: '/Test/' + file.name, contents: file})
                .then(function(response) {
                    var results = document.getElementById('results');
                    var br = document.createElement("br");
                    results.appendChild(document.createTextNode(file.name + 'File uploaded!'));
                    results.appendChild(br);
                    count1 = count1 + 1;
                    if (count1 + count2 == fileInput.files.length) {
                        formp.innerHTML = '<p> Uploaded ' + count1 + ' files. Failed ' + count2 + ' files</p>';
                    }
                    console.log(response);
                })
                .catch(function(error) {
                    count2 = count2 + 1;
                    console.error(error);
                    failed += file.name;
                    if (count1 + count2 == fileInput.files.length) {
                        formp.innerHTML = '<p> Uploaded ' + count1 + ' files. Failed ' + count2 + ' files</p>';
                    }
                });
        }
    }
}
As far as I have seen, I didn't find anything wrong in this code. For testing purposes I commented out a few lines of your code and tested it. The file names are properly concatenated to the "failed" string.
<form>
    <input type="file" id="formp" multiple="true" accept="*/*" />
</form>
<pre></pre>
$('#formp').change(function (event) {
    var count1 = 0, count2 = 0, loop = 0, failed = '';
    const UPLOAD_FILE_SIZE_LIMIT = 150 * 1024 * 1024;
    var ACCESS_TOKEN = 'SomeAccessToken';
    /* var dbx = new Dropbox.Dropbox({
        accessToken: ACCESS_TOKEN
    }); */
    var fileInput = document.getElementById('formp');
    var formp = document.getElementById('formp');
    for (var i = 0; i < fileInput.files.length; i++) {
        formp.innerHTML = '<p> Uploading ' + fileInput.files.length + ' files </p>';
        var file = fileInput.files[i];
        var filename = fileInput.files[i].name;
        failed += file.name;
        console.log('Failed check', failed);
        if (file.size < UPLOAD_FILE_SIZE_LIMIT) { // File is smaller than 150 Mb - use filesUpload API
            // dbx.filesUpload({path: '/Test/' + file.name, contents: file})
            //     .then(function(response) {
            //         var results = document.getElementById('results');
            //         var br = document.createElement("br");
            //         results.appendChild(document.createTextNode(file.name + 'File uploaded!'));
            //         results.appendChild(br);
            //         count1 = count1 + 1;
            //         if (count1 + count2 == fileInput.files.length) {
            //             formp.innerHTML = '<p> Uploaded ' + count1 + ' files. Failed ' + count2 + ' files</p>';
            //         }
            //         console.log(response);
            //     })
            //     .catch(function(error) {
            //         count2 = count2 + 1;
            //         console.error(error);
            //         failed += file.name;
            //         if (count1 + count2 == fileInput.files.length) {
            //             formp.innerHTML = '<p> Uploaded ' + count1 + ' files. Failed ' + count2 + ' files</p>';
            //         }
            //     });
        }
    }
});
Try the above code in jsFiddle.
There might be a chance that the first file name is being cached in the file variable, so the same name gets reused on every iteration of the loop. Emptying the "file" variable at the end of each loop iteration might fix the issue.
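A related thing to check (a sketch of an alternative, not the suggestion above): because file is declared with var, every .then/.catch callback closes over the same variable, which holds the last file by the time the upload settles. Declaring the per-iteration values with let/const inside the loop gives each callback its own copy:
for (let i = 0; i < fileInput.files.length; i++) {
    const file = fileInput.files[i]; // block-scoped: each callback sees its own file
    if (file.size < UPLOAD_FILE_SIZE_LIMIT) {
        dbx.filesUpload({ path: '/Test/' + file.name, contents: file })
            .then(function (response) {
                console.log(file.name + ' uploaded');
            })
            .catch(function (error) {
                failed += file.name; // now records the file that actually failed
                console.error(error);
            });
    }
}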

ServiceNow - Change a file name in Midserver

I am currently trying to rename a file on the Mid Server after a scheduled export.
The idea is that the file is generated with a name in the format "file_name_datetime", but the customer needs "datetime_file_name" for the file to be read correctly by another system.
My main idea was to rename the file after the export to the correct format, but if there is a way of setting the file name to the required one at export time, I could do that as well.
I would love to hear from you guys, as I have no idea how I can do this.
Thanks in advance.
If anyone is interested in the answer, see below:
Script include:
initialize: function() {
    this.filePath = gs.getProperty('directory_path');
    this.midServer = gs.getProperty('midserver');
    this.authMidServerBase64 = gs.getProperty('authmidserver');
},

nameChange: function(exportSetName) {
    var exportGr = new GlideRecord("sys_export_set_run");
    exportGr.addEncodedQuery("set.nameSTARTSWITH" + exportSetName);
    exportGr.orderByDesc("completed");
    exportGr.query();
    if (exportGr.next()) {
        var attachSysID = exportGr.ecc_agent_attachment.sys_id;
    }
    var attachGr = new GlideRecord("sys_attachment");
    attachGr.addEncodedQuery("table_sys_idSTARTSWITH" + attachSysID);
    attachGr.query();
    if (attachGr.next()) {
        var attachName = attachGr.file_name;
        var attachDate = attachName.match((/\d+/));
        var newName = attachDate + '_' + exportSetName + '.csv';
    }
    var jspr = new JavascriptProbe(this.midServer);
    jspr.setName('FileNameChange'); // This can be any name
    jspr.setJavascript('var ddr = new MidServer_script_include(); res = ddr.execute();');
    jspr.addParameter("verbose", "true");
    jspr.addParameter("skip_sensor", "true"); // prevent Discovery sensors running for the ECC input
    jspr.addParameter("filename", this.filePath + "\\" + attachName);
    jspr.addParameter("filePath", this.filePath);
    jspr.addParameter("newName", this.filePath + "\\" + newName);
    jspr.addParameter("operation", "rename");
    return jspr.create();
},
Mid Server Script include:
initialize: function() {
    /**
     *** Set up the Packages references
     **/
    this.File = Packages.java.io.File;
    this.FileOutputStream = Packages.java.io.FileOutputStream;
    this.FileInputStream = Packages.java.io.FileInputStream;
    this.Path = Packages.java.nio.file.Path;
    this.Paths = Packages.java.nio.file.Paths;
    this.Files = Packages.java.nio.file.Files;
    this.StandardCopyOption = Packages.java.nio.file.StandardCopyOption;
    /**
     *** Set up the parameters
     **/
    this.verbose = probe.getParameter("verbose");
    this.filePath = probe.getParameter("filePath");
    this.filename = probe.getParameter("filename");
    this.operation = probe.getParameter("operation");
    this.newName = probe.getParameter("newName");
    result = "initialize complete";
},

execute: function() {
    if (this.operation == 'rename') {
        this.fileRename(this.filename, this.newName);
    }
    return result;
},

fileRename: function(fileName, newName) {
    result += "\r\n Renaming file.";
    this._debug(result);
    try {
        var res = this._moveFile(fileName, newName);
    } catch (e) {
        result += "\r\n Error renaming the file: " + e;
        this._debug(result);
    }
},

_moveFile: function(initialPath, targetPath) {
    try {
        this._debug("Initiating file move function");
        var inPath = this.Paths.get(initialPath);
        var tgPath = this.Paths.get(targetPath);
        var res = this.Files.move(inPath, tgPath, this.StandardCopyOption.REPLACE_EXISTING);
        result += "File successfully moved from: " + initialPath + " to: " + targetPath + " \r\n Result: " + res;
        this._debug(result);
    } catch (e) {
        this._debug('Error:' + e);
    }
},

_debug: function(m) {
    if (this.verbose == "true") {
        ms.log("::: Mid Server script include logger ::: " + m);
    }
},
https://community.servicenow.com/community?id=community_question&sys_id=a56b38a6db326490fa192183ca961987
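For completeness, a hypothetical invocation from the instance side (the class name ExportFileRenamer and the export set name are placeholders, not from the original post) might look something like:
// Hypothetical: invoke the script include after the scheduled export completes
var renamer = new ExportFileRenamer(); // the script include holding initialize()/nameChange() above
renamer.nameChange('My Export Set');   // export set name used in the sys_export_set_run query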

Wait for fetch response to continue in for loop. Javascript Nodejs

I have a function that connects to a SOAP web service. Unfortunately the web service only supports a very limited number of connections. I have an array of items to search for in the web service; if I do a for or forEach loop, about 70% of the cases complete with no error, but in the other 30% the web service responds with an error. This occurs when the maximum number of connections is exceeded, because the loop does not wait for the web service's response and keeps creating more and more connections.
Here's my code:
var promiseArray = [];
for (var i = 0; i < result.length; i++) {
    let m = result[i].id
    let xml = '<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/">' +
        '<soapenv:Header/>' +
        '<soapenv:Body>' +
        '<tem:EjecutarConsultaXML>' +
        '<!--Optional:-->' +
        '<tem:pvstrxmlParametros>' +
        '<![CDATA[' +
        '<Consulta><NombreConexion>USERNAME</NombreConexion>' +
        '<IdConsulta>QUERY</IdConsulta>' +
        '<Parametros>' +
        '<doc>' + m + '</doc>' +
        '</Parametros>' +
        '</Consulta>' +
        ']]>' +
        '</tem:pvstrxmlParametros>' +
        '</tem:EjecutarConsultaXML>' +
        '</soapenv:Body>' +
        '</soapenv:Envelope>';
    const options = {
        explicitArray: true
    };
    promiseArray.push(new Promise(async (resolve, reject) => {
        await axios.post(url, xml, {
            headers: {
                'Content-Type': 'text/xml;charset=UTF-8'
            }
        })
        .then((data) => {
            xml2js.parseString(data.data, options, (err, result) => {
                var temp = (result['soap:Envelope']['soap:Body'][0]['EjecutarConsultaXMLResponse'][0]['EjecutarConsultaXMLResult'][0]['diffgr:diffgram'][0]['NewDataSet'][0]['Resultado'])
                resolve({
                    doc: m,
                    state: temp[0].f430_ind_estado[0]
                })
            });
        })
        .catch((err) => {
            console.log(err)
        });
    }))
}
res.send(await Promise.all(promiseArray))
There are several issues with your code within the call to promiseArray.push():
There is no need to create a new Promise(), since axios already provides one
This is actually an antipattern
There is no need for async/await in that call, for the same reason
Mixing Promises and functions that use callbacks usually doesn't turn out too well
You have no error checking in your code if the XML parser fails
The options object is not required, as explicitArray: true is the default
Changes:
Removed all the extra/unneeded Promise code
Replaced xml2js.parseString with xml2js.parseStringPromise
Changed resolve to return
Since you're simply console.log()-ing the error, removed the unnecessary boilerplate
Everything else is OK as written. Please let me know if I've missed something.
promiseArray.push(
    axios.post(url, xml, {
        headers: {
            'Content-Type': 'text/xml;charset=UTF-8'
        }
    })
    .then(data => data.data)
    .then(xml2js.parseStringPromise)
    .then(result => {
        var temp = result['soap:Envelope']['soap:Body'][0]['EjecutarConsultaXMLResponse'][0]['EjecutarConsultaXMLResult'][0]['diffgr:diffgram'][0]['NewDataSet'][0]['Resultado'];
        return {
            doc: m,
            state: temp[0].f430_ind_estado[0]
        };
    })
    .catch(console.log)
);
Just do it one by one, using async/await; this means you have to use parseStringPromise instead.
var response = [];
for (var i = 0; i < result.length; i++) {
    let m = result[i].id
    let xml = '<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:tem="http://tempuri.org/">' +
        '<soapenv:Header/>' +
        '<soapenv:Body>' +
        '<tem:EjecutarConsultaXML>' +
        '<!--Optional:-->' +
        '<tem:pvstrxmlParametros>' +
        '<![CDATA[' +
        '<Consulta><NombreConexion>USERNAME</NombreConexion>' +
        '<IdConsulta>QUERY</IdConsulta>' +
        '<Parametros>' +
        '<doc>' + m + '</doc>' +
        '</Parametros>' +
        '</Consulta>' +
        ']]>' +
        '</tem:pvstrxmlParametros>' +
        '</tem:EjecutarConsultaXML>' +
        '</soapenv:Body>' +
        '</soapenv:Envelope>';
    const options = {
        explicitArray: true
    };
    try {
        var { data } = await axios.post(url, xml, { // extract data from data.data
            headers: {
                'Content-Type': 'text/xml;charset=UTF-8'
            }
        })
        var xmlObject = await xml2js.parseStringPromise(data)
        var temp = (xmlObject['soap:Envelope']['soap:Body'][0]['EjecutarConsultaXMLResponse'][0]['EjecutarConsultaXMLResult'][0]['diffgr:diffgram'][0]['NewDataSet'][0]['Resultado'])
        response.push({
            doc: m,
            state: temp[0].f430_ind_estado[0]
        }) // push item to the response array
    } catch (error) {
        console.log(error);
    }
}
res.send(response) // send the accumulated responses to the client
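If strictly one-at-a-time turns out to be too slow, a middle ground is to process the ids in small batches. This is only a sketch, assuming the same async route handler as above; fetchOne and buildEnvelope are placeholders for the envelope-building and parsing code already shown, and the batch size of 5 is arbitrary:
// Hypothetical helper wrapping one SOAP call: build the envelope, post it, parse the reply
async function fetchOne(m) {
    const xml = buildEnvelope(m); // same string concatenation as shown above
    const { data } = await axios.post(url, xml, {
        headers: { 'Content-Type': 'text/xml;charset=UTF-8' }
    });
    const parsed = await xml2js.parseStringPromise(data);
    const temp = parsed['soap:Envelope']['soap:Body'][0]['EjecutarConsultaXMLResponse'][0]['EjecutarConsultaXMLResult'][0]['diffgr:diffgram'][0]['NewDataSet'][0]['Resultado'];
    return { doc: m, state: temp[0].f430_ind_estado[0] };
}

const BATCH_SIZE = 5; // tune to whatever the web service tolerates
const batched = [];
for (let i = 0; i < result.length; i += BATCH_SIZE) {
    const batch = result.slice(i, i + BATCH_SIZE).map(r => fetchOne(r.id));
    batched.push(...await Promise.all(batch)); // at most BATCH_SIZE requests in flight at once
}
res.send(batched);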

Getting values from http request in nodejs

var latLon = "40.8,-77.8"; //Lat/lon
var cityCode = ""; //City code
var cityName = "";
var latLongCityCodeURL = ("http://dataservice.accuweather.com/locations/v1/cities/geoposition/search?apikey=" + weatherKey + "&q=" + latLon);
//Current Conditions Vars
var ccWeatherText = ""; //Text for weather at location
var ccTemp = 0; //Degrees Farenheit
var ccIcon = 0; //weather icon number https://developer.accuweather.com/weather-icons
var ccURL = "test"; //URL for get
//12 hour forecast Conditions Vars
//5 day forecast conditions Vars
//Get city code
http.get(latLongCityCodeURL, (resp) => {
var that = this;
resp.on("data", (chunk) => {
var result = JSON.parse(chunk);
var cityCode = result.Key;
var cityName = result.EnglishName;
console.log(cityCode + " " + cityName);
that.cityName = cityName;
that.cityCode = cityCode;
});
}).on("error", (err) => {
console.log("Error: " + err.message);
});
console.log(cityCode + " " + cityName);
So my issue is: I am making an HTTP request using require('http'), and what I want to do is parse the data and store it in my global variables so that I can use them for other requests. I have tried using var that = this, and I have tried just assigning my global variables to the data. I am not sure how to do it; I just keep getting undefined. I know it has something to do with async behavior and also something to do with scope. Please help.
You can save your result to a variable at various levels of scope; just remember that most I/O calls in Node.js are asynchronous.
Here's an example:
var latLon = "40.8,-77.8"; //Lat/lon
var cityCode = ""; //City code
var cityName = "";
var latLongCityCodeURL = ("http://dataservice.accuweather.com/locations/v1/cities/geoposition/search?apikey=" + weatherKey + "&q=" + latLon);
//Current Conditions Vars
var ccWeatherText = ""; //Text for weather at location
var ccTemp = 0; //Degrees Farenheit
var ccIcon = 0; //weather icon number https://developer.accuweather.com/weather-icons
var ccURL = "test"; //URL for get
var savedResult = null;
//Get city code
http.get(latLongCityCodeURL, (resp) => {
var jsonData = '';
resp.on("data", (chunk) => {
jsonData += chunk;
});
resp.on("end", () => {
savedResult = JSON.parse(jsonData);
});
}).on("error", (err) => {
console.log("Error: " + err.message);
});
// Display saved result once available.
setTimeout(displaySavedResult, 2000);
function displaySavedResult() {
if (!savedResult) {
console.log('Last result is null!');
} else {
console.log('Last result: City Code: ' + savedResult.Key + " Name" + savedResult.EnglishName);
console.log('Last result (all properties): ', JSON.stringify(savedResult, null, 2));
}
}
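For what it's worth, the setTimeout(displaySavedResult, 2000) above is just a fixed guess at when the response will have arrived; a small variation (a sketch, not part of the original answer) is to call displaySavedResult from the "end" handler itself:
resp.on("end", () => {
    savedResult = JSON.parse(jsonData);
    displaySavedResult(); // runs only once the full response has been received and parsed
});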
You can use a Promise to make the HTTP request; here is code that may help you:
const httpGet = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => body += chunk);
            res.on('end', () => {
                try {
                    body = JSON.parse(body);
                } catch (err) {
                    reject(new Error(err));
                }
                resolve({
                    cityCode: body.Key,
                    cityName: body.EnglishName
                });
            });
        }).on('error', reject);
    });
};
httpGet(latLongCityCodeURL).then(data => {
    console.log(data.cityCode + " " + data.cityName);
}).catch(err => console.log('Got error ', err));
var latLon = "40.8,-77.8";
var latLongCityCodeURL = ("http://dataservice.accuweather.com/locations/v1/cities/geoposition/search?apikey=" + weatherKey + "&q=" + latLon);
//Current Conditions Vars
var ccWeatherText = ""; //Text for weather at location
var ccTemp = 0; //Degrees Farenheit
var ccIcon = 0; //weather icon number https://developer.accuweather.com/weather-icons
var ccURL = "test"; //URL for get
//12 hour forecast Conditions Vars
//5 day forecast conditions Vars
//Get city code
function getCityCode(latLongCityCodeURL){
return new Promise((resolve, reject) => {
http.get(latLongCityCodeURL, (resp) => {
resp.on("data", (chunk) => {
var result = JSON.parse(chunk);
var cityCode = result.Key;
var cityName = result.EnglishName;
resolve({cityCode, cityName});
});
}).on("error", (err) => {
reject(err);
console.log("Error: " + err.message);
});
})
}
getCityCode(latLongCityCodeURL)
.then((result) => {
console.log(result.cityCode, result.cityName)
}).catch((err) => console.log(err))
Another way is to use the async/await interface, which is supported since Node 8.
async function getCityCode(latLongCityCodeURL) {
    // http.get does not return a promise, so wrap it before awaiting it
    const result = await new Promise((resolve, reject) => {
        http.get(latLongCityCodeURL, (resp) => {
            let body = "";
            resp.on("data", (chunk) => body += chunk);
            resp.on("end", () => {
                const parsed = JSON.parse(body);
                resolve({cityCode: parsed.Key, cityName: parsed.EnglishName});
            });
        }).on("error", (err) => {
            console.log("Error: " + err.message);
            reject(err);
        });
    });
    return result;
}
getCityCode(latLongCityCodeURL)
    .then((res) => {
        console.log(res.cityCode, res.cityName)
    })
Your code runs in a synchronous way, so the console.log at the bottom executes right after the http.get call. The thing is, http.get is an async function and its callback will be invoked on a future tick of Node.js, once your response has arrived.
var http = require('http');
var latLon = "40.8,-77.8"; //Lat/lon
var cityCode = ""; //City code
var cityName = "";
var latLongCityCodeURL = ("http://dataservice.accuweather.com/locations/v1/cities/geoposition/search?apikey=" + "fmKWSgaG5EAA0diCP2lSREEOYG6PC5q9" + "&q=" + latLon);
var that;
//Current Conditions Vars
var ccWeatherText = ""; //Text for weather at location
var ccTemp = 0; //Degrees Fahrenheit
var ccIcon = 0; //weather icon number https://developer.accuweather.com/weather-icons
var ccURL = "test"; //URL for get
//12 hour forecast Conditions Vars
//5 day forecast conditions Vars
//Get city code
getWeather = url => {
    return new Promise((resolve, reject) => {
        http.get(url, res => {
            let body = '';
            res.on('data', chunk => resolve(JSON.parse(chunk)));
        }).on('error', reject);
    });
};
getWeather(latLongCityCodeURL).then(weather => {
    console.log(weather.Key + " " + weather.EnglishName);
})
You must wait for the response, and only then send the other requests or log it. The scope isn't really the issue here; it's the order of operations that you're missing. You define a variable, then wait for an HTTP request; meanwhile you immediately log a value; then the request finishes (or times out or errors), and you can log the value again - at which point you see data.
In other words, to fix the issue, the other requests must be made from within a function registered with resp.on("end", ...), which fires when you've finished receiving all the data chunks.
And by "within", the code can still be in a separate method, outside those brackets, but you must call that particular function from within that response handler.
In most cases you should pass such asynchronously returned data through parameter variables, not update some external, global state.
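A minimal sketch of that callback style, passing the parsed values out through parameters instead of writing to globals (the function name getCityCode is reused from the earlier answer; the error-first callback shape is an assumption, just the common Node convention):
function getCityCode(url, callback) {
    http.get(url, (resp) => {
        let body = "";
        resp.on("data", (chunk) => body += chunk);
        resp.on("end", () => {
            const result = JSON.parse(body);
            callback(null, { cityCode: result.Key, cityName: result.EnglishName });
        });
    }).on("error", (err) => callback(err));
}

getCityCode(latLongCityCodeURL, (err, data) => {
    if (err) return console.log("Error: " + err.message);
    console.log(data.cityCode + " " + data.cityName);
    // make the follow-up requests from here, where the data is guaranteed to exist
});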

Streaming to and from external programs expecting files with javascript nodejs

Problem:
I need to upload hundreds of PDF documents, convert them to HTML and then store the HTML in MongoDB. I am currently saving both the incoming PDF documents and converted HTML in the file system. Is there a way to use streams to avoid all the file I/O?
Current approach (which works but is slow):
I am using:
Busboy to read the uploaded PDF documents which I save to the file system.
I create an "exec" child process in node.js which invokes "'pdftohtml -c -s -noframes -nodrm ' + inputFileNamePDF + ' ' + outputFileNameHTML,". The HTML output files get saved to the file system.
I then iterate through all the HTML files to create a Bulk upsert to MongoDB.
Ideally I'd like to stream the uploaded PDF file directly to "inputFileNamePDF". Then stream the converted "outputFileNameHTML" to the bulk upsert.
Here's the Code:
var path = require("path"),
Busboy = require('busboy')
http = require('http'),
util = require('util'),
fs = require('fs-extra'),
pdftohtml = require('pdftohtmljs'),
exec =require('child_process').exec,
pdf_extract = require('pdf-extract'),
exports.postUpload = function (req, res) {
// parse a file upload
var fileName = "";
var uploadDir = '/tmp/' + res.locals.user._doc.email.replace(/[#\.]/g,"_");
var infiles = 0, outfiles = 0, done = false,
busboy = new Busboy({ headers: req.headers });
console.log('Start parsing form ...');
busboy.on('file', function (fieldname, file, filename) {
++infiles;
console.log("file event #" + infiles);
onFile(fieldname, file, filename, function () {
++outfiles;
console.log("file #" + infiles + " written.");
if (done) console.log(outfiles + '/' + infiles + ' parts written to disk');
if (done && infiles === outfiles) {
// ACTUAL EXIT CONDITION
console.log('All parts written to disk');
res.writeHead(200, { 'Connection': 'close' });
res.end("That's all folks!");
convertToHTMLTxt();
}
});
});
busboy.on('finish', function () {
console.log('Done parsing form!');
done = true;
});
req.pipe(busboy);
function onFile(fieldname, file, filename, next) {
// or save at some other location
var fileName = "";
fileName = filename.replace( /[^a-z0-9_\-]/gi,"_");
fileName = fileName.replace(/_(pdf|docx|doc)$/i,".$1");
var fstream = fs.createWriteStream(path.join(uploadDir, fileName));
file.on('end', function () {
console.log(fieldname + '(' + fileName + ') EOF');
});
fstream.on('close', function () {
console.log(fieldname + '(' + fileName + ') written to disk');
next();
});
console.log(fieldname + '(' + fileName + ') start saving');
file.pipe(fstream);
}
function convertToHTMLTxt () {
var execTxt, execHTML, execPDF;
var textDir = 'text';
var htmlDir = 'html';
console.log('Directory: ', uploadDir);
fs.readdir(uploadDir, function(err, files) {
if (err) {
console.log('error reading directory: ', uploadDir);
return;
}
files.forEach(function(fileName) {
var fileNameHTML = path.join(uploadDir, htmlDir,
fileName.replace(/(pdf|docx|doc)$/i,"html"));
var fileNamePDF = path.join(uploadDir, fileName);
if (fileName.match(/pdf$/i)) {
execPDF = exec('pdftohtml -c -s -noframes -nodrm '
+ fileNamePDF + ' ' + fileNameHTML,
function(error, stdout, stderr) {
console.log('stdout: ', stdout);
console.log('stderr: ', stderr);
if (error !== null) {
console.log('exec error: ', error);
}
});
execPDF.on('close', function (code) {
console.log('******** PDF to HTML Conversion complete - exit code '
+ code);
});
}
})
});
Once the conversion is done I iterate through all the HTML files and do a MongoDB bulk upsert:
fs.readFile(fileNameHTML, 'utf8', function (err, HTMLData) {
    if (err) {
        console.log('error reading file: ', fileNameHTML + '\nerror: ' + err);
        callback(err);
        return;
    }
    bulk.find({ userName: userName,
                docName: fileName })
        .upsert()
        .updateOne({ userName: userName,
                     docName: fileName,
                     HTMLData: HTMLData });
});
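For the HTML half of the question, here is a sketch of skipping the intermediate HTML files altogether, assuming the installed pdftohtml supports the -stdout flag; the uploaded PDF is still saved to a temp file first, since pdftohtml takes a file path as input, and convertAndUpsert is a hypothetical helper, not code from the question:
var spawn = require('child_process').spawn;

// Hypothetical: convert one saved PDF and hand the HTML straight to the bulk upsert
function convertAndUpsert(fileNamePDF, fileName, userName, bulk, callback) {
    // -stdout writes the HTML to standard output; whether it combines with -c may depend on your poppler build
    var child = spawn('pdftohtml', ['-s', '-noframes', '-nodrm', '-stdout', fileNamePDF]);
    var html = '';
    child.stdout.on('data', function (chunk) { html += chunk; });
    child.stderr.on('data', function (chunk) { console.log('stderr: ' + chunk); });
    child.on('close', function (code) {
        if (code !== 0) return callback(new Error('pdftohtml exited with code ' + code));
        bulk.find({ userName: userName, docName: fileName })
            .upsert()
            .updateOne({ userName: userName, docName: fileName, HTMLData: html });
        callback(null);
    });
}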
