I'm stuck on using text that is created inside a constructor outside of that constructor.
I'm trying to read a PDF file using the pdfreader Node module. I'm able to print the PDF data word by word as text in the constructor below.
fs.readFile(pdfFilePath, (err, pdfBuffer) => {
  // pdfBuffer contains the file content
  new pdfreader.PdfReader().parseBuffer(pdfBuffer, function(err, item) {
    if (err)
      callback(err);
    else if (!item)
      callback();
    else if (item.text !== undefined)
      console.log(item.text);
  });
});
But I want to get that text into some string or array, and I need to check whether a piece of text is present in the whole string or array.
I'm not able to do this.
var fs = require('fs');
var pdfreader = require('pdfreader');
var a = '';
let pdfFilePath = 'C:/Users/Downloads/'; // this should point to the PDF file itself, not just the folder

fs.readFile(pdfFilePath, (err, pdfBuffer) => {
  new pdfreader.PdfReader().parseBuffer(pdfBuffer, function(err, item) {
    if (err) callback(err); // callback as in the original snippet
    else if (!item) {
      // no more items: parsing is done, so `a` now holds the full text
      console.log('after parsing: ' + a);
      callback();
    } else if (item.text !== undefined) {
      a = a.concat(item.text);
    }
  });
});

// note: this line runs before the file has been read and parsed, so `a` is still empty here
console.log('after block ' + a);
Try this. I'm not sure whether item.text includes spaces between words or not.
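If spacing between words turns out to be an issue, here is a rough sketch of a variation on the same idea (reusing pdfFilePath from above; 'some text' is just a placeholder for whatever you are searching for): collect the items into an array, join them with spaces, and then check for the piece of text once parsing has finished.

var words = [];
fs.readFile(pdfFilePath, (err, pdfBuffer) => {
  new pdfreader.PdfReader().parseBuffer(pdfBuffer, function(err, item) {
    if (err) return console.error(err);
    if (!item) {
      // a falsy item means parsing is finished: build one space-separated string and search it
      var fullText = words.join(' ');
      console.log(fullText.includes('some text')); // true if the piece of text is present
    } else if (item.text !== undefined) {
      words.push(item.text);
    }
  });
});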
I'm trying to write a script that, when a new URL is found, turns the URL into a hash. It should check whether the file has already been written; if it has, it just ignores it, and if it isn't known from earlier, it should be added.
needle.get(mainUrl, function(err, res) {
  if (err) throw err;
  if (res.statusCode == 200 && !err) {
    var $ = cheerio.load(res.body)
    var href = $('div div a').each(function(index, element) {
      urlList.push($(element).attr("href"))
      var url = ($(element).attr("href"))
      var hash = crypto.createHash('md5').update(url).digest('hex');
      fs.writeFile('./directory/otherdirectory' + `${hash}`, url, (err) => {
        if (err) throw err;
        console.log('Hash created: ' + url + ' saved as ' + hash);
      });
    });
  }
})
This is what I've done so far, but it only writes new files; it doesn't check whether a file has already been added, and it doesn't remove files that aren't found anymore.
So what I'm trying to do:
I've written a script that fetches a website for URLs.
Hash all the URLs.
Make fs check if a file has already been written for a URL; if it has, just ignore it.
If it is not known from earlier, add it as a new file.
If a URL isn't found when fetching anymore, delete it from the list.
I think this might be an X/Y problem, and for that reason I'm still awaiting the answer to my comment.
With that said, you can simply skip existing files using fs.existsSync: if it returns true, just skip saving the current file, otherwise save it. And to remove files that are no longer available, get all the files in the directory using fs.readdir and remove the files whose URLs are not in the response using fs.unlink:
needle.get(mainUrl, (err, res) => {
  if (err) throw err;
  if (res.statusCode == 200) {
    let $ = cheerio.load(res.body);
    let hashes = []; // list of hashes for this website (used later to keep only the items that are still available)
    $('div div a').each((index, element) => {
      let url = $(element).attr("href");
      let hash = crypto.createHash('md5').update(url).digest('hex');
      hashes.push(hash); // store the hash of the current url
      if (!fs.existsSync('./directory/otherdirectory/' + hash)) { // if this file doesn't exist (notice the "not" operator ! before fs.existsSync)
        fs.writeFile('./directory/otherdirectory/' + hash, url, err => { // save it
          if (err) throw err;
          console.log('Hash created: ' + url + ' saved as ' + hash);
        });
      }
    });
    fs.readdir('./directory/otherdirectory', (err, files) => { // get a list of all the files in the directory
      if (err) throw err;
      files.forEach(file => { // and for each file
        if (!hashes.includes(file)) { // if it was not encountered above (meaning it doesn't exist in the hashes array)
          fs.unlink('./directory/otherdirectory/' + file, err => { // remove it
            if (err) throw err;
          });
        }
      });
    });
  }
});
Another approach:
Since you only seem to want to store the URLs, the best way to do so would be to use one single file to store them all, instead of storing each URL in its own file. Something like this is more efficient:
needle.get(mainUrl, (err, res) => {
  if (err) throw err;
  if (res.statusCode == 200) {
    let $ = cheerio.load(res.body);
    let urls = $('div div a') // get the 'a' elements
      .map((index, element) => $(element).attr("href")) // map each one into its href attribute
      .get(); // and get them as an array
    fs.writeFile('./directory/list-of-urls', urls.join('\n'), err => { // then save all the urls encountered in the file 'list-of-urls' (each on its own line, hence the join('\n'))
      if (err) throw err;
      console.log('saved all the urls to the file "list-of-urls"');
    });
  }
});
That way, old URLs are removed automatically because the file gets overwritten each time, and new URLs are added automatically. There is no need to check whether a URL has already been encountered, because it will get re-saved anyway.
And if you want to get the list of urls somewhere else, just read the file and split it by '\n' like so:
fs.readFile('./directory/list-of-urls', 'utf8', (err, data) => {
  if (err) throw err;
  let urls = data.split('\n');
  // use urls here
});
I have my text data like this, and I mark lines with a * character to keep track of unfinished jobs.
And here's the relevant part of my code.
I want to make it so that when a line of the txt file starts with the * character, it gets processed, and after processing, the * character is removed from that line.
fs.readFile('data.txt', async function (err, data) {
  if (err) throw err;
  let array = data.toString().split("\n");
  for (i in array) {
    if (array[i].charAt(0) === '*') {
      console.log(`Now Processing : ${array[i]} | ${array.length - i - 1} items left`);
      //
      // SOME JOBS
      //
      let newValue = array[i].replace('*', '');
      fs.writeFile('data.txt', newValue, 'utf-8', function (err, data) {
        if (err) throw err;
        console.log('Done!');
      })
    } else {
      console.log(`${array[i]} Already Captured`)
    }
  }
});
From what I understand, you are trying to read through a file, find lines containing the "*" character, do some work, and then remove the * from the affected lines.
Firstly, the call to fs.writeFile is happening inside a loop, so every iteration of that loop calls the writeFile function. From the Node.js docs, this method will "... asynchronously write data to the file, replacing the file if it already exists." You are replacing the file with every iteration. What you want to do is use fs.appendFile, or better yet, pass the 'append' system flag to writeFile. The system flags can be seen here; take a look at the 'a' flag and pass it in the options object of writeFile.
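For example, a minimal sketch of passing that flag (reusing data.txt and newValue from your snippet):

// the 'a' flag opens the file for appending instead of replacing its contents
fs.writeFile('data.txt', newValue + '\n', { flag: 'a', encoding: 'utf-8' }, function (err) {
  if (err) throw err;
  console.log('Appended!');
});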
Your usage of async in the readFile callback is also incomplete, since you never call await inside that callback.
const fs = require('fs');

fs.readFile('data.txt', async function (err, data) {
  if (err) throw err;
  let array = data.toString().split("\n");
  for (i in array) {
    if (array[i].charAt(0) === '*') {
      console.log(`Now Processing : ${array[i]} | ${array.length - i - 1} items left`);
      //
      // SOME JOBS
      //
      let newValue = array[i].replace('*', '');
      newValue = newValue + '\n';
      // use the promise-based API so that await actually waits for each append to finish
      await fs.promises.appendFile('data.txt', newValue, 'utf-8');
      console.log('Done!');
    } else {
      console.log(`${array[i]} Already Captured`)
    }
  }
});
I'm trying to extract text from a PDF file and then put it into a string.
I found "pdfreader" and tried to implement it, but I always get an error. At first it begins reading the text normally, then when the PDF file ends it stops and blocks the app.
Code:
var PdfReader = require("pdfreader").PdfReader;

router.get('/adminse', function(req, res, next) {
  aux = '';
  new PdfReader().parseFileItems("D:/bureau/VoguelConsulting/Backend/uploads/cv_url_CV_anglais_20191337991.pdf", function(err, item) {
    if (err)
      callback(err);
    else if (item === 'undefined') {
      console.log('erreur');
    }
    else if (item.text) {
      aux = item.text;
      console.log(' aux = ' + aux);
    }
    else {
      console.log('working');
    }
  });
});
Error:
Use a falsy check with the not operator (!); this will work with undefined as well as null:
else if (!item) {
  console.log('erreur');
} else if (item.text) {
  ...
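For context, here is a rough sketch of how that check could look inside the route from the question, also accumulating the text into aux so the whole string is available once parsing finishes (the file path is taken from the question, and res.send is just one possible way to return the result):

router.get('/adminse', function(req, res, next) {
  var aux = '';
  new PdfReader().parseFileItems("D:/bureau/VoguelConsulting/Backend/uploads/cv_url_CV_anglais_20191337991.pdf", function(err, item) {
    if (err) {
      console.error(err);
    } else if (!item) {
      // a falsy item means parsing is finished, so the full text is available here
      console.log('aux = ' + aux);
      res.send(aux);
    } else if (item.text) {
      aux += item.text; // keep appending instead of overwriting
    }
  });
});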
The code I have so far allows me to query the database by finding a specific key:value in the documents, but I want to be able to pass an argument into the function so that the user's search becomes the 'value' in the key:value query. Currently, I have "Cosmic Black" in place of where I want the user's argument to be.
var findMaterials = function(db, callback) {
  var cursor = db.collection('materials').find( {"material_name": "Cosmic Black"} );
  cursor.each(function(err, doc) {
    assert.equal(err, null);
    if (doc !== null) {
      console.log(doc);
    } else {
      callback();
    }
  });
};
I just don't know how to get the argument in there, or if it's even possible with the current code setup. Any thoughts would be much appreciated.
var findMaterials = function(db, value, callback) {
  var cursor = db.collection('materials').find( {"material_name": value} );
  cursor.each(function(err, doc) {
    assert.equal(err, null);
    if (doc !== null) {
      console.log(doc);
    } else {
      callback();
    }
  });
};
As for how you're calling findMaterials(), that's a different question, but something like this should do the trick:
function doAllTheWork(url, value) {
  MongoClient.connect(url, function(err, db) {
    assert.equal(null, err);
    findMaterials(db, value, function() {
      db.close();
    });
  });
}
Pass value in as some sort of parameter. If you're building an API, pluck it out of the JSON; if you're running all of this from the command line, do the same via the command-line arguments.
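For example, a rough sketch of the command-line case (the connection URL is just a placeholder; use your own):

// usage: node search.js "Cosmic Black"
var searchValue = process.argv[2]; // the first user-supplied command-line argument
doAllTheWork('mongodb://127.0.0.1:27017/test', searchValue);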
I am trying to work with Twitter data in Node and am running into some roadblocks that I think are related to Node-style coding. The block of code is meant to grab the tweets, check if the text is in Mongo, and if not, insert it.
The first stumbling block I've found is that when I try to print i to the console, the loop always runs through every i before it starts iterating through the cursor. I think if I could clear that up, it would help me going forward. Is this enough info to help me out?
What I have is:
T.get('statuses/user_timeline', options, function(err, data) {
  var db = MongoClient.connect('mongodb://127.0.0.1:27017/test', function(err, db) {
    if (err)
      throw err;
    console.log("connected to the mongoDB !");
    myCollection = db.collection('test_collection2');
    for (var i = 0; i < data.length; i++) {
      //let's wrap this in a loop
      docu = data[i];
      //console.dir(data);
      console.dir(i);
      var cursor = myCollection.find({text: data[i].text}).limit(1);
      cursor.each(function(err, doc) {
        if (doc != null) {
          console.dir('doc is not null');
          console.dir(doc.text);
        } else {
          console.dir('doc is null - inserting');
          myCollection.insert(docu, function(err, records) {
            console.log("Record added as " + records.text);
          });
        }
      })
    }
  });
})
The problem is simply that JavaScript is asynchronous: the loop is finished before the find call to Mongo gives you a return value.
I would do the following, or something similar, just to explain the concept:
T.get('statuses/user_timeline', options, function(err, data) {
  var db = MongoClient.connect('mongodb://127.0.0.1:27017/test', function(err, db) {
    if (err)
      throw err;
    console.log("connected to the mongoDB !");
    myCollection = db.collection('test_collection2');

    var myfunction = function(correct_i, docu) {
      var cursor = myCollection.find({text: data[correct_i].text}).limit(1);
      cursor.each(function(err, doc) {
        if (doc != null) {
          console.dir('doc is not null');
          console.dir(doc.text);
        } else {
          console.dir('doc is null - inserting');
          myCollection.insert(docu, function(err, records) {
            console.log("Record added as " + records.text);
          });
        }
      })
    };

    for (var i = 0; i < data.length; i++) {
      //let's wrap this in a loop
      docu = data[i];
      //console.dir(data);
      console.dir(i);
      myfunction(i, docu);
    }
  });
})
This way, each lookup/find against Mongo will have the correct i, because the function receives it as a parameter.
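As a side note (not part of the original answer): if your Node version supports ES2015 block scoping, declaring the loop variables with let gives each iteration its own binding, which achieves the same effect without the wrapper function. A sketch:

for (let i = 0; i < data.length; i++) {
  let docu = data[i];
  // `i` and `docu` are block-scoped, so the callbacks below see the values from their own iteration
  myCollection.find({ text: data[i].text }).limit(1).each(function(err, doc) {
    if (doc != null) {
      console.dir('doc is not null');
    } else {
      myCollection.insert(docu, function(err, records) {
        console.log("Record added as " + records.text);
      });
    }
  });
}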