download pdf from URL into gDrive - javascript

I need to download a PDF from a link in the following format
fileURL = "https://docs.google.com/feeds/download/documents/export/Export?id=<...DOCID...>&revision=3970&exportFormat=pdf"
and add it to a gDrive folder.
I have this code, but the generated file just contains "Blob" rather than the actual content:
function dFile(fileName, fileURL) {
  var response = UrlFetchApp.fetch(fileURL);
  var fileBlob = response.getBlob().getAs('application/pdf');
  var folder = DriveApp.getFolderById('..folderID..');
  var result = folder.createFile(fileName, fileBlob, MimeType.PDF);
  Logger.log("file created");
}
How do I download the actual PDF?
Update:
I have updated my code and now I get this as the generated PDF, which makes me think I need to authenticate. I'm not sure how to do it, though; I already set up all the auth scopes in the manifest:
function dFile(fileName, fileURL) {
  var response = UrlFetchApp.fetch(fileURL);
  var fileBlob = response.getBlob().getAs('application/pdf');
  var folder = DriveApp.getFolderById('..folderID..');
  var result = folder.createFile(fileBlob).setName(fileName);
  Logger.log("file created");
}

In your script, how about the following modification?
From:
var response = UrlFetchApp.fetch(fileURL);
var fileBlob = response.getBlob().getAs('application/pdf');
To:
var response = UrlFetchApp.fetch(fileURL, { headers: { authorization: "Bearer " + ScriptApp.getOAuthToken() } });
var fileBlob = response.getBlob();
I thought that for your endpoint, getBlob() already returns the PDF format, so getAs('application/pdf') is not required.
Your script uses createFile, so the required Drive scope has already been included. But if an error related to the Drive API occurs, please enable the Drive API at Advanced Google services.
Note:
In your endpoint, if revision=3970 does not exist, an error occurs. Please be careful about this.
Reference:
getOAuthToken()
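
For completeness, here is the whole function with that modification applied. This is a sketch only; the folder ID placeholder is carried over from the question:

function dFile(fileName, fileURL) {
  // Send the script's own OAuth token so the export endpoint authorizes the request.
  var response = UrlFetchApp.fetch(fileURL, {
    headers: { authorization: "Bearer " + ScriptApp.getOAuthToken() }
  });
  var fileBlob = response.getBlob(); // already PDF for this export endpoint
  var folder = DriveApp.getFolderById('..folderID..');
  var result = folder.createFile(fileBlob).setName(fileName);
  Logger.log("file created");
}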


Node JS with Axios. How to get extension of the image from url

I am trying to download an image from a URL address and save it on my server. So, for example, I make a POST request with the URL of the image, download the image, and save it on my server. The problem comes when I need to figure out the extension of the image. Right now it works statically only for jpg files, but it should work for png as well. How can I find out the extension of the file before saving it?
One way would be to get the extension from the url itself, but not all urls will have the extension , for example: https://media.istockphoto.com/photos/winter-in-the-sequoias-picture-id1292624259
This is the code I have right now. It works, but as I said, it's static and only works for jpg:
const axios = require('axios');
const fs = require('fs');

var config = {
  responseType: 'stream'
};

async function getImage(url) {
  let time = Math.floor(Date.now() / 1000)
  let resp = await axios.get(url, config)
  resp.data.pipe(fs.createWriteStream(time + '.jpg')) // here I need the real image extension instead of a static '.jpg'
}
You can use the response headers for that. The Content-Type header should tell you the type of the file, and from Content-Disposition you can get the filename, including its extension.
In your code you can access these headers like this:
resp.headers['content-type'];
resp.headers['content-disposition'];
I'd suggest using a module such as mime to get the extension from the content-type.
Complete example:
const axios = require('axios');
const fs = require('fs');
const mime = require('mime');

var config = {
  responseType: 'stream'
};

async function getImage(url) {
  let time = Math.floor(Date.now() / 1000)
  let resp = await axios.get(url, config)
  const contentLength = resp.headers['content-length'];
  const contentType = resp.headers['content-type'];
  const extension = mime.extension(contentType); // mime v1 API; newer versions use mime.getExtension(contentType)
  console.log(`Content type: ${contentType}`);
  console.log(`Extension: ${extension}`);
  const fileName = time + "." + extension;
  console.log(`Writing ${contentLength} bytes to file ${fileName}`);
  resp.data.pipe(fs.createWriteStream(fileName));
}

const url = 'https://media.istockphoto.com/photos/winter-in-the-sequoias-picture-id1292624259';
getImage(url)
This will give an output somewhat like:
Content type: image/jpeg
Extension: jpeg
Writing 544353 bytes to file 1638867349.jpeg
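
If you also want the original filename rather than a timestamp, the Content-Disposition header mentioned above can be parsed too. A minimal sketch; the header is only present when the server sends it, so the fallback path is an assumption:

function fileNameFromHeaders(headers, fallback) {
  // e.g. content-disposition: attachment; filename="photo.png"
  const disposition = headers['content-disposition'];
  if (disposition) {
    const match = disposition.match(/filename="?([^";]+)"?/);
    if (match) return match[1];
  }
  return fallback; // header missing: fall back to the timestamp-based name
}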

google appscript DocumentApp invalid argument error

Very simple use of the API, just like the example at https://developers.google.com/apps-script/reference/document/document-app#openbyurlurl, but I'm getting this error and can't understand why. I get the same error if I try openById() instead.
After some further testing, I believe this error occurs because the MIME type of the file is not a native Google Drive document (Doc, Sheet, Slide) but a PDF. I was able to get this working by declaring the MIME type:
var srcfolderId = "1vaAInpB8ACPiYi12A7F2yuOEU7EwNzbI"; // <--- Please input folder ID.
var files = DriveApp.getFolderById(srcfolderId).getFilesByType(MimeType.PDF);
while (files.hasNext()) {
  var file = files.next();
  var fileBlob = file.getBlob();
  var resource = {
    title: fileBlob.getName(),
    mimeType: fileBlob.getContentType()
  };
  var options = {
    ocr: true // convert the PDF to a Google Doc with OCR
  };
  // Enable Drive API under Advanced Google services (this uses Drive API v2).
  var docFile = Drive.Files.insert(resource, fileBlob, options);
  Logger.log(docFile.title + " " + docFile.alternateLink);
  // Extract text from the converted PDF file
  var doc = DocumentApp.openById(docFile.id);
  var text = doc.getBody().getText();
}
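
Note that Drive.Files.insert belongs to the Drive API v2 advanced service. If your project has the v3 advanced service enabled instead, the equivalent call (a sketch, untested here) sets the target MIME type on the resource to trigger the conversion:

// Drive API v3 advanced service: convert a PDF blob to a Google Doc.
// The target mimeType on the resource requests the conversion; ocrLanguage is optional.
var docFile = Drive.Files.create(
  { name: fileBlob.getName(), mimeType: MimeType.GOOGLE_DOCS },
  fileBlob,
  { ocrLanguage: 'en' }
);
Logger.log(docFile.name + " " + docFile.id);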

not able to fetch text data from web url using javascript

I need to extract text data from a web URL (http://www.africau.edu/images/default/sample.pdf).
I used two node modules.
1) crawler-request
it('Read Pdf Data using crawler', function () {
  const crawler = require('crawler-request');
  function response_text_size(response) {
    response["size"] = response.text.length;
    return response;
  }
  crawler("http://www.africau.edu/images/default/sample.pdf", response_text_size).then(function (response) {
    // handle response
    console.log("Response = " + response.size);
  });
});
With this, nothing is printed to the console.
2) pfd2json/pdfparser
it('Read Data from url', function () {
  var request = require('request');
  var pdf = require('pfd2json/pdfparser');
  var fs = require('fs');
  var pdfUrl = "http://www.africau.edu/images/default/sample.pdf";
  let databuffer = fs.readFileSync(pdfUrl);
  pdf(databuffer).then(function (data) {
    var arr:Array<String> = data.text;
    var n = arr.includes('Thursday 02 May');
    console.log("Print Array " + n);
  });
});
Failed: ENOENT: no such file or directory, open 'http://www.africau.edu/images/default/sample.pdf'
I am able to access the data from a local path, but not able to extract it from a URL.
The issue here is that you are using the fs module (File System) to read a file on a distant server.
You also mistyped the pdf2json module (you wrote pfd2json), which should itself throw an error.
You did require the request module, which makes it possible to access that distant file. Here's one way to do this:
it('Read Data from url', function () {
  var request = require('request');
  var PDFParser = require('pdf2json');
  var pdfUrl = 'http://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf';
  var pdfParser = new PDFParser(this, 1);

  // executed if the parser fails for any reason
  pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
  // executed when the parser has finished
  pdfParser.on("pdfParser_dataReady", pdfData => console.log(pdfParser.getRawTextContent()));

  // request the pdf's file content, then call the pdf parser on the retrieved buffer
  request({ url: pdfUrl, encoding: null }, (error, response, body) => pdfParser.parseBuffer(body));
});
This makes it possible to load the distant .pdf file into your program; it simply outputs the textual content of the .pdf file once the parser has finished reading. I'd recommend looking at the pdf2json documentation if you want to do more.
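
As an aside, the request module has since been deprecated. The same approach works with axios (already used elsewhere on this page); a sketch, assuming pdf2json's parseBuffer as above:

const axios = require('axios');
const PDFParser = require('pdf2json');

async function readPdfText(pdfUrl) {
  const pdfParser = new PDFParser(null, 1);
  pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
  pdfParser.on("pdfParser_dataReady", () => console.log(pdfParser.getRawTextContent()));

  // Fetch the raw bytes (responseType 'arraybuffer' keeps the body binary).
  const resp = await axios.get(pdfUrl, { responseType: 'arraybuffer' });
  pdfParser.parseBuffer(Buffer.from(resp.data));
}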

How to download multiple files from Google Drive as .zip using JSZip on Salesforce

The case:
On the Salesforce platform I use Google Drive to store files (images in this case) with the configured Apex Google Drive API Framework, so the Google Drive API handles the authToken and so on. I can upload and browse images in my application. In my case I want to select multiple files and download them in a single zip file. So far I'm trying to do that using the JSZip and FileSaver libraries. With the code below I can zip and download multiple files stored somewhere else with the proper response header, but not from GDrive, because of a CORS error:
https://xxx.salesforce.com/contenthub/download/XXXXXXXXXX%3Afile%XXXXXX_XXXXXXXXX. No 'Access-Control-Allow-Origin' header is present on the requested resource. Origin 'https://xxx.visual.force.com' is therefore not allowed access.
If I just click on this link, the file starts to download.
Is there any way to configure GDrive to send the response header Access-Control-Allow-Origin: * or Access-Control-Allow-Origin: https://*.mydomain.com, or do I have to use something else, maybe server-side compression? Right now I am using the download link provided by the Apex Google Drive API (it looks like this:
https://xxx.salesforce.com/contenthub/download/XXXXXXXXXXX%3Afile%XXXXXXXX); it works fine when used as src="fileURL" or when pasted directly into the browser. The GDrive connector adds the 'access_token' and so on.
My code:
// ajax request to get files using JSZipUtils
let urlToPromise = (url) => {
  return new Promise(function (resolve, reject) {
    JSZipUtils.getBinaryContent(url, function (err, data) {
      if (err) {
        reject(err);
      } else {
        resolve(data);
      }
    });
  });
};

this.downloadAssets = () => {
  let zip = new JSZip();
  // here 'selectedAssets' is an array of objects, each with 'assetFiles'
  // holding the fileURL. Download and add them to 'zip' one by one.
  for (var a of this.selectedAssets) {
    for (let f of a.assetFiles) {
      let url = f.fileURL;
      let name = a.assetName + "." + f.fileType;
      let filename = name.replace(/ /g, "");
      zip.file(filename, urlToPromise(url), { binary: true });
    }
  }
  // generate the zip and download it using 'FileSaver.js'
  zip.generateAsync({ type: "blob" })
    .then(function callback(blob) {
      saveAs(blob, "test.zip");
    });
};
I also tried changing let url = f.fileURL to let url = f.fileURL + '?alt=media'; with &access_token=CURRENT_TOKEN added by the GDrive connector.
That link is handled by the GDrive connector, so if I just enter it in the browser it downloads the image. However, for multiple downloads using JS I get the CORS error.
I think this feature is not yet supported. If you check the Download Files guide from the Drive API, there's no mention of downloading multiple files at once. That's because you have to make an individual API request for each file. This is confirmed in this SO thread.
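A sketch of what that per-file pattern could look like against the standard Drive v3 download endpoint; the accessToken variable is an assumption (on Salesforce it would come from the Apex connector):

// Fetch each Drive file by ID with alt=media and add it to the zip.
async function zipDriveFiles(fileIds, accessToken) {
  const zip = new JSZip();
  for (const id of fileIds) {
    const resp = await fetch(
      'https://www.googleapis.com/drive/v3/files/' + id + '?alt=media',
      { headers: { Authorization: 'Bearer ' + accessToken } }
    );
    zip.file(id, await resp.blob()); // one request per file, as the guide implies
  }
  return zip.generateAsync({ type: 'blob' });
}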
But converting the selected multiple files into a single zip file and downloading that single zip is possible with the Google Drive API. So how can I convert them into a single zip file? Please tell me.
My suggestion: download all the files and store them in a temporary directory, then add that directory to a zip file and save the zip to the device.
public Entity.Result<Entity.GoogleDrive> DownloadMultipleFile(string[] fileidList)
{
    var result = new Entity.Result<Entity.GoogleDrive>();
    ZipFile zip = new ZipFile(); // DotNetZip archive
    try
    {
        var service = new DriveService(new BaseClientService.Initializer()
        {
            HttpClientInitializer = credential,
            ApplicationName = "Download File",
        });
        FilesResource.ListRequest listRequest = service.Files.List();
        //listRequest.PageSize = 10;
        listRequest.Fields = "nextPageToken, files(id, name, mimeType, fullFileExtension)";
        IList<File> files = listRequest.Execute().Files;
        if (files != null && files.Count > 0)
        {
            foreach (var fileid in fileidList)
            {
                foreach (var file in files)
                {
                    if (file.Id == fileid)
                    {
                        result.Data = new Entity.GoogleDrive { FileId = fileid };
                        FilesResource.GetRequest request = service.Files.Get(fileid);
                        var stream = new System.IO.FileStream(HttpContext.Current.Server.MapPath(@"~\TempFiles") + "\\" + file.Name, System.IO.FileMode.Create, System.IO.FileAccess.Write);
                        request.MediaDownloader.ProgressChanged += (IDownloadProgress progress) =>
                        {
                            switch (progress.Status)
                            {
                                case DownloadStatus.Downloading:
                                    break;
                                case DownloadStatus.Completed:
                                    break;
                                case DownloadStatus.Failed:
                                    break;
                            }
                        };
                        // Download the file content synchronously into the temp file.
                        request.Download(stream);
                        stream.Close();
                        break;
                    }
                }
            }
        }
        // Zip the temp directory and save the archive to the user's Downloads folder.
        zip.AddDirectory(HttpContext.Current.Server.MapPath(@"~\TempFiles"), "GoogleDrive");
        string pathUser = Environment.GetFolderPath(Environment.SpecialFolder.UserProfile);
        string pathDownload = System.IO.Path.Combine(pathUser, "Downloads");
        zip.Save(pathDownload + "\\GoogleDrive.zip");
        // Clean up the temporary files.
        System.IO.DirectoryInfo di = new System.IO.DirectoryInfo(HttpContext.Current.Server.MapPath(@"~\TempFiles"));
        foreach (var file in di.GetFiles())
        {
            file.Delete();
        }
        result.IsSucceed = true;
        result.Message = "File downloaded successfully";
    }
    catch (Exception ex)
    {
        result.IsSucceed = false;
        result.Message = ex.ToString();
    }
    return result;
}
My previously published code works; I forgot to post the solution.
Instead of using the content hub link I started to use a direct link to Google Drive, and the CORS issue was solved. I'm still not sure whether CORS could be solved on the Salesforce side; I tried different setups with no luck.
The direct download link to GDrive works fine in my case. The only thing I had to change is the URL prefix before the GDrive file ID.
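
For illustration, the change amounts to swapping the URL prefix. The direct-download form below is the commonly used Drive pattern; treat it as an assumption, since the poster doesn't spell out the exact URL:

// Before: Salesforce content hub link (blocked by CORS when fetched via JS)
// let url = 'https://xxx.salesforce.com/contenthub/download/' + fileId;
// After: direct Google Drive download link for the same file ID
let url = 'https://drive.google.com/uc?export=download&id=' + fileId;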

Apps script write to Big Query unknown error

This is supposed to read in a CSV and then write it to BigQuery. When it runs, however, nothing is written and there are no errors logged. I read that I need to write a CSV and then turn it into an octet stream, but I am not sure whether this is compatible with Google BigQuery.
function test() {
  try {
    var tableReference = BigQuery.newTableReference();
    tableReference.setProjectId(PROJECT_ID);
    tableReference.setDatasetId(datasetId);
    tableReference.setTableId(tableId);
    var schema = "CUSTOMER:string, CLASSNUM:integer, CLASSDESC:string, CSR:string, CSR2:string, INSURANCE:string, REFERRALGENERAL:string, REFERRALSPECIFIC:string, NOTES:string, INMIN:integer, INHR:integer, OUTMIN:integer, OUTHR:integer, WAITMIN:integer, WAITHR:integer, DATETIMESTAMP:float, DATEYR:integer, DATEMONTH:integer, DATEDAY:integer";
    var load = BigQuery.newJobConfigurationLoad();
    load.setDestinationTable(tableReference);
    load.setSourceUris(URIs);
    load.setSourceFormat('NEWLINE_DELIMITED_JSON');
    load.setSchema(schema);
    load.setMaxBadRecords(0);
    load.setWriteDisposition('WRITE_TRUNCATE');
    var configuration = BigQuery.newJobConfiguration();
    configuration.setLoad(load);
    var newJob = BigQuery.newJob();
    newJob.setConfiguration(configuration);
    var loadr = DriveApp.getFilesByName("test.csv");
    var x = loadr.next().getBlob();
    Logger.log(x.getDataAsString());
    var d = DriveApp.getFilesByName("test.csv");
    var id = d.next().getId();
    Logger.log(id);
    var data = DocsList.getFileById(id).getBlob().getDataAsString();
    var mediaData = Utilities.newBlob(data, 'application/octet-stream');
    BigQuery.Jobs.insert(newJob, PROJECT_ID, mediaData);
  }
  catch (error) {
    Logger.log("A" + error.message);
  }
}
Your sourceFormat is wrong for CSV files:
The format of the data files. For CSV files, specify "CSV". For datastore backups, specify "DATASTORE_BACKUP". For newline-delimited JSON, specify "NEWLINE_DELIMITED_JSON". The default value is CSV.
https://developers.google.com/bigquery/docs/reference/v2/jobs#configuration.load.sourceUris
On the other hand, I think you don't need load.setSourceUris(URIs) at all, since you are trying to load from a local file and not from Google Cloud Storage. Check this Python example: https://developers.google.com/bigquery/loading-data-into-bigquery
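
Applied to the script above, the fix would look something like this (a sketch; only the two lines in question change):

var load = BigQuery.newJobConfigurationLoad();
load.setDestinationTable(tableReference);
// The file is uploaded as media data, so no Cloud Storage URIs are needed.
// load.setSourceUris(URIs);
load.setSourceFormat('CSV'); // the data is CSV, not newline-delimited JSON
load.setSchema(schema);
load.setMaxBadRecords(0);
load.setWriteDisposition('WRITE_TRUNCATE');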
