I am uploading files to a GraphQL API with plain JS. I've been doing this from the same origin for months, and now I am trying to implement the exact same thing externally with Node.js.
My code looks something like this:
const FormData = require('form-data');
const fs = require('fs');
const axios = require('axios');

const payload = generateRequest(input);

axios.post(apiBaseUrl + "/graphql", payload, {
  headers: { ...payload.getHeaders() }
}).then(response => {
  let res = response.data;
  if (res.data.doSomething.status === 200) {
    console.log("success!");
  } else {
    console.error(res.data.doSomething.message);
  }
}).catch(err => {
  if (err.response) {
    console.log(err.response.data);
    console.log(err.response.status);
    console.log(err.response.headers);
  }
});
The generateRequest function builds the multipart payload according to the GraphQL multipart request spec (https://github.com/jaydenseric/graphql-multipart-request-spec).
I have two identical versions of the backend running on localhost:5000 and mycooldomain.com. When setting apiBaseUrl to http://localhost:5000, everything works flawlessly. However, just by changing the URL to https://www.mycooldomain.com, I get a 400 error thrown at me with { errors: [ { message: 'Must provide query string.' } ] }
BTW: A simple query works with both URLs...
Here is my generateRequest function:
function generateRequest(input) {
  const mutation = `
    mutation MyMutation($xyz: String) {
      doSomething(myInput: $xyz) {
        status
        message
      }
    }
  `;

  let sendData = new FormData();
  const fileNull = [];

  // map
  const files = input.files;
  let map = '{';
  for (let i = 0; i < files.length; i++) {
    fileNull.push(null);
    map += `"${i}": ["variables.files.${i}"], `;
  }
  map = map.substring(0, map.length - 2);
  map += '}';

  // operations
  const operations = JSON.stringify({
    query: mutation,
    variables: {
      "xyz": "Hello"
    }
  });

  // build payload
  sendData.append("operations", operations);
  sendData.append("map", map);
  for (let i = 0; i < files.length; i++) {
    sendData.append(i, files[i]);
  }

  return sendData;
}
I know that map looks a bit ugly, but that's not the point (unless it is).
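As an aside, the same map can be built without string surgery by collecting entries in a plain object and serializing it once. A small sketch reusing the names above:

// Equivalent map construction: build an object, then JSON.stringify it
const mapObj = {};
for (let i = 0; i < files.length; i++) {
  mapObj[i] = [`variables.files.${i}`];
}
sendData.append("map", JSON.stringify(mapObj));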
Has anyone had a similar problem, or does anyone know what my mistake is here?
Thanks!
I dropped the axios dependency and made the request with form-data directly.
The code below works.
function makeRequest(formData, options) {
  formData.submit(options, (err, res) => {
    if (err) {
      console.error(err.message);
      return;
    }
    if (res.statusCode < 200 || res.statusCode > 299) {
      console.error(`HTTP status code ${res.statusCode}`);
    }
    const body = [];
    res.on('data', (chunk) => body.push(chunk));
    res.on('end', () => {
      const resString = Buffer.concat(body).toString();
      console.log(resString);
    });
  });
}

const options = {
  host: 'mycooldomain.com',
  port: 443,
  path: '/graphql',
  method: 'POST',
  protocol: 'https:'
};

makeRequest(payload, options);
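If you prefer a promise-based call site, form-data's submit callback is easy to wrap. A minimal sketch along the same lines (makeRequestAsync is a hypothetical helper, not part of the form-data API):

// Wrap form-data's submit() in a Promise resolving with status and body
function makeRequestAsync(formData, options) {
  return new Promise((resolve, reject) => {
    formData.submit(options, (err, res) => {
      if (err) return reject(err);
      const body = [];
      res.on('data', (chunk) => body.push(chunk));
      res.on('end', () => resolve({
        statusCode: res.statusCode,
        body: Buffer.concat(body).toString()
      }));
    });
  });
}

// Usage:
// makeRequestAsync(payload, options).then(({ statusCode, body }) => console.log(statusCode, body));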
I am trying to build a web scraper that downloads all the PDFs on a website. I've written all the logic necessary to do this, but for some reason it downloads an empty PDF file, which is not supposed to happen. The problem seems to come from the downloadFiles function when I try to pipe the data, which for some reason does not seem to work, because I get an empty PDF file after the function has run. I would appreciate it if someone could help me out with this problem, thanks.
Here's a sample of my code:
app.js
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const axiosInstance = require("./getAxios");

const axios = axiosInstance();
const Surl = "https://www.health.gov.ng/";
// linkList sample: "https://www.health.gov.ng/index.php?option=com_content&view=article&id=143&Itemid=512";
let connectionFailCount = 0;
let linkList = [];
let dlinkList = [];

const getWebsiteLinks = async (Surl) => {
  try {
    console.log(`Crawling all links from: ${Surl}`);
    const response = await axios.get(Surl);
    const $ = cheerio.load(response.data);
    const ranges = $("a").each(function (idx, el) {
      if ($(el).attr("href")) {
        return $(el).attr("href");
      }
    });
    for (let index = 0; index < ranges.length; index++) {
      let raw_links = $("a")[index].attribs.href;
      if (raw_links.startsWith("/")) {
        linkList.push(Surl + raw_links);
      }
    }
    if (linkList.length > 0) {
      console.log(`Finished crawling links: Found ${linkList.length} links`);
      console.log(
        "--------------------------------------------------------\n\n"
      );
    }
    return;
  } catch (error) {
    if (connectionFailCount === 0) {
      connectionFailCount += 1;
      getWebsiteLinks(Surl);
      console.log(`Connection error. \n
      Reconnecting to server....`);
    } else if (connectionFailCount === 5) {
      console.error(`Cannot connect to server. Try again later.`);
    }
  }
};

const downloadLinks = async (linkList) => {
  try {
    console.log("Crawling links to find pdf links. This may take a while...");
    for (const link of linkList) {
      const response = await axios.get(link);
      // Skip where there's a delayed server response
      if (response.code === "ECONNRESET") continue;
      const $ = cheerio.load(response.data);
      $("a").each(function (idx, el) {
        if ($(el)?.attr("href")?.endsWith(".pdf")) {
          let addr = $(el).attr("href");
          let dlink = Surl + addr;
          dlinkList.push({
            pathName: addr,
            url: dlink,
          });
        }
      });
    }
    console.log(dlinkList);
    if (dlinkList.length > 0) {
      console.log(`Crawling finished: Found ${dlinkList.length} pdf links`);
      console.log(
        "--------------------------------------------------------\n\n"
      );
    }
  } catch (error) {
    if (connectionFailCount === 0) {
      connectionFailCount += 1;
      console.log(`Connection error. \n
      Reconnecting to server: ${connectionFailCount} count`);
      downloadLinks(linkList);
    }
    if (connectionFailCount === 3) {
      console.error(`Cannot connect to server. Try again later.`);
      return;
    }
    // console.error("downloadLinksError: ", error);
  }
};

const downloadFiles = async (dlinkList) => {
  console.log("Creating directory to save PDF files");
  const appRoot = path.dirname(path.resolve(__dirname));
  // Had to change and restructure code due to error
  const folderName = `PDF/${Surl.split("/").pop()}`;
  const subFolderName = Surl.split("/").pop();
  try {
    if (!fs.existsSync(path.join(appRoot, folderName))) {
      fs.mkdirSync(path.join(appRoot, "PDF"));
      fs.mkdirSync(path.join(`${appRoot}/PDF`, subFolderName));
    }
    dlinkList.forEach(async (link) => {
      let name = link.pathName;
      let url = link.url;
      let file = fs
        .createWriteStream(
          `${appRoot}/${folderName}/${name.split("/").pop()}`,
          "utf-8"
        )
        .on("error", (err) => {
          console.error("createWriteStreamError: ", err);
        });
      try {
        console.log("Downloading PDF file...");
        const { data } = await axios({
          url,
          method: "GET",
          responseType: "stream",
        });
        if (data) {
          console.log("PDF file Downloaded");
          data.pipe(file);
        }
      } catch (error) {
        console.error(error);
      }
    });
    return;
  } catch (error) {
    console.error("downloadFilesError: ", error);
  }
};

(async () => {
  await getWebsiteLinks(Surl);
  await downloadLinks(linkList);
  await downloadFiles(dlinkList);
})();
getAxios.js
const axios = require("axios");
const https = require("https");

module.exports = function () {
  const domain = "https://www.health.gov.ng/";
  let instance;
  if (!instance) {
    // create axios instance
    instance = axios.create({
      baseURL: domain,
      timeout: 60000, // Increase timeout in case of network delay or delayed server response
      maxContentLength: 500 * 1000 * 1000, // Increase maximum response data length
      httpsAgent: new https.Agent({ keepAlive: true }),
      headers: { "Content-Type": "application/xml" },
    });
  }
  return instance;
};
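One thing worth flagging in downloadFiles above, given that the symptom is empty files: data.pipe(file) returns immediately, and forEach(async ...) does not wait for any of the writes, so the process can exit before the streams finish. A hedged sketch of awaiting each download with stream.pipeline (assumes Node 15+ for stream/promises; reuses appRoot, folderName, fs, and the axios instance from above, and drops the unnecessary "utf-8" encoding on the write stream):

const { pipeline } = require("stream/promises");

// Reworked loop body: await each pipe so the file is fully flushed to disk
for (const link of dlinkList) {
  const dest = `${appRoot}/${folderName}/${link.pathName.split("/").pop()}`;
  const { data } = await axios({ url: link.url, method: "GET", responseType: "stream" });
  await pipeline(data, fs.createWriteStream(dest)); // resolves on finish, rejects on error
  console.log(`Saved ${dest}`);
}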
Background
The JavaScript library for Microsoft Office add-ins lets you get the raw content of the DOCX file through the getFileAsync() API, which returns a slice of up to 4MB in one go. You keep calling the function using a sliding-window approach until you have read the entire content. I need to upload these slices to the server and then join them back to recreate the original DOCX file.
My attempt
I'm using axios on the client side and the busboy-based express-chunked-file-upload middleware on my node server. As I call getFileAsync recursively, I get a raw array of bytes that I then convert to a Blob and append to FormData before posting it to the node server. The entire flow works and I get the slice on the server. However, the chunk that gets written to disk on the server is much larger than the blob I uploaded, normally on the order of 3 times larger, so it is obviously not what I sent.
My suspicion is that this may have to do with stream encoding, but the node middleware does not expose any options to set encoding.
Here is the current state of the code:
Client-side
public sendActiveDocument(uploadAs: string, sliceSize: number): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    Office.context.document.getFileAsync(Office.FileType.Compressed,
      { sliceSize: sliceSize },
      async (result) => {
        if (result.status == Office.AsyncResultStatus.Succeeded) {
          // Get the File object from the result.
          const myFile = result.value;
          const state = {
            file: myFile,
            filename: uploadAs,
            counter: 0,
            sliceCount: myFile.sliceCount,
            chunkSize: sliceSize
          } as getFileState;
          console.log("Getting file of " + myFile.size + " bytes");
          const hash = makeId(12);
          this.getSlice(state, hash).then(() => resolve(true));
        } else {
          resolve(false);
        }
      });
  });
}

private async getSlice(state: getFileState, fileHash: string): Promise<boolean> {
  const result = await this.getSliceAsyncPromise(state.file, state.counter);
  if (result.status == Office.AsyncResultStatus.Succeeded) {
    const data = result.value.data;
    if (data) {
      const formData = new FormData();
      formData.append("file", new Blob([data]), state.filename);
      const boundary = makeId(12);
      const start = state.counter * state.chunkSize;
      const end = (state.counter + 1) * state.chunkSize;
      const total = state.file.size;
      return await Axios.post('/upload', formData, {
        headers: {
          "Content-Type": `multipart/form-data; boundary=${boundary}`,
          "file-chunk-id": fileHash,
          "file-chunk-size": state.chunkSize,
          "Content-Range": 'bytes ' + start + '-' + end + '/' + total,
        },
      }).then(async res => {
        if (res.status === 200) {
          state.counter++;
          if (state.counter < state.sliceCount) {
            return await this.getSlice(state, fileHash);
          } else {
            this.closeFile(state);
            return true;
          }
        } else {
          return false;
        }
      }).catch(err => {
        console.log(err);
        this.closeFile(state);
        return false;
      });
    } else {
      return false;
    }
  } else {
    console.log(result.status);
    return false;
  }
}

private getSliceAsyncPromise(file: Office.File, sliceNumber: number): Promise<Office.AsyncResult<Office.Slice>> {
  return new Promise(function (resolve) {
    file.getSliceAsync(sliceNumber, result => resolve(result));
  });
}
Server-side
This code is taken verbatim from the npm package (link above), so I'm not supposed to change anything here, but I include it for reference:
makeMiddleware = () => {
  return (req, res, next) => {
    const busboy = new Busboy({ headers: req.headers });
    busboy.on('file', (fieldName, file, filename, _0, _1) => {
      if (this.fileField !== fieldName) { // Current field is not handled.
        return next();
      }
      const chunkSize = req.headers[this.chunkSizeHeader] || 500000; // Default: 500Kb.
      const chunkId = req.headers[this.chunkIdHeader] || 'unique-file-id'; // If not specified, will reuse same chunk id.
      // NOTE: Using the same chunk id for multiple file uploads in parallel will corrupt the result.
      const contentRangeHeader = req.headers['content-range'];
      let contentRange;
      const errorMessage = util.format(
        'Invalid Content-Range header: %s', contentRangeHeader
      );
      try {
        contentRange = parse(contentRangeHeader);
      } catch (err) {
        return next(new Error(errorMessage));
      }
      if (!contentRange) {
        return next(new Error(errorMessage));
      }
      const part = contentRange.start / chunkSize;
      const partFilename = util.format('%i.part', part);
      const tmpDir = util.format('/tmp/%s', chunkId);
      this._makeSureDirExists(tmpDir);
      const partPath = path.join(tmpDir, partFilename);
      const writableStream = fs.createWriteStream(partPath);
      file.pipe(writableStream);
      file.on('end', () => {
        req.filePart = part;
        if (this._isLastPart(contentRange)) {
          req.isLastPart = true;
          this._buildOriginalFile(chunkId, chunkSize, contentRange, filename).then(() => {
            next();
          }).catch(_ => {
            const errorMessage = 'Failed merging parts.';
            next(new Error(errorMessage));
          });
        } else {
          req.isLastPart = false;
          next();
        }
      });
    });
    req.pipe(busboy);
  };
}
Update
So it looks like I have found the problem, at least. busboy appears to be writing my array of bytes as text in the output file. I get 80,75,3,4,20,0,6,0,8,0,0,0,33,0,44,25 (as text) when I upload the array of bytes [80,75,3,4,20,0,6,0,8,0,0,0,33,0,44,25]. Now I need to figure out how to force it to write a binary stream.
Figured it out. Just in case it helps anyone: there was no problem with busboy, office.js, or axios. I just had to convert the incoming chunk of data to a Uint8Array before creating a blob from it. So instead of:
formData.append("file", new Blob([data]), state.filename);
I now do this:
const blob = new Blob([ new Uint8Array(data) ])
formData.append("file", blob, state.filename);
And it worked like a charm.
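The reason, as far as I can tell, is that a plain Array is not a valid BlobPart, so the Blob constructor stringifies it, while a typed array is used as raw bytes. A quick check in the browser console shows the difference:

// A plain array is coerced to the string "80,75,3,4" (9 characters)
new Blob([[80, 75, 3, 4]]).size;                 // 9
// A Uint8Array is treated as binary data (4 bytes)
new Blob([new Uint8Array([80, 75, 3, 4])]).size; // 4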
Being new to JavaScript and Node.js, I ran into an error.
I'm attempting to scrape newspaper articles from a French newspaper using a password and search terms.
This is the code I'm using:
const fs = require("fs");
const request = require("request");
const cheerio = require("cheerio");

const jsonTab = []; // We create our table

function writeFile() {
  // Will write the json file
  fs.writeFile("output.json", JSON.stringify(jsonTab, null, 4), (err) => {
    console.log("File successfully written!");
  });
}

// The URL of the advanced search feature with our keywords
const url = 'http://www.lemonde.fr/recherche/?keywords="Rap+"+"hip-hop"+"hip%20hop"+"rappeur"+"rappeurs"+"raps"+"rappers"&page_num=1&operator=or&exclude_keywords=&qt=recherche_texte_title&author=&period=custom_date&start_day=01&start_month=01&start_year=1970&end_day=20&end_month=09&end_year=2017&sort=asc';

/* The first request call, our goal here is to get the number of results and then
to calculate the number of pages */
request(url, (error, response, html) => {
  const $ = cheerio.load(html);
  // All the variables we will use later
  let number;
  let description;
  let date;
  let title;
  let link;
  if (!error) {
    $(".bg_gris_clair").filter(function () {
      /* We want to select all the HTML
      elements with the class ".bg_gris_clair" (and we already know there is
      only one) */
      const data = $(this);
      const str = data.children("strong").text().trim();
      number = parseInt(str.substring(0, str.indexOf("e")).replace(/\s/g, ""), 10);
    });
  }
  let count = 1;
  for (let i = 1; i <= number / 10; i++) {
    const urlPerPage = 'http://www.lemonde.fr/recherche/?keywords="Rap+"+"hip-hop"+"hip%20hop"+"rappeur"+"rappeurs"+"raps"+"rappers"&page_num=' + i + "&operator=or&exclude_keywords=&qt=recherche_texte_title&author=&period=custom_date&start_day=01&start_month=01&start_year=1970&end_day=20&end_month=09&end_year=2017&sort=asc";
    request(urlPerPage, (err, response2, html2) => {
      if (!err) {
        const $ = cheerio.load(html2);
        $(".grid_11.omega.resultat").filter(function () {
          const json = {
            date: "",
            title: "",
            description: "",
            url: ""
          };
          const data = $(this);
          title = data.children("h3").children("a").text().trim();
          link = "http://lemonde.fr" + data.children("h3").children("a").attr("href").trim();
          description = data.children("p").text().trim();
          const dateStr = data.children("span").text();
          date = dateStr.replace(/.+?(?=\d)/, "");
          json.title = title;
          json.url = link;
          json.description = description;
          json.date = date;
          jsonTab.push(json);
        });
      } else if (err) {
        console.log(err);
      }
      count += 1;
      // Write the file once we iterated through all the pages.
      if (count === parseInt(number / 10, 10)) {
        writeFile();
      }
    });
  }
});
And this is the step 3 script (login and article scraping):

const fs = require("fs");
const request = require("request");
const cheerio = require("cheerio");

// Prepare all the variables needed later
let count = 0;
let timeout = 0;
const id = "myusername";
const mdp = "mypassword";
let obj;

// The URLs we will scrape from
const connexionUrl = "https://secure.lemonde.fr/sfuser/connexion";

// Will write an "output.json" file
function writeFile() {
  fs.writeFile("output.json", JSON.stringify(obj, null, 4), (err) => {
    console.log(
      "File successfully written! - Check your project directory for the output.json file"
    );
  });
}

// creating a clean jar to store the cookies
const j = request.jar();

// First GET request call
request(
  {
    url: connexionUrl,
    jar: j
  },
  (err, httpResponse, html) => {
    const $ = cheerio.load(html);
    // We use Cheerio to load the HTML and be able to find the connection__token
    const token = $("#connection__token")[0].attribs.value; // here is the connection__token
    // Construction of the form required in the POST request to log in
    const form = {
      "connection[mail]": id,
      "connection[password]": mdp,
      "connection[stay_connected]": 1,
      "connection[save]": "",
      "connection[_token]": token
    };
    // POST request to log in. Same url with "request headers" and the complete form.
    request.post(
      {
        url: connexionUrl,
        jar: j,
        headers: {
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
          "Accept-Encoding": "gzip, deflate, br",
          "Accept-Language": "fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4",
          "Cache-Control": "no-cache",
          "Content-Type": "application/x-www-form-urlencoded",
          Origin: "http://secure.lemonde.fr/",
          Host: "secure.lemonde.fr",
          "Upgrade-Insecure-Requests": 1,
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
          Connection: "keep-alive",
          Pragma: "no-cache",
          Referer: "https://secure.lemonde.fr/sfuser/connexion"
        },
        form: form
      },
      (error, response, body) => {
        // WE ARE CONNECTED :D
        /* Second GET request call: this time, we use the response of the POST
        request to request the right URL */
        request(
          {
            url: response.headers.location,
            jar: j
          },
          (err, httpResponse, html2) => {
            const json = fs.readFileSync("./firstStep.json"); // Load the JSON created in step one
            obj = JSON.parse(json); // We parse the JSON into a usable JavaScript object
            // forEach loop to iterate through all the objects and request each link
            obj.forEach((e) => {
              let articleUrl = e.url;
              /* We use a setTimeout to be sure that all the requests are performed
              one by one and not all at the same time */
              setTimeout(() => {
                request(
                  {
                    url: articleUrl,
                    jar: j
                  },
                  (error1, httpResponse, html3) => {
                    if (!error1) {
                      const $ = cheerio.load(html3); // load the HTML of the article page
                      $(".contenu_article.js_article_body").filter(function () {
                        const data = $(this);
                        // get the content, remove all the new lines (better for Excel)
                        let text = data
                          .text()
                          .trim()
                          .replace(/\n/g, "\t");
                        e.text = text; // push the content in the table
                      });
                      $(".txt3.description-article").filter(function () {
                        const data = $(this);
                        const description = data
                          .text()
                          .trim()
                          .replace(/\n/g, "\t");
                        e.description = description;
                      });
                    }
                  }
                );
                count += 1;
                // Write a new JSON file once we get the content of all the articles
                if (count === obj.length) {
                  writeFile();
                }
              }, timeout);
              timeout += 50; // increase the timeout length each time
            });
          }
        );
      }
    );
  }
);
Running step 3 throws this error:
Error: ENOENT: no such file or directory, open './firstStep.json'
The tutorial can be found here: https://www.freecodecamp.org/news/how-i-scraped-7000-articles-from-a-newspaper-website-using-node-1309133a5070/
I am more used to R, so this is a new playing field for me.
I suspect the problem is straightforward and related to the directory.
Thank you
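For what it's worth, a relative path like './firstStep.json' is resolved against the directory the node process was started from (process.cwd()), not against the script's own location, and fs.readFileSync throws ENOENT when nothing is there. Note also that the first script above writes output.json, not firstStep.json, so the file may simply need renaming. A small check along these lines shows where Node is actually looking:

const fs = require("fs");
const path = require("path");

console.log("cwd:", process.cwd());    // where node was started
console.log("script dir:", __dirname); // where this script lives

// Anchoring the path to the script's directory avoids the mismatch,
// assuming firstStep.json sits next to the script
const jsonPath = path.join(__dirname, "firstStep.json");
console.log("exists?", fs.existsSync(jsonPath));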
I'm uploading files from the browser via a multipart request to a GraphQL API powered by graphql-yoga, which in turn is powered by express.
Now I want to forward this exact same request body to another GraphQL API.
const fetch = require('node-fetch');

async function passThrough(args, opts) {
  const { body, getRawBody, headers, method } = opts.request;
  let rawBody;
  if (body.files && body.files.length) {
    rawBody = await getRawBody;
  } else {
    rawBody = typeof body == 'string' ? body : JSON.stringify(body);
  }
  const options = {
    body: rawBody,
    method,
    headers
  };
  const res = await fetch(otherApiUrl, options).then((res) => res.json());
  return res;
}
In this function I get the body as an object, but it includes the files as promises, which I can't simply forward (I couldn't find any way to do it). So I tried to get the raw body through an express middleware and access it as above with await getRawBody.
function getRawBody(req, res, next) {
  req.getRawBody = new Promise(resolve => {
    var buf = '';
    req.on('data', x => buf += x);
    req.on('end', () => {
      resolve(buf);
    });
  });
  next();
}

server.express.use(getRawBody);
It passes the request to the other API, but the files are no longer valid JPEGs. I found out that the content of the uploaded file is shifted by some bytes relative to the original file. What might I be doing wrong?
I found a solution here and adapted the function that gets the raw body. Now the file contents are no longer shifted on the target host.
const concatStream = require('concat-stream');

function getRawBody(req, res, next) {
  req.getRawBody = new Promise(resolve => {
    req.pipe(concatStream(function (data) {
      resolve(data);
    }));
  });
  next();
}
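The likely reason the first middleware corrupted the files: buf += x coerces each incoming Buffer to a UTF-8 string, which mangles binary data. The same fix works without the extra dependency by collecting the raw Buffers and joining them, a minimal sketch equivalent to the concat-stream version:

function getRawBody(req, res, next) {
  req.getRawBody = new Promise(resolve => {
    const chunks = [];
    req.on('data', chunk => chunks.push(chunk));         // keep raw Buffers
    req.on('end', () => resolve(Buffer.concat(chunks))); // binary-safe join
  });
  next();
}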
I'm trying to get some data from a website containing HTML with node.js. However, the response is garbled, such as ��ӭǰ���Ա���ʵ�����̣�ѡ��Apple.
Could anyone tell me what I can do to solve this?
Thank you very much!
This is what I have tried:
const dataLib = require('./data');

// websiteUrl stands in for the page URL (omitted in the original post)
dataLib.getUrl(websiteUrl, (buffer) => {
  console.log('done', buffer.toString('utf-8'));
}, () => {
  console.log('error');
});
Here is data.js:
const urlLib = require('url');

exports.getUrl = function (sUrl, success, error) {
  _req(sUrl);

  function _req(sUrl) {
    let obj = urlLib.parse(sUrl);
    let mod = null;
    if (obj.protocol == 'http:') {
      mod = require('http');
    } else {
      mod = require('https');
    }
    let req = mod.request({
      hostname: obj.hostname,
      path: obj.path
    }, res => {
      if (res.statusCode == 200) {
        let arr = [];
        res.on('data', (buffer) => {
          arr.push(buffer);
        });
        res.on('end', () => {
          let b = Buffer.concat(arr);
          success && success(b);
        });
      } else if (res.statusCode == 301 || res.statusCode == 302) {
        _req(res.headers['location']);
      } else {
        console.log(res.statusCode);
        error && error();
      }
    });
    req.on('error', (err) => {
      console.log(err);
      error && error(err);
    });
    req.end();
  }
};
It seems like you're getting the data but not decoding it correctly. Garbled characters like ��ӭǰ���Ա���ʵ�����̣�ѡ�� are typical of a charset mismatch.
Note that 'utf-8' and 'utf8' are equivalent encoding names in Node, so the hyphen is not the problem by itself. Replacement characters (�) mean the bytes are not valid UTF-8 at all: Chinese sites are often served as GBK, so check the Content-Type response header (or the page's <meta charset> tag) and decode the buffer with the page's actual charset.
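A minimal sketch of such a decode, assuming the page turns out to be GBK-encoded and the iconv-lite package is installed (websiteUrl is the same placeholder as above):

const iconv = require('iconv-lite');
const dataLib = require('./data');

dataLib.getUrl(websiteUrl, (buffer) => {
  // Decode the raw bytes with the page's actual charset instead of UTF-8
  console.log('done', iconv.decode(buffer, 'gbk'));
}, () => {
  console.log('error');
});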