Dealing with multiple encoding schemes while downloading the XML feed - javascript

I am trying to read the feed at the following URL:
http://www.chinanews.com/rss/scroll-news.xml
using request module. But I get stuff that has ���� ʷ����)������(�й�)���޹�.
On reviewing the XML I see that the encoding is being set as <?xml version="1.0" encoding="gb2312"?>
But on trying to set the encoding to gb2312, I get the unknown encoding error.
request({
url: "http://www.chinanews.com/rss/scroll-news.xml",
method: "GET",
headers: {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Host": "www.chinanews.com",
"Accept-Language": "en-GB,en-US;q=0.8,en;q=0.6"
},
"gzip": true,
// NOTE(review): this is the problem line -- the feed's bytes are GB2312,
// and forcing 'utf8' decodes them as UTF-8, producing the mojibake above.
"encoding": "utf8"
}, (err, resp, data) => {
console.log(data);
});
Is there a way I could get the data irrespective of the encoding it has? How should I approach this?

You missed the concept of character encoding.
var iconv = require('iconv-lite'), request = require('request');
request({
url: "http://www.chinanews.com/rss/scroll-news.xml",
method: "GET",
headers: {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Host": "www.chinanews.com",
"Accept-Language": "" // client accept language
},
gzip: true,
// Must be null so `body` arrives as a raw Buffer. Do NOT use 'ascii':
// that encoding strips the high bit of every byte, which irreversibly
// destroys the multi-byte GB2312 sequences.
encoding: null
}, (err, resp, body) => {
if (err) throw err;
// `body` is already a Buffer here, so hand it to iconv-lite directly --
// no Buffer.from(...) round-trip is needed (and one via 'ascii' would
// corrupt the data).
console.log(iconv.decode(body, 'gb2312'));
});
chunk is a Buffer instance in node.js. According to the official documentation, there are only
'ascii' - For 7-bit ASCII data only. This encoding is fast and will strip the high bit if set.
'utf8' - Multibyte encoded Unicode characters. Many web pages and other document formats use UTF-8.
'utf16le' - 2 or 4 bytes, little-endian encoded Unicode characters. Surrogate pairs (U+10000 to U+10FFFF) are supported.
'ucs2' - Alias of 'utf16le'.
'base64' - Base64 encoding. When creating a Buffer from a string, this encoding will also correctly accept "URL and Filename Safe Alphabet" as specified in RFC4648, Section 5.
'latin1' - A way of encoding the Buffer into a one-byte encoded string (as defined by the IANA in RFC1345, page 63, to be the Latin-1 supplement block and C0/C1 control codes).
'binary' - Alias for 'latin1'.
'hex' - Encode each byte as two hexadecimal characters.
currently supported by node.js include. To use the encodings not natively supported by node.js, use iconv, iconv-lite or other libraries to grab the character mapping table. This is very similar to this answer.
The Accept-Language implies the languages accepted by client. en-gb represents English (United Kingdom), but not Chinese. The Chinese one is zh-cn, zh, according to RFC 7231.

The tricky part is to pass encoding as null to get a Buffer instead of a string.
encoding - encoding to be used on setEncoding of response data.
If null, the body is returned as a Buffer.
—request
var request = require('request');
var legacy = require('legacy-encoding');

// encoding: null makes `body` a raw Buffer, so the GB2312 bytes reach
// legacy-encoding untouched and can be decoded explicitly.
var options = {
  method: 'GET',
  url: 'http://www.chinanews.com/rss/scroll-news.xml',
  encoding: null,
};

request(options, (error, response, body) => {
  console.log(legacy.decode(body, 'gb2312'));
});
Again, in the context of the follow-up question, "
Is there a way I could detect encoding?"
By "detect" I hope you mean, find the declaration. (…as opposed to guessing. If you have to guess then you have a failed communication.) The HTTP response header Content-Type is the primary way to communicate the encoding (if applicable to the MIME type). Some MIME types allow the encoding to be declared within the content, as servers quite rightly defer to that.
In the case of your RSS response. The server sends Content-Type:text/xml. which is without an encoding override. And the content's XML declaration is <?xml version="1.0" encoding="gb2312"?> The XML specification has procedures for finding such a declaration. It basically amounts to reading with different encodings until the XML declaration becomes intelligible, and then re-read with the declared encoding.
var request = require('request');
var legacy = require('legacy-encoding');
var convert = require('xml-js');
// specials listed here: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html
// NOTE: do not call .compile() on the literal -- RegExp.prototype.compile is
// deprecated, and invoking it with no arguments replaces the compiled pattern
// with an empty one, so the charset would never be captured.
var charsetFromContentTypeRegex = /charset=([^()<>#,;:\"/[\]?.=\s]*)/i;
var requestSettings = {
method: 'GET',
url: 'http://www.chinanews.com/rss/scroll-news.xml',
encoding: null, // return the body as a Buffer
};
request(requestSettings, function(error, response, body) {
// exec() returns null when there is no charset parameter (and the header
// itself may be absent), so guard before indexing into the match array.
var contentTypeMatch =
charsetFromContentTypeRegex.exec(response.headers['content-type'] || '');
var encodingFromHeader = contentTypeMatch ? contentTypeMatch[1] : null;
// First parse only to reach the XML declaration; its attributes are ASCII,
// so this works before the real encoding is known.
var doc = convert.xml2js(body);
var encoding = doc.declaration.attributes.encoding;
// A charset in the HTTP header takes precedence over the XML declaration.
doc = convert.xml2js(
legacy.decode(body, encodingFromHeader ? encodingFromHeader : encoding));
// xpath /rss/channel/title
console.log(doc.elements[1].elements[0].elements[0].elements[0].text);
});

Related

How can I send new line feed in a POST call using plain text in JavaScript ES6 / Prometheus

I am trying to send a payload using POST to a Prometheus gateway server.
This server expects the request to be text/plain and it must end with a new line feed character.
"Using the Prometheus text protocol, pushing metrics is so easy that no separate CLI is provided. Simply use a command-line HTTP tool like curl. Your favorite scripting language has most likely some built-in HTTP capabilities you can leverage here as well.
Note that in the text protocol, each line has to end with a line-feed character (aka 'LF' or '\n'). Ending a line in other ways, e.g. with 'CR' aka '\r', 'CRLF' aka '\r\n', or just the end of the packet, will result in a protocol error."
var payload = 'http_requests_total{method="post", code="200"} 1000 1295000000000'
// NOTE(review): `http.request{{ ... )}` is not valid JavaScript -- the call
// needs parentheses around a single options object: http.request({ ... }).
// Also 'http//myserver' is missing its colon ('http://myserver'), and
// 'Accept-Encoding' expects a compression scheme (e.g. gzip), not 'UTF-8'.
var apiRequest = http.request{{
'endpoint': 'http//myserver/api/metrics',
'method': 'POST',
'headers': {
'content-type': 'text/plain',
'Accept-Encoding': 'UTF-8'
}
)}
// NOTE(review): write() only queues data; nothing is sent until end() is
// called -- presumably why the server reports "unexpected end of input
// stream". Confirm by adding apiRequest.end().
var resp = apiRequest.write(payload);
If I send as is, I receive a 400 response saying "unexpected end of input stream"
This is because the payload does not end with a line feed character.
I have tried to add "\n" but this doesn't work.
var payload = 'http_requests_total{method="post", code="200"} 1000 1295000000000' + '\n'
Am I missing something fundamental? If I send a curl request it works! But I am limited to using JavaScript ES6.

Need to add value into my URL while doing HTTP post request using Google Cloud Function

I'm creating a OTP type of registration for my react native based mobile app. By using google cloud function to generate otp and post http request to my SMS provider.
The problem i am facing is that, whenever i try to add the random code to my sms provider url with ${code}, the message simply displays the same ${code} not the randomly generated code.
In other words, don't know how to interpolate the code into my url (as i am a newbie).
Here is my code for Random Number :
const code = Math.floor((Math.random() * 8999 + 1000));
And my request using request package is as follows:
const options = {
method: 'POST',
// NOTE(review): single quotes do not interpolate -- ${numbers} and ${code}
// are sent as literal text. Template literals need back-ticks (`...`).
uri: 'http://smsprovider.com/numbers=${numbers}&route=2&message=Your OTP is ${code}',
body: {
numbers: phone,
code: code
},
json: true
};
So, whenever I get a message, it says Your OTP is ${code}. But what I actually need is to show the random number generated by the math.floor function. Expected "Your OTP is 5748"
Kindly guide
For string interpolation with JavaScript be sure to use the
`
character instead of
'
Try this instead:
// Back-ticks make the uri a template literal, so ${numbers} and ${code}
// are interpolated instead of being sent verbatim.
const smsUri = `http://smsprovider.com/numbers=${numbers}&route=2&message=Your OTP is ${code}`;
const options = {
  method: 'POST',
  uri: smsUri,
  body: { numbers: phone, code },
  json: true
};
String interpolation and url encoding are two distinct paradigms, one doesn't replace the other.
string interpolation allows you to dynamically insert a variable's content into a string with ${}. For this to work you must enclose your string between back quotes as @Ben Beck instructed. Some interpreters will be forgiving, meaning that even if you use single quotes, the interpreter will nonetheless parse the string with the interpolation, however not all interpreters do that, and it is bad practice to rely on it. Make sure you format these correctly.
url component encoding converts the url parameters containing special characters into a valid uri component with encodeURIComponent(). This is how you get rid of spaces and other special characters, however it might not be needed here as most browsers do that for you. Use Chrome to be sure, but again it is good practice to write fully portable code, I recommend to encode any parameter featuring any special character.
The fact that your Postman test failed is most certainly due to a faulty request. Check this screenshot for a working Postman POST request based on your case, leveraging Pre-request Script.
While testing with your code directly (not through Postman), if you keep getting the literal ${code} in place of the actual value, it likely means that the definition const code = Math.floor((Math.random() * 8999 + 1000)) is not in the same scope as the interpolation call. Check below for an example of working script using both string interpolation and url encoding based on your case:
const request = require('request');
const code = Math.floor((Math.random() * 8999 + 1000));

// Interpolate first, then URL-encode the finished message so spaces and
// other special characters are safe inside the query string.
const message = encodeURIComponent(`Your OTP is ${code}`);
const options = {
  method: 'POST',
  url: `http://smsprovider.com/?message=${message}`,
  json: true
};

request(options, (error, response, body) => {
  if (!error && response.statusCode == 200) {
    console.log(body);
  } else {
    console.log(error);
  }
});
same but without url encoding and with the message parameter embedded in body element:
// Same request, but the message travels in the JSON body instead of the
// query string, so no URL encoding is required.
var uri = `http://smsprovider.com/`;
const options = {
  method: 'POST',
  url: uri,
  json: true,
  body: {
    message: `Your OTP is ${code}`,
  },
};

Google Drive API: Correct way to upload binary files via the Multipart API

I'm trying to upload a binary file to Google Drive via the
multipart upload API v3.
Here's the hex representation of the content of the file:
FF FE
For some reason the above content gets encoded as UTF-8 (I assume)
when I try to POST it, enclosed in a multipart payload:
--BOUNDARY
Content-Type: application/json
{"name": "F.ini"}
--BOUNDARY
Content-Type: application/octet-stream
ÿþ <-- in the outbound request, this gets UTF-8 encoded
--BOUNDARY--
Hex representation of the file that ultimately gets stored on server side:
C3 BF C3 BE
The problem only occurs in the sending stage:
if I check the length of the content read from the file I always get 2;
regardless of whether I use FileReader#readAsBinaryString or FileReader#readAsArrayBuffer
(producing a string with length 2, and an ArrayBuffer with byteLength 2, respectively).
Here's the minimal code that I'm using to generate the multipart payload:
file = picker.files[0]; // 'picker' is a file picker
reader = new FileReader();
reader.onload = function (e) {
content = e.target.result;
boundary = "BOUNDARY";
meta = '{"name": "' + file.name + '"}';
console.log(content.length); // gives 2 as expected
// The whole multipart body is assembled as one JavaScript string; joining
// the binary content into a DOMString is what later forces re-encoding.
payload = [
"--" + boundary, "Content-Type: application/json", "", meta, "", "--" + boundary,
"Content-Type: application/octet-stream", "", content, "--" + boundary + "--"
].join("\r\n");
console.log(payload.length); // say this gives n
xhr = new XMLHttpRequest();
xhr.open("POST", "/", false);
xhr.setRequestHeader("Content-Type", "multipart/related; boundary=" + boundary);
// XHR serializes a string payload as UTF-8, so each byte >= 0x80 (here
// FF and FE) becomes a two-byte sequence -- hence Content-Length n+2.
xhr.send(payload); // this produces a request with a 'Content-Length: n+2' header
// (corresponding to the length increase due to UTF-8 encoding)
};
reader.readAsBinaryString(file);
My question is twofold:
Is there a way to avoid this automatic UTF-8 encoding? (Probably not, because
this answer
implies that the UTF-8 encoding is part of the XHR spec.)
If not, what is the correct way to "inform" the Drive API that my file content is UTF-8 encoded?
I have tried these approaches, with no success:
appending ; charset=utf-8 or ; charset=UTF-8 to the binary part's Content-Type header
doing the same to the HTTP header on the parent request
(Content-Type: multipart/related; boundary=blablabla, charset=utf-8;
also tried replacing the comma with a semicolon)
I need the multipart API because AFAIU the "simple" API
does not allow me to upload into a folder
(it only accepts a filename as metadata, via the Slug HTTP header,
whereas the JSON metadata object in the multipart case allows a parent folder ID to be specified as well).
(Just thought of mentioning this because the "simple" API handles things correctly
when I directly POST the File (from the picker) or ArrayBuffer (from FileReader#readAsArrayBuffer) as the XHR's payload.)
I do not want to utilize any third-party libraries because
I want to keep things as light as possible, and
keeping aside reinventing-the-wheel and best-practices stuff, anything that is accomplished by a third party library should be doable via plain JS as well (this is just a fun exercise).
For the sake of completeness I tried uploading the same file via the GDrive web interface, and it got uploaded just fine;
however the web interface seems to base64-encode the payload, which I would rather like to avoid
(as it unnecessarily bloats up the payload, esp. for larger payloads which is my eventual goal).
How about this modification?
Modification points:
Used new FormData() for creating the multipart/form-data.
Used reader.readAsArrayBuffer(file) instead of reader.readAsBinaryString(file).
Send the file as a blob. In this case, the data is sent as application/octet-stream.
Modified script:
file = picker.files[0]; // 'picker' is a file picker
reader = new FileReader();
reader.onload = function (e) {
// NOTE(review): e.target.result (the ArrayBuffer) is never used -- the Blob
// is built straight from `file`, so the FileReader only delays the upload
// until the file is readable. Confirm whether it is actually needed.
var content = new Blob([file]);
var meta = {name: file.name, mimeType: file.type};
var accessToken = gapi.auth.getToken().access_token;
// FormData builds a multipart body in which Blob parts are sent as raw
// bytes, avoiding the string/UTF-8 re-encoding problem entirely.
var payload = new FormData();
payload.append('metadata', new Blob([JSON.stringify(meta)], {type: 'application/json'}));
payload.append('file', content);
xhr = new XMLHttpRequest();
xhr.open('post', 'https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart');
xhr.setRequestHeader('Authorization', 'Bearer ' + accessToken);
xhr.onload = function() {
console.log(xhr.response);
};
xhr.send(payload);
};
reader.readAsArrayBuffer(file);
Note:
In this modified script, I put the endpoint and the header including the access token. So please modify this for your environment.
In this case, I used a scope of https://www.googleapis.com/auth/drive.
Reference:
Using FormData Objects
In my environment, I could confirm that this script worked. But if this didn't work in your environment, I'm sorry.

How to remove invalid characters in an HTTP response header in JavaScript/node.js?

I have the following problem: When a client enters a certain URL it receives an .mp3 file back via sendFile() from Express. The name of the file is defined in the response header as follow:
var fileName = (JSON.stringify(data.videoTitle).replace(/["']/g, "") + fileType);
// NOTE(review): fileName can still contain characters outside the range a
// header value allows (e.g. 'ú'), which is what triggers the TypeError below.
headers: {
'x-timestamp': Date.now(),
'x-sent': true,
'Content-type': 'application/octet-stream',
'Content-Disposition': 'attachment; filename="' + fileName + '"'
}
The problem is that fileName is fetched from another website and I have no control over what the title of the file will be. So far it was no problem but it just happened to me that a file contained the character ú which lead to the following error:
throw new TypeError('The header content contains invalid characters');
This could potentially happen very often as there are many characters that the headers might not like. Is there any possibility that I only keep valid characters before setting the header? I assume a whitelist approach would be better than a blacklist approach as there are nearly infinite possibilities of invalid characters.
Thank you very much in advance
Here is an extended answer which basically tells you that only a subset of ASCII or ISO-8859-1 is allowed.
What character encoding should I use for a HTTP header?
And here is your solution: transliterate the filename into ASCII characters to make it safe for use in a header value:
https://www.npmjs.com/package/transliteration

Request returning unicode replacement character

Using the request module to load a webpage, I notice that for he UK pound symbol £ I sometimes get back the unicode replacement character \uFFFD.
An example URL that I'm parsing is this Amazon UK page: http://www.amazon.co.uk/gp/product/B00R3P1NSI/ref=s9_newr_gw_d38_g351_i2?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-2&pf_rd_r=0Q529EEEZWKPCVQBRHT9&pf_rd_t=101&pf_rd_p=455333147&pf_rd_i=468294
I'm also using the iconv-lite module to decode using the charset returned in the response header:
request(urlEntry.url, function(err, response, html) {
// Pull the charset parameter out of the Content-Type header, e.g.
// "text/html; charset=ISO-8859-1" -> "ISO-8859-1".
const contType = response.headers['content-type'];
const charset = contType.substring(contType.indexOf('charset=') + 8, contType.length);
// NOTE(review): without encoding: null in the request options, `html` has
// already been decoded to a (UTF-8) string, so decoding again here cannot
// recover the replacement characters.
const encBody = iconv.decode(html, charset);
...
But this doesn't seem to be helping. I've also tried decoding the response HTML as UTF-8.
How can I avoid this Unicode replacement char?
Firstly, the Amazon webpage is encoded in ISO-8859-1, not UTF-8. This is what causes the Unicode replacement character. You can check this in the response headers. I used curl -i.
Secondly, the README for requests says:
encoding - Encoding to be used on setEncoding of response data. If
null, the body is returned as a Buffer. Anything else (including the
default value of undefined) will be passed as the encoding parameter
to toString() (meaning this is effectively utf8 by default).
It is UTF-8 by default... and (after a little experimentation) we find that it sadly it doesn't support ISO-8859-1. However, if we set the encoding to null we can then decode the resulting Buffer using iconv-lite.
Here is a sample program.
var request = require('request');
var iconvlite = require('iconv-lite');

var url = "http://www.amazon.co.uk/gp/product/B00R3P1NSI/ref=s9_newr_gw_d38_g351_i2?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-2&pf_rd_r=0Q529EEEZWKPCVQBRHT9&pf_rd_t=101&pf_rd_p=455333147&pf_rd_i=468294";

// encoding: null keeps the body as a Buffer so the ISO-8859-1 bytes can be
// decoded explicitly with iconv-lite instead of request's UTF-8 default.
request({url: url, encoding: null}, function (error, response, body) {
  if (error || response.statusCode != 200) {
    return;
  }
  console.log(iconvlite.decode(body, 'ISO-8859-1'));
});
This question is somewhat related, and I used it whilst figuring this out:
http.get and ISO-8859-1 encoded responses

Categories

Resources