From 5399cb751765f7c81ab5d4fab33f873eb3866f61 Mon Sep 17 00:00:00 2001
From: Justin Wind
Date: Wed, 11 Aug 2021 12:00:20 -0700
Subject: [PATCH] parse topic content-types to recode content with non-utf8
charsets
Parse the entire content-type header, to make use of any charset
parameter, before parsing content for links.
---
CHANGELOG.md | 8 ++++
package-lock.json | 5 +++
package.json | 1 +
src/enum.js | 1 +
src/link-helper.js | 74 +++++++++++++++++++++++++---------
test/src/link-helper.js | 66 ++++++++++++++++++++++++++++++
test/test-data/link-helper.js | 75 +++++++++++++++++++++++++++++------
7 files changed, 198 insertions(+), 32 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index aaa0b4f..0a7d491 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,14 @@ Releases and notable changes to this project are documented here.
## [Unreleased]
+### Added
+
+- Make use of the content-type charset when parsing topic content, recoding to UTF-8 when needed.
+
+### Fixed
+
+- Handle the case where the feed parser returns a non-list for a single link entry.
+
## [v1.1.1] - 2021-08-09
### Fixed
diff --git a/package-lock.json b/package-lock.json
index 4a11338..f297d8b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1783,6 +1783,11 @@
"debug": "4"
}
},
+ "iconv": {
+ "version": "3.0.0",
+ "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz",
+ "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g=="
+ },
"ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
diff --git a/package.json b/package.json
index f1fe201..7571ca9 100644
--- a/package.json
+++ b/package.json
@@ -39,6 +39,7 @@
"better-sqlite3": "^7.4.3",
"feedparser": "^2.2.10",
"htmlparser2": "^6.1.0",
+ "iconv": "^3.0.0",
"pg-promise": "^10.11.0"
},
"devDependencies": {
diff --git a/src/enum.js b/src/enum.js
index 8752bfb..96f0940 100644
--- a/src/enum.js
+++ b/src/enum.js
@@ -27,6 +27,7 @@ const Enum = common.mergeDeep(DingusEnum, {
ContentType: {
ApplicationAtom: 'application/atom+xml',
+ ApplicationOctetStream: 'application/octet-stream',
ApplicationRDF: 'application/rdf+xml',
ApplicationRSS: 'application/rss+xml',
ApplicationXML: 'application/xml',
diff --git a/src/link-helper.js b/src/link-helper.js
index 5c6b839..2b6a833 100644
--- a/src/link-helper.js
+++ b/src/link-helper.js
@@ -11,6 +11,7 @@ const Enum = require('./enum');
const FeedParser = require('feedparser');
const { Readable } = require('stream');
const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
const _fileScope = common.fileScope(__filename);
@@ -45,6 +46,7 @@ class LinkHelper {
try {
links.push(...parseLinkHeader(linkHeader));
} catch (e) {
+ /* istanbul ignore else */
if (e instanceof ParseSyntaxError) {
this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
} else {
@@ -52,29 +54,39 @@ class LinkHelper {
}
}
}
- const contentType = getHeader(headers, Enum.Header.ContentType);
- if (contentType) {
- const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
- let bodyLinks = [];
- switch (contentTypeBase) {
- case Enum.ContentType.ApplicationAtom:
- case Enum.ContentType.ApplicationRDF:
- case Enum.ContentType.ApplicationRSS:
- case Enum.ContentType.ApplicationXML:
- case Enum.ContentType.TextXML: {
- bodyLinks = await this.linksFromFeedBody(url, body);
- break;
- }
- case Enum.ContentType.TextHTML:
- bodyLinks = this.linksFromHTMLBody(body);
- break;
+ const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+ const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+ if (nonUTF8Charset) {
+ const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+ try {
+ body = iconv.convert(body);
+ } catch (e) {
+ /* istanbul ignore next */
+ this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+ // But try to carry on, anyhow.
+ }
+ }
- default:
- this.logger.debug(_scope, 'no parser for content type', { contentType });
+ let bodyLinks = [];
+ switch (contentType.mediaType) {
+ case Enum.ContentType.ApplicationAtom:
+ case Enum.ContentType.ApplicationRDF:
+ case Enum.ContentType.ApplicationRSS:
+ case Enum.ContentType.ApplicationXML:
+ case Enum.ContentType.TextXML: {
+ bodyLinks = await this.linksFromFeedBody(url, body);
+ break;
}
- links.push(...bodyLinks);
+
+ case Enum.ContentType.TextHTML:
+ bodyLinks = this.linksFromHTMLBody(body);
+ break;
+
+ default:
+ this.logger.debug(_scope, 'no parser for content type', { contentType });
}
+ links.push(...bodyLinks);
// Fetch all hub relation targets from headers, resolving relative URIs.
const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
@@ -85,6 +97,30 @@ class LinkHelper {
}
+  /**
+   * Convert a Content-Type string to normalized components, per RFC7231 §3.1.1.
+   * Nameless parameters are skipped; a parameter lacking a value maps to ''.
+   * N.B. not a real parser: a ; within a quoted-string value is still mis-split.
+   * @param {String} contentTypeHeader
+   * @returns {{mediaType: String, params: Object}} mediaType is lower-cased, defaulting to application/octet-stream
+   */
+  static parseContentType(contentTypeHeader) {
+    const [ mediaType, ...params ] = (contentTypeHeader || '').split(';').map((part) => part.trim());
+    return {
+      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+      params: params.reduce((obj, param) => {
+        const [field, ...rest] = param.split('='); // Only the first = separates name from value.
+        const value = rest.join('=').trim();
+        const isQuoted = value.length > 1 && value.startsWith('"') && value.endsWith('"');
+        if (field.trim()) { // A stray or trailing ; yields an empty parameter, which is dropped.
+          obj[field.trim().toLowerCase()] = isQuoted ? value.slice(1, -1) : value;
+        }
+        return obj;
+      }, {}),
+    };
+  }
+
+
/**
* Parse XML-ish feed content, extracting link elements into our own format.
* @param {String} feedurl
diff --git a/test/src/link-helper.js b/test/src/link-helper.js
index 535d4c9..1f91493 100644
--- a/test/src/link-helper.js
+++ b/test/src/link-helper.js
@@ -63,6 +63,15 @@ describe('LinkHelper', function () {
const result = await lh.validHub(url, headers, body);
assert.strictEqual(result, expected);
});
+ it('covers link in HTML body with charset translation', async function () {
+ headers = {
+ 'content-type': 'text/html; charset=ASCII',
+ };
+ body = '';
+ const expected = true;
+ const result = await lh.validHub(url, headers, body);
+ assert.strictEqual(result, expected);
+ });
it('covers parser failure', async function () {
headers = {
link: 'Invalid Link Header',
@@ -78,6 +87,46 @@ describe('LinkHelper', function () {
});
}); // validHub
+ describe('parseContentType', function () {
+ it('handles no data', function () {
+ const expected = {
+ mediaType: 'application/octet-stream',
+ params: {},
+ };
+ const result = LinkHelper.parseContentType();
+ assert.deepStrictEqual(result, expected);
+ });
+ it('handles only media type', function () {
+ const expected = {
+ mediaType: 'application/json',
+ params: {},
+ };
+ const result = LinkHelper.parseContentType('application/json');
+ assert.deepStrictEqual(result, expected);
+ });
+ it('handles parameters', function () {
+ const expected = {
+ mediaType: 'text/html',
+ params: {
+ charset: 'ISO-8859-4',
+ },
+ };
+ const result = LinkHelper.parseContentType('text/html; charset=ISO-8859-4');
+ assert.deepStrictEqual(result, expected);
+ });
+ it('handles more parameters', function () {
+ const expected = {
+ mediaType: 'multipart/form-data',
+ params: {
+ boundary: '--123--',
+ other: 'foo',
+ },
+ };
+ const result = LinkHelper.parseContentType('multipart/form-data; boundary="--123--"; other=foo');
+ assert.deepStrictEqual(result, expected);
+ });
+ }); // parseContentType
+
describe('absoluteURI', function () {
it('success', function () {
const uri = '../rel';
@@ -136,6 +185,23 @@ describe('LinkHelper', function () {
it('parses rss', async function () {
const feedData = testData.rssFeedBody;
const feedUrl = testData.rssFeedUrl;
+ const expected = [
+ {
+ attributes: [
+ {
+ name: 'rel',
+ value: 'hub',
+ },
+ ],
+ target: 'https://hub.squeep.com/',
+ },
+ ];
+ const result = await lh.linksFromFeedBody(feedUrl, feedData);
+ assert.deepStrictEqual(result, expected);
+ });
+ it('parses more rss', async function () {
+ const feedData = testData.rssFeedBody2;
+ const feedUrl = testData.rssFeedUrl2;
const expected = [
{
attributes: [
diff --git a/test/test-data/link-helper.js b/test/test-data/link-helper.js
index f402915..818d5b4 100644
--- a/test/test-data/link-helper.js
+++ b/test/test-data/link-helper.js
@@ -1,7 +1,60 @@
'use strict';
-const rssFeedUrl = 'https://puppetcircuits.wordpress.com/feed/';
-const rssFeedBody = `
+
+
+ Things To Share
+ https://squeep.com/share
+
+ Miscellaneous contents what interest some harmless mammal.
+
+ https://squeep.com/share/img/ratmap-128.png
+ 128128
+
+ Tue, 10 Aug 2021 23:21:46 GMT
+ Blogofile
+ weekly
+ 1
+ -
+ arts I shall never be able to collect
+ https://squeep.com/share/2015/08/00000011
+ Thu, 20 Aug 2015 20:42:37 PDT
+
+ 00000011
+ arts I shall never be able to collect
+
+Beth Cavener does phenomenal work.
+This is the piece I'd most want for my wall. I could spend a lifetime appreciating those expressions.
+
+
+]]>
+
+
+`;
+
+const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/';
+const rssFeedBody2 = `https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/
0
-
-
raphaelabrams
@@ -75,8 +126,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/
0
-
-
raphaelabrams
@@ -100,8 +149,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/
1
-
-
raphaelabrams
@@ -170,10 +217,12 @@ const htmlBody = `
`;
module.exports = {
- atomFeedBody,
- atomFeedUrl,
- htmlBody,
- htmlUrl,
+ atomFeedBody,
+ atomFeedUrl,
+ htmlBody,
+ htmlUrl,
rssFeedBody,
- rssFeedUrl,
+ rssFeedUrl,
+ rssFeedBody2,
+ rssFeedUrl2,
};
--
2.45.2