From 5399cb751765f7c81ab5d4fab33f873eb3866f61 Mon Sep 17 00:00:00 2001 From: Justin Wind Date: Wed, 11 Aug 2021 12:00:20 -0700 Subject: [PATCH] parse topic content-types to recode content with non-utf8 charsets Parse the entire content-type header, to make use of any charset parameter, before parsing content for links. --- CHANGELOG.md | 8 ++++ package-lock.json | 5 +++ package.json | 1 + src/enum.js | 1 + src/link-helper.js | 74 +++++++++++++++++++++++++--------- test/src/link-helper.js | 66 ++++++++++++++++++++++++++++++ test/test-data/link-helper.js | 75 +++++++++++++++++++++++++++++------ 7 files changed, 198 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aaa0b4f..0a7d491 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ Releases and notable changes to this project are documented here. ## [Unreleased] +### Added + +- Make use of the content-type charset when parsing topic content, recoding to UTF-8 when needed. + +### Fixed + +- Feed parser could return a non-list for a single link entry; handle that case. 
+ ## [v1.1.1] - 2021-08-09 ### Fixed diff --git a/package-lock.json b/package-lock.json index 4a11338..f297d8b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1783,6 +1783,11 @@ "debug": "4" } }, + "iconv": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz", + "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g==" + }, "ieee754": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", diff --git a/package.json b/package.json index f1fe201..7571ca9 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,7 @@ "better-sqlite3": "^7.4.3", "feedparser": "^2.2.10", "htmlparser2": "^6.1.0", + "iconv": "^3.0.0", "pg-promise": "^10.11.0" }, "devDependencies": { diff --git a/src/enum.js b/src/enum.js index 8752bfb..96f0940 100644 --- a/src/enum.js +++ b/src/enum.js @@ -27,6 +27,7 @@ const Enum = common.mergeDeep(DingusEnum, { ContentType: { ApplicationAtom: 'application/atom+xml', + ApplicationOctetStream: 'application/octet-stream', ApplicationRDF: 'application/rdf+xml', ApplicationRSS: 'application/rss+xml', ApplicationXML: 'application/xml', diff --git a/src/link-helper.js b/src/link-helper.js index 5c6b839..2b6a833 100644 --- a/src/link-helper.js +++ b/src/link-helper.js @@ -11,6 +11,7 @@ const Enum = require('./enum'); const FeedParser = require('feedparser'); const { Readable } = require('stream'); const htmlparser2 = require('htmlparser2'); +const { Iconv } = require('iconv'); const _fileScope = common.fileScope(__filename); @@ -45,6 +46,7 @@ class LinkHelper { try { links.push(...parseLinkHeader(linkHeader)); } catch (e) { + /* istanbul ignore else */ if (e instanceof ParseSyntaxError) { this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader }); } else { @@ -52,29 +54,39 @@ class LinkHelper { } } } - const contentType = getHeader(headers, Enum.Header.ContentType); - if (contentType) { - 
const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/); - let bodyLinks = []; - switch (contentTypeBase) { - case Enum.ContentType.ApplicationAtom: - case Enum.ContentType.ApplicationRDF: - case Enum.ContentType.ApplicationRSS: - case Enum.ContentType.ApplicationXML: - case Enum.ContentType.TextXML: { - bodyLinks = await this.linksFromFeedBody(url, body); - break; - } - case Enum.ContentType.TextHTML: - bodyLinks = this.linksFromHTMLBody(body); - break; + const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType)); + const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset; + if (nonUTF8Charset) { + const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore'); + try { + body = iconv.convert(body); + } catch (e) { + /* istanbul ignore next */ + this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url }); + // But try to carry on, anyhow. + } + } - default: - this.logger.debug(_scope, 'no parser for content type', { contentType }); + let bodyLinks = []; + switch (contentType.mediaType) { + case Enum.ContentType.ApplicationAtom: + case Enum.ContentType.ApplicationRDF: + case Enum.ContentType.ApplicationRSS: + case Enum.ContentType.ApplicationXML: + case Enum.ContentType.TextXML: { + bodyLinks = await this.linksFromFeedBody(url, body); + break; } - links.push(...bodyLinks); + + case Enum.ContentType.TextHTML: + bodyLinks = this.linksFromHTMLBody(body); + break; + + default: + this.logger.debug(_scope, 'no parser for content type', { contentType }); } + links.push(...bodyLinks); // Fetch all hub relation targets from headers, resolving relative URIs. const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url)); @@ -85,6 +97,30 @@ class LinkHelper { } + /** + * Convert a Content-Type string to normalized components. + * RFC7231 §3.1.1 + * N.B. 
this non-parser implementation will not work if a parameter + * value for some reason includes a ; or = within a quoted-string. + * @param {String} contentTypeHeader + * @returns {Object} contentType + * @returns {String} contentType.mediaType + * @returns {Object} contentType.params + */ + static parseContentType(contentTypeHeader) { + const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */); + return { + mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream, + params: params.reduce((obj, param) => { + const [field, value] = param.split('='); + const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"'; + obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value; + return obj; + }, {}), + }; + } + + /** * Parse XML-ish feed content, extracting link elements into our own format. * @param {String} feedurl diff --git a/test/src/link-helper.js b/test/src/link-helper.js index 535d4c9..1f91493 100644 --- a/test/src/link-helper.js +++ b/test/src/link-helper.js @@ -63,6 +63,15 @@ describe('LinkHelper', function () { const result = await lh.validHub(url, headers, body); assert.strictEqual(result, expected); }); + it('covers link in HTML body with charset translation', async function () { + headers = { + 'content-type': 'text/html; charset=ASCII', + }; + body = ''; + const expected = true; + const result = await lh.validHub(url, headers, body); + assert.strictEqual(result, expected); + }); it('covers parser failure', async function () { headers = { link: 'Invalid Link Header', @@ -78,6 +87,46 @@ describe('LinkHelper', function () { }); }); // validHub + describe('parseContentType', function () { + it('handles no data', function () { + const expected = { + mediaType: 'application/octet-stream', + params: {}, + }; + const result = LinkHelper.parseContentType(); + assert.deepStrictEqual(result, expected); + }); + it('handles only media type', function () { + const expected = { + mediaType: 
'application/json', + params: {}, + }; + const result = LinkHelper.parseContentType('application/json'); + assert.deepStrictEqual(result, expected); + }); + it('handles parameters', function () { + const expected = { + mediaType: 'text/html', + params: { + charset: 'ISO-8859-4', + }, + }; + const result = LinkHelper.parseContentType('text/html; charset=ISO-8859-4'); + assert.deepStrictEqual(result, expected); + }); + it('handles more parameters', function () { + const expected = { + mediaType: 'multipart/form-data', + params: { + boundary: '--123--', + other: 'foo', + }, + }; + const result = LinkHelper.parseContentType('multipart/form-data; boundary="--123--"; other=foo'); + assert.deepStrictEqual(result, expected); + }); + }); // parseContentType + describe('absoluteURI', function () { it('success', function () { const uri = '../rel'; @@ -136,6 +185,23 @@ describe('LinkHelper', function () { it('parses rss', async function () { const feedData = testData.rssFeedBody; const feedUrl = testData.rssFeedUrl; + const expected = [ + { + attributes: [ + { + name: 'rel', + value: 'hub', + }, + ], + target: 'https://hub.squeep.com/', + }, + ]; + const result = await lh.linksFromFeedBody(feedUrl, feedData); + assert.deepStrictEqual(result, expected); + }); + it('parses more rss', async function () { + const feedData = testData.rssFeedBody2; + const feedUrl = testData.rssFeedUrl2; const expected = [ { attributes: [ diff --git a/test/test-data/link-helper.js b/test/test-data/link-helper.js index f402915..818d5b4 100644 --- a/test/test-data/link-helper.js +++ b/test/test-data/link-helper.js @@ -1,7 +1,60 @@ 'use strict'; -const rssFeedUrl = 'https://puppetcircuits.wordpress.com/feed/'; -const rssFeedBody = ` + + + Things To Share + https://squeep.com/share + + Miscellaneous contents what interest some harmless mammal. 
+ + https://squeep.com/share/img/ratmap-128.png + 128128 + + Tue, 10 Aug 2021 23:21:46 GMT + Blogofile + weekly + 1 + + arts I shall never be able to collect + https://squeep.com/share/2015/08/00000011 + Thu, 20 Aug 2015 20:42:37 PDT + + 00000011 + arts I shall never be able to collect + +Beth Cavener does phenomenal work. +This is the piece I'd most want for my wall. I could spend a lifetime appreciating those expressions. +

+ +]]>
+
+
+
`; + +const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/'; +const rssFeedBody2 = `https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/ 0 - - raphaelabrams @@ -75,8 +126,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/ 0 - - raphaelabrams @@ -100,8 +149,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/ 1 - - raphaelabrams @@ -170,10 +217,12 @@ const htmlBody = `
`; module.exports = { - atomFeedBody, - atomFeedUrl, - htmlBody, - htmlUrl, + atomFeedBody, + atomFeedUrl, + htmlBody, + htmlUrl, rssFeedBody, - rssFeedUrl, + rssFeedUrl, + rssFeedBody2, + rssFeedUrl2, }; -- 2.43.2