From: Justin Wind <justin.wind+git@gmail.com>
Date: Wed, 11 Aug 2021 19:00:20 +0000 (-0700)
Subject: parse topic content-types to recode content with non-utf8 charsets
X-Git-Tag: v1.1.2^2~2
X-Git-Url: https://git.squeep.com/?a=commitdiff_plain;h=5399cb751765f7c81ab5d4fab33f873eb3866f61;p=websub-hub

parse topic content-types to recode content with non-utf8 charsets

Parse the entire content-type header, to make use of any charset
parameter, before parsing content for links.
---

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aaa0b4f..0a7d491 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,14 @@ Releases and notable changes to this project are documented here.
 
 ## [Unreleased]
 
+### Added
+
+- Make use of the content-type charset when parsing topic content, recoding to UTF-8 when needed.
+
+### Fixed
+
+- Feed parser could return a non-list for a single link entry; handle that case.
+
 ## [v1.1.1] - 2021-08-09
 
 ### Fixed
diff --git a/package-lock.json b/package-lock.json
index 4a11338..f297d8b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1783,6 +1783,11 @@
         "debug": "4"
       }
     },
+    "iconv": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz",
+      "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g=="
+    },
     "ieee754": {
       "version": "1.2.1",
       "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
diff --git a/package.json b/package.json
index f1fe201..7571ca9 100644
--- a/package.json
+++ b/package.json
@@ -39,6 +39,7 @@
     "better-sqlite3": "^7.4.3",
     "feedparser": "^2.2.10",
     "htmlparser2": "^6.1.0",
+    "iconv": "^3.0.0",
     "pg-promise": "^10.11.0"
   },
   "devDependencies": {
diff --git a/src/enum.js b/src/enum.js
index 8752bfb..96f0940 100644
--- a/src/enum.js
+++ b/src/enum.js
@@ -27,6 +27,7 @@ const Enum = common.mergeDeep(DingusEnum, {
 
   ContentType: {
     ApplicationAtom: 'application/atom+xml',
+    ApplicationOctetStream: 'application/octet-stream',
     ApplicationRDF: 'application/rdf+xml',
     ApplicationRSS: 'application/rss+xml',
     ApplicationXML: 'application/xml',
diff --git a/src/link-helper.js b/src/link-helper.js
index 5c6b839..2b6a833 100644
--- a/src/link-helper.js
+++ b/src/link-helper.js
@@ -11,6 +11,7 @@ const Enum = require('./enum');
 const FeedParser = require('feedparser');
 const { Readable } = require('stream');
 const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
 
 const _fileScope = common.fileScope(__filename);
 
@@ -45,6 +46,7 @@ class LinkHelper {
       try {
         links.push(...parseLinkHeader(linkHeader));
       } catch (e) {
+        /* istanbul ignore else */
         if (e instanceof ParseSyntaxError) {
           this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
         } else {
@@ -52,29 +54,39 @@ class LinkHelper {
         }
       }
     }
-    const contentType = getHeader(headers, Enum.Header.ContentType);
-    if (contentType) {
-      const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
-      let bodyLinks = [];
-      switch (contentTypeBase) {
-        case Enum.ContentType.ApplicationAtom:
-        case Enum.ContentType.ApplicationRDF:
-        case Enum.ContentType.ApplicationRSS:
-        case Enum.ContentType.ApplicationXML:
-        case Enum.ContentType.TextXML: {
-          bodyLinks = await this.linksFromFeedBody(url, body);
-          break;
-        }
 
-        case Enum.ContentType.TextHTML:
-          bodyLinks = this.linksFromHTMLBody(body);
-          break;
+    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+    const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+    if (nonUTF8Charset) {
+      const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+      try {
+        body = iconv.convert(body);
+      } catch (e) {
+        /* istanbul ignore next */
+        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+        // But try to carry on, anyhow.
+      }
+    }
 
-        default:
-          this.logger.debug(_scope, 'no parser for content type', { contentType });
+    let bodyLinks = [];
+    switch (contentType.mediaType) {
+      case Enum.ContentType.ApplicationAtom:
+      case Enum.ContentType.ApplicationRDF:
+      case Enum.ContentType.ApplicationRSS:
+      case Enum.ContentType.ApplicationXML:
+      case Enum.ContentType.TextXML: {
+        bodyLinks = await this.linksFromFeedBody(url, body);
+        break;
       }
-      links.push(...bodyLinks);
+
+      case Enum.ContentType.TextHTML:
+        bodyLinks = this.linksFromHTMLBody(body);
+        break;
+
+      default:
+        this.logger.debug(_scope, 'no parser for content type', { contentType });
     }
+    links.push(...bodyLinks);
 
     // Fetch all hub relation targets from headers, resolving relative URIs.
     const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
@@ -85,6 +97,30 @@ class LinkHelper {
   }
 
 
+  /**
+   * Convert a Content-Type string to normalized components.
+   * RFC7231 §3.1.1
+   * N.B. this non-parser implementation will not work if a parameter
+   * value for some reason includes a ; or = within a quoted-string.
+   * @param {String} contentTypeHeader
+   * @returns {Object} contentType
+   * @returns {String} contentType.mediaType
+   * @returns {Object} contentType.params
+   */
+  static parseContentType(contentTypeHeader) {
+    const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
+    return {
+      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+      params: params.reduce((obj, param) => {
+        const [field, value] = param.split('=');
+        const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"';
+        obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value;
+        return obj;
+      }, {}),
+    };
+  }
+
+
   /**
    * Parse XML-ish feed content, extracting link elements into our own format.
    * @param {String} feedurl
diff --git a/test/src/link-helper.js b/test/src/link-helper.js
index 535d4c9..1f91493 100644
--- a/test/src/link-helper.js
+++ b/test/src/link-helper.js
@@ -63,6 +63,15 @@ describe('LinkHelper', function () {
       const result = await lh.validHub(url, headers, body);
       assert.strictEqual(result, expected);
     });
+    it('covers link in HTML body with charset translation', async function () {
+      headers = {
+        'content-type': 'text/html; charset=ASCII',
+      };
+      body = '<html><head><link rel="hub" href="https://example.com/hub/"></head></html>';
+      const expected = true;
+      const result = await lh.validHub(url, headers, body);
+      assert.strictEqual(result, expected);
+    });
     it('covers parser failure', async function () {
       headers = {
         link: 'Invalid Link Header',
@@ -78,6 +87,46 @@ describe('LinkHelper', function () {
     });
   }); // validHub
 
+  describe('parseContentType', function () {
+    it('handles no data', function () {
+      const expected = {
+        mediaType: 'application/octet-stream',
+        params: {},
+      };
+      const result = LinkHelper.parseContentType();
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles only media type', function () {
+      const expected = {
+        mediaType: 'application/json',
+        params: {},
+      };
+      const result = LinkHelper.parseContentType('application/json');
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles parameters', function () {
+      const expected = {
+        mediaType: 'text/html',
+        params: {
+          charset: 'ISO-8859-4',
+        },
+      };
+      const result = LinkHelper.parseContentType('text/html; charset=ISO-8859-4');
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles more parameters', function () {
+      const expected = {
+        mediaType: 'multipart/form-data',
+        params: {
+          boundary: '--123--',
+          other: 'foo',
+        },
+      };
+      const result = LinkHelper.parseContentType('multipart/form-data; boundary="--123--"; other=foo');
+      assert.deepStrictEqual(result, expected);
+    });
+  }); // parseContentType
+
   describe('absoluteURI', function () {
     it('success', function () {
       const uri = '../rel';
@@ -136,6 +185,23 @@ describe('LinkHelper', function () {
     it('parses rss', async function () {
       const feedData = testData.rssFeedBody;
       const feedUrl = testData.rssFeedUrl;
+      const expected = [
+        {
+          attributes: [
+            {
+              name: 'rel',
+              value: 'hub',
+            },
+          ],
+          target: 'https://hub.squeep.com/',
+        },
+      ];
+      const result = await lh.linksFromFeedBody(feedUrl, feedData);
+      assert.deepStrictEqual(result, expected);
+    });
+    it('parses more rss', async function () {
+      const feedData = testData.rssFeedBody2;
+      const feedUrl = testData.rssFeedUrl2;
       const expected = [
         {
           attributes: [
diff --git a/test/test-data/link-helper.js b/test/test-data/link-helper.js
index f402915..818d5b4 100644
--- a/test/test-data/link-helper.js
+++ b/test/test-data/link-helper.js
@@ -1,7 +1,60 @@
 'use strict';
 
-const rssFeedUrl = 'https://puppetcircuits.wordpress.com/feed/';
-const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
+const rssFeedUrl = 'https://squeep.com/share/';
+const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/"
+     xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+     xmlns:atom="http://www.w3.org/2005/Atom"
+     xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:wfw="http://wellformedweb.org/CommentAPI/" >
+	<channel>
+		<title>Things To Share</title>
+		<link>https://squeep.com/share</link>
+		<atom:link rel="hub" href="https://hub.squeep.com/" />
+		<description>Miscellaneous contents what interest some harmless mammal.</description>
+		<image>
+			<url>https://squeep.com/share/img/ratmap-128.png</url>
+			<width>128</width><height>128</height>
+		</image>
+		<pubDate>Tue, 10 Aug 2021 23:21:46 GMT</pubDate>
+		<generator>Blogofile</generator>
+		<sy:updatePeriod>weekly</sy:updatePeriod>
+		<sy:updateFrequency>1</sy:updateFrequency>
+		<item>
+			<title>arts I shall never be able to collect</title>
+			<link>https://squeep.com/share/2015/08/00000011</link>
+			<pubDate>Thu, 20 Aug 2015 20:42:37 PDT</pubDate>
+			<category><![CDATA[art]]></category>
+			<guid isPermaLink="false">00000011</guid>
+			<description>arts I shall never be able to collect</description>
+			<content:encoded><![CDATA[
+<p>
+<a href="http://www.followtheblackrabbit.com/">Beth Cavener</a> does phenomenal work.
+This is the piece I'd most want for my wall.  I could spend a lifetime appreciating those expressions.
+</p>
+<div>
+	<a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="1/2">
+		<img src="https://squeep.com/share/assets/2b7354e42e91cd42e161ad90243c9d6ffa1deba1-0000"
+			style="display:inline; height:40ex;"
+			alt="1/2"
+		/>
+	</a>
+
+	<a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="2/2">
+		<img src="https://squeep.com/share/assets/470998dab71830e0dde4cd20d17a5c96ceb19278-0000"
+			style="display:inline; height:40ex;"
+			alt="2/2"
+		/>
+	</a>
+</div>
+]]></content:encoded>
+		</item>
+	</channel>
+</rss>`;
+
+const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/';
+const rssFeedBody2 = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
 	xmlns:content="http://purl.org/rss/1.0/modules/content/"
 	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
 	xmlns:dc="http://purl.org/dc/elements/1.1/"
@@ -50,8 +103,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
 					<wfw:commentRss>https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/</wfw:commentRss>
 			<slash:comments>0</slash:comments>
 
-
-
 		<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
 			<media:title type="html">raphaelabrams</media:title>
 		</media:content>
@@ -75,8 +126,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
 					<wfw:commentRss>https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/</wfw:commentRss>
 			<slash:comments>0</slash:comments>
 
-
-
 		<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
 			<media:title type="html">raphaelabrams</media:title>
 		</media:content>
@@ -100,8 +149,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
 					<wfw:commentRss>https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/</wfw:commentRss>
 			<slash:comments>1</slash:comments>
 
-
-
 		<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
 			<media:title type="html">raphaelabrams</media:title>
 		</media:content>
@@ -170,10 +217,12 @@ const htmlBody = `<?xml version="1.0" encoding="UTF-8" ?>
 <hr />`;
 
 module.exports = {
-	atomFeedBody,
-	atomFeedUrl,
-	htmlBody,
-	htmlUrl,
+  atomFeedBody,
+  atomFeedUrl,
+  htmlBody,
+  htmlUrl,
   rssFeedBody,
-	rssFeedUrl,
+  rssFeedUrl,
+  rssFeedBody2,
+  rssFeedUrl2,
 };