parse topic content-types to recode content with non-utf8 charsets
author Justin Wind <justin.wind+git@gmail.com>
Wed, 11 Aug 2021 19:00:20 +0000 (12:00 -0700)
committer Justin Wind <justin.wind+git@gmail.com>
Wed, 11 Aug 2021 19:05:16 +0000 (12:05 -0700)
Parse the entire content-type header, to make use of any charset
parameter, before parsing content for links.

CHANGELOG.md
package-lock.json
package.json
src/enum.js
src/link-helper.js
test/src/link-helper.js
test/test-data/link-helper.js

index aaa0b4fa5d4864eab46723c38bc00dfe2986dfc5..0a7d49117f0a521ab287f126256bb134aee86038 100644 (file)
@@ -4,6 +4,14 @@ Releases and notable changes to this project are documented here.
 
 ## [Unreleased]
 
+### Added
+
+- Make use of the content-type charset when parsing topic content, recoding to UTF8 when needed.
+
+### Fixed
+
+- Feed parser could return a non-list for a single link entry, handle that case.
+
 ## [v1.1.1] - 2021-08-09
 
 ### Fixed
index 4a113388f2d35e1091bb99950f8c91299de372dc..f297d8b50b3bb6427b0915e5e891037944e628b5 100644 (file)
         "debug": "4"
       }
     },
+    "iconv": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz",
+      "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g=="
+    },
     "ieee754": {
       "version": "1.2.1",
       "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
index f1fe201d950858fdb9d7aad31fc45d9e682e67ae..7571ca9980f106b77d45c60db49945b99a0e385e 100644 (file)
@@ -39,6 +39,7 @@
     "better-sqlite3": "^7.4.3",
     "feedparser": "^2.2.10",
     "htmlparser2": "^6.1.0",
+    "iconv": "^3.0.0",
     "pg-promise": "^10.11.0"
   },
   "devDependencies": {
index 8752bfbcc022c60dc5e2013665021b4e3ccde8d6..96f094069316150ed91f935ab2e950c5be023f4d 100644 (file)
@@ -27,6 +27,7 @@ const Enum = common.mergeDeep(DingusEnum, {
 
   ContentType: {
     ApplicationAtom: 'application/atom+xml',
+    ApplicationOctetStream: 'application/octet-stream',
     ApplicationRDF: 'application/rdf+xml',
     ApplicationRSS: 'application/rss+xml',
     ApplicationXML: 'application/xml',
index 5c6b83939f579b18491fc935c87fddd65a93131b..2b6a8334acce4b8c879448093fc1a522b5629380 100644 (file)
@@ -11,6 +11,7 @@ const Enum = require('./enum');
 const FeedParser = require('feedparser');
 const { Readable } = require('stream');
 const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
 
 const _fileScope = common.fileScope(__filename);
 
@@ -45,6 +46,7 @@ class LinkHelper {
       try {
         links.push(...parseLinkHeader(linkHeader));
       } catch (e) {
+        /* istanbul ignore else */
         if (e instanceof ParseSyntaxError) {
           this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
         } else {
@@ -52,29 +54,39 @@ class LinkHelper {
         }
       }
     }
-    const contentType = getHeader(headers, Enum.Header.ContentType);
-    if (contentType) {
-      const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
-      let bodyLinks = [];
-      switch (contentTypeBase) {
-        case Enum.ContentType.ApplicationAtom:
-        case Enum.ContentType.ApplicationRDF:
-        case Enum.ContentType.ApplicationRSS:
-        case Enum.ContentType.ApplicationXML:
-        case Enum.ContentType.TextXML: {
-          bodyLinks = await this.linksFromFeedBody(url, body);
-          break;
-        }
 
-        case Enum.ContentType.TextHTML:
-          bodyLinks = this.linksFromHTMLBody(body);
-          break;
+    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+    const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+    if (nonUTF8Charset) {
+      const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+      try {
+        body = iconv.convert(body);
+      } catch (e) {
+        /* istanbul ignore next */
+        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+        // But try to carry on, anyhow.
+      }
+    }
 
-        default:
-          this.logger.debug(_scope, 'no parser for content type', { contentType });
+    let bodyLinks = [];
+    switch (contentType.mediaType) {
+      case Enum.ContentType.ApplicationAtom:
+      case Enum.ContentType.ApplicationRDF:
+      case Enum.ContentType.ApplicationRSS:
+      case Enum.ContentType.ApplicationXML:
+      case Enum.ContentType.TextXML: {
+        bodyLinks = await this.linksFromFeedBody(url, body);
+        break;
       }
-      links.push(...bodyLinks);
+
+      case Enum.ContentType.TextHTML:
+        bodyLinks = this.linksFromHTMLBody(body);
+        break;
+
+      default:
+        this.logger.debug(_scope, 'no parser for content type', { contentType });
     }
+    links.push(...bodyLinks);
 
     // Fetch all hub relation targets from headers, resolving relative URIs.
     const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
@@ -85,6 +97,30 @@ class LinkHelper {
   }
 
 
+  /**
+   * Convert a Content-Type string to normalized components.
+   * RFC7231 §3.1.1
+   * N.B. this non-parser implementation will not work if a parameter
+   * value for some reason includes a ; or = within a quoted-string.
+   * @param {String} contentTypeHeader
+   * @returns {Object} contentType
+   * @returns {String} contentType.mediaType
+   * @returns {Object} contentType.params
+   */
+  static parseContentType(contentTypeHeader) {
+    const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
+    return {
+      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+      params: params.reduce((obj, param) => {
+        const [field, value] = param.split('=');
+        const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"';
+        obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value;
+        return obj;
+      }, {}),
+    };
+  }
+
+
   /**
    * Parse XML-ish feed content, extracting link elements into our own format.
    * @param {String} feedurl
index 535d4c9d6e94a0e9553d6878ebd7860d98212a0b..1f914935735f70abc1b0acceed37c534d5a4bc64 100644 (file)
@@ -63,6 +63,15 @@ describe('LinkHelper', function () {
       const result = await lh.validHub(url, headers, body);
       assert.strictEqual(result, expected);
     });
+    it('covers link in HTML body with charset translation', async function () {
+      headers = {
+        'content-type': 'text/html; charset=ASCII',
+      };
+      body = '<html><head><link rel="hub" href="https://example.com/hub/"></head></html>';
+      const expected = true;
+      const result = await lh.validHub(url, headers, body);
+      assert.strictEqual(result, expected);
+    });
     it('covers parser failure', async function () {
       headers = {
         link: 'Invalid Link Header',
@@ -78,6 +87,46 @@ describe('LinkHelper', function () {
     });
   }); // validHub
 
+  describe('parseContentType', function () {
+    it('handles no data', function () {
+      const expected = {
+        mediaType: 'application/octet-stream',
+        params: {},
+      };
+      const result = LinkHelper.parseContentType();
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles only media type', function () {
+      const expected = {
+        mediaType: 'application/json',
+        params: {},
+      };
+      const result = LinkHelper.parseContentType('application/json');
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles parameters', function () {
+      const expected = {
+        mediaType: 'text/html',
+        params: {
+          charset: 'ISO-8859-4',
+        },
+      };
+      const result = LinkHelper.parseContentType('text/html; charset=ISO-8859-4');
+      assert.deepStrictEqual(result, expected);
+    });
+    it('handles more parameters', function () {
+      const expected = {
+        mediaType: 'multipart/form-data',
+        params: {
+          boundary: '--123--',
+          other: 'foo',
+        },
+      };
+      const result = LinkHelper.parseContentType('multipart/form-data; boundary="--123--"; other=foo');
+      assert.deepStrictEqual(result, expected);
+    });
+  }); // parseContentType
+
   describe('absoluteURI', function () {
     it('success', function () {
       const uri = '../rel';
@@ -136,6 +185,23 @@ describe('LinkHelper', function () {
     it('parses rss', async function () {
       const feedData = testData.rssFeedBody;
       const feedUrl = testData.rssFeedUrl;
+      const expected = [
+        {
+          attributes: [
+            {
+              name: 'rel',
+              value: 'hub',
+            },
+          ],
+          target: 'https://hub.squeep.com/',
+        },
+      ];
+      const result = await lh.linksFromFeedBody(feedUrl, feedData);
+      assert.deepStrictEqual(result, expected);
+    });
+    it('parses more rss', async function () {
+      const feedData = testData.rssFeedBody2;
+      const feedUrl = testData.rssFeedUrl2;
       const expected = [
         {
           attributes: [
index f402915ee35f99506226890d2451b56b387f2c18..818d5b4c4cf28f2c846c2f79db45c5b1b2d11391 100644 (file)
@@ -1,7 +1,60 @@
 'use strict';
 
-const rssFeedUrl = 'https://puppetcircuits.wordpress.com/feed/';
-const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
+const rssFeedUrl = 'https://squeep.com/share/';
+const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/"
+     xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+     xmlns:atom="http://www.w3.org/2005/Atom"
+     xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:wfw="http://wellformedweb.org/CommentAPI/" >
+       <channel>
+               <title>Things To Share</title>
+               <link>https://squeep.com/share</link>
+               <atom:link rel="hub" href="https://hub.squeep.com/" />
+               <description>Miscellaneous contents what interest some harmless mammal.</description>
+               <image>
+                       <url>https://squeep.com/share/img/ratmap-128.png</url>
+                       <width>128</width><height>128</height>
+               </image>
+               <pubDate>Tue, 10 Aug 2021 23:21:46 GMT</pubDate>
+               <generator>Blogofile</generator>
+               <sy:updatePeriod>weekly</sy:updatePeriod>
+               <sy:updateFrequency>1</sy:updateFrequency>
+               <item>
+                       <title>arts I shall never be able to collect</title>
+                       <link>https://squeep.com/share/2015/08/00000011</link>
+                       <pubDate>Thu, 20 Aug 2015 20:42:37 PDT</pubDate>
+                       <category><![CDATA[art]]></category>
+                       <guid isPermaLink="false">00000011</guid>
+                       <description>arts I shall never be able to collect</description>
+                       <content:encoded><![CDATA[
+<p>
+<a href="http://www.followtheblackrabbit.com/">Beth Cavener</a> does phenomenal work.
+This is the piece I'd most want for my wall.  I could spend a lifetime appreciating those expressions.
+</p>
+<div>
+       <a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="1/2">
+               <img src="https://squeep.com/share/assets/2b7354e42e91cd42e161ad90243c9d6ffa1deba1-0000"
+                       style="display:inline; height:40ex;"
+                       alt="1/2"
+               />
+       </a>
+
+       <a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="2/2">
+               <img src="https://squeep.com/share/assets/470998dab71830e0dde4cd20d17a5c96ceb19278-0000"
+                       style="display:inline; height:40ex;"
+                       alt="2/2"
+               />
+       </a>
+</div>
+]]></content:encoded>
+               </item>
+       </channel>
+</rss>`;
+
+const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/';
+const rssFeedBody2 = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
        xmlns:content="http://purl.org/rss/1.0/modules/content/"
        xmlns:wfw="http://wellformedweb.org/CommentAPI/"
        xmlns:dc="http://purl.org/dc/elements/1.1/"
@@ -50,8 +103,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
                                        <wfw:commentRss>https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/</wfw:commentRss>
                        <slash:comments>0</slash:comments>
 
-
-
                <media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
                        <media:title type="html">raphaelabrams</media:title>
                </media:content>
@@ -75,8 +126,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
                                        <wfw:commentRss>https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/</wfw:commentRss>
                        <slash:comments>0</slash:comments>
 
-
-
                <media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
                        <media:title type="html">raphaelabrams</media:title>
                </media:content>
@@ -100,8 +149,6 @@ const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
                                        <wfw:commentRss>https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/</wfw:commentRss>
                        <slash:comments>1</slash:comments>
 
-
-
                <media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&#38;d=identicon&#38;r=G" medium="image">
                        <media:title type="html">raphaelabrams</media:title>
                </media:content>
@@ -170,10 +217,12 @@ const htmlBody = `<?xml version="1.0" encoding="UTF-8" ?>
 <hr />`;
 
 module.exports = {
-       atomFeedBody,
-       atomFeedUrl,
-       htmlBody,
-       htmlUrl,
+  atomFeedBody,
+  atomFeedUrl,
+  htmlBody,
+  htmlUrl,
   rssFeedBody,
-       rssFeedUrl,
+  rssFeedUrl,
+  rssFeedBody2,
+  rssFeedUrl2,
 };