X-Git-Url: http://git.squeep.com/?a=blobdiff_plain;f=src%2Flink-helper.js;h=93a947be58f284d63939000f5f342a3bf4fc5770;hb=fddda3a0f044d889dfa70781d2f415f2d5f64169;hp=0517dec6e6d73ba3d9986578e488b87289cbf1b4;hpb=83c2fbfb85a6b47983ef94cff240dd1660b59495;p=websub-hub

diff --git a/src/link-helper.js b/src/link-helper.js
index 0517dec..93a947b 100644
--- a/src/link-helper.js
+++ b/src/link-helper.js
@@ -11,6 +11,7 @@ const Enum = require('./enum');
 const FeedParser = require('feedparser');
 const { Readable } = require('stream');
 const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
 
 const _fileScope = common.fileScope(__filename);
 
@@ -45,6 +46,7 @@ class LinkHelper {
       try {
         links.push(...parseLinkHeader(linkHeader));
       } catch (e) {
+        /* istanbul ignore else */
         if (e instanceof ParseSyntaxError) {
           this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
         } else {
@@ -52,29 +54,39 @@ class LinkHelper {
         }
       }
     }
-    const contentType = getHeader(headers, Enum.Header.ContentType);
-    if (contentType) {
-      const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
-      let bodyLinks = [];
-      switch (contentTypeBase) {
-        case Enum.ContentType.ApplicationAtom:
-        case Enum.ContentType.ApplicationRDF:
-        case Enum.ContentType.ApplicationRSS:
-        case Enum.ContentType.ApplicationXML:
-        case Enum.ContentType.TextXML: {
-          bodyLinks = await this.linksFromFeedBody(url, body);
-          break;
-        }
 
-        case Enum.ContentType.TextHTML:
-          bodyLinks = this.linksFromHTMLBody(body);
-          break;
+    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+    const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+    if (nonUTF8Charset) {
+      const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+      try {
+        body = iconv.convert(body).toString('utf8');
+      } catch (e) {
+        /* istanbul ignore next */
+        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+        // But try to carry on, anyhow.
+      }
+    }
 
-        default:
-          this.logger.debug(_scope, 'no parser for content type', { contentType });
+    let bodyLinks = [];
+    switch (contentType.mediaType) {
+      case Enum.ContentType.ApplicationAtom:
+      case Enum.ContentType.ApplicationRDF:
+      case Enum.ContentType.ApplicationRSS:
+      case Enum.ContentType.ApplicationXML:
+      case Enum.ContentType.TextXML: {
+        bodyLinks = await this.linksFromFeedBody(url, body);
+        break;
       }
-      links.push(...bodyLinks);
+
+      case Enum.ContentType.TextHTML:
+        bodyLinks = this.linksFromHTMLBody(body);
+        break;
+
+      default:
+        this.logger.debug(_scope, 'no parser for content type', { contentType });
     }
+    links.push(...bodyLinks);
 
     // Fetch all hub relation targets from headers, resolving relative URIs.
     const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
@@ -85,6 +97,30 @@ class LinkHelper {
   }
 
 
+  /**
+   * Convert a Content-Type string to normalized components.
+   * RFC7231 §3.1.1
+   * N.B. this non-parser implementation will not work if a parameter
+   * value for some reason includes a ; or = within a quoted-string.
+   * @param {String} contentTypeHeader
+   * @returns {Object} contentType
+   * @returns {String} contentType.mediaType
+   * @returns {Object} contentType.params
+   */
+  static parseContentType(contentTypeHeader) {
+    const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
+    return {
+      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+      params: params.filter(Boolean).reduce((obj, param) => { // skip empty segments, e.g. after a trailing ';'
+        const [field, value = ''] = param.split('='); // a parameter lacking '=' gets an empty value instead of throwing
+        const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"';
+        obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value;
+        return obj;
+      }, {}),
+    };
+  }
+
+
   /**
    * Parse XML-ish feed content, extracting link elements into our own format.
    * @param {String} feedurl
@@ -112,7 +148,11 @@ class LinkHelper {
       });
       feedParser.on('meta', (meta) => {
         this.logger.debug(_scope, 'FeedParser meta', { meta });
-        const feedLinks = meta['atom:link'] || [];
+        let feedLinks = meta['atom:link'] || [];
+        if (!Array.isArray(feedLinks)) {
+          // Parsing RSS seems to return a single entry for this rather than a list.
+          feedLinks = [feedLinks];
+        }
         feedLinks
           .map((l) => l['@'])
           .forEach((l) => {