X-Git-Url: http://git.squeep.com/?a=blobdiff_plain;f=src%2Flink-helper.js;h=93a947be58f284d63939000f5f342a3bf4fc5770;hb=fddda3a0f044d889dfa70781d2f415f2d5f64169;hp=0517dec6e6d73ba3d9986578e488b87289cbf1b4;hpb=83c2fbfb85a6b47983ef94cff240dd1660b59495;p=websub-hub diff --git a/src/link-helper.js b/src/link-helper.js index 0517dec..93a947b 100644 --- a/src/link-helper.js +++ b/src/link-helper.js @@ -11,6 +11,7 @@ const Enum = require('./enum'); const FeedParser = require('feedparser'); const { Readable } = require('stream'); const htmlparser2 = require('htmlparser2'); +const { Iconv } = require('iconv'); const _fileScope = common.fileScope(__filename); @@ -45,6 +46,7 @@ class LinkHelper { try { links.push(...parseLinkHeader(linkHeader)); } catch (e) { + /* istanbul ignore else */ if (e instanceof ParseSyntaxError) { this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader }); } else { @@ -52,29 +54,39 @@ class LinkHelper { } } } - const contentType = getHeader(headers, Enum.Header.ContentType); - if (contentType) { - const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/); - let bodyLinks = []; - switch (contentTypeBase) { - case Enum.ContentType.ApplicationAtom: - case Enum.ContentType.ApplicationRDF: - case Enum.ContentType.ApplicationRSS: - case Enum.ContentType.ApplicationXML: - case Enum.ContentType.TextXML: { - bodyLinks = await this.linksFromFeedBody(url, body); - break; - } - case Enum.ContentType.TextHTML: - bodyLinks = this.linksFromHTMLBody(body); - break; + const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType)); + const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset; + if (nonUTF8Charset) { + const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore'); + try { + body = iconv.convert(body).toString('utf8'); + } catch (e) { + /* istanbul ignore next */ + this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url }); + // But try to carry on, anyhow. + } + } - default: - this.logger.debug(_scope, 'no parser for content type', { contentType }); + let bodyLinks = []; + switch (contentType.mediaType) { + case Enum.ContentType.ApplicationAtom: + case Enum.ContentType.ApplicationRDF: + case Enum.ContentType.ApplicationRSS: + case Enum.ContentType.ApplicationXML: + case Enum.ContentType.TextXML: { + bodyLinks = await this.linksFromFeedBody(url, body); + break; } - links.push(...bodyLinks); + + case Enum.ContentType.TextHTML: + bodyLinks = this.linksFromHTMLBody(body); + break; + + default: + this.logger.debug(_scope, 'no parser for content type', { contentType }); } + links.push(...bodyLinks); // Fetch all hub relation targets from headers, resolving relative URIs. const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url)); @@ -85,6 +97,30 @@ class LinkHelper { } + /** + * Convert a Content-Type string to normalized components. + * RFC7231 §3.1.1 + * N.B. this non-parser implementation will not work if a parameter + * value for some reason includes a ; or = within a quoted-string. + * @param {String} contentTypeHeader + * @returns {Object} contentType + * @returns {String} contentType.mediaType + * @returns {Object} contentType.params + */ + static parseContentType(contentTypeHeader) { + const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */); + return { + mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream, + params: params.reduce((obj, param) => { + const [field, value] = param.split('='); + const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"'; + obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value; + return obj; + }, {}), + }; + } + + /** * Parse XML-ish feed content, extracting link elements into our own format. * @param {String} feedurl @@ -112,7 +148,11 @@ class LinkHelper { }); feedParser.on('meta', (meta) => { this.logger.debug(_scope, 'FeedParser meta', { meta }); - const feedLinks = meta['atom:link'] || []; + let feedLinks = meta['atom:link'] || []; + if (!Array.isArray(feedLinks)) { + // Parsing RSS seems to return a single entry for this rather than a list. + feedLinks = [feedLinks]; + } feedLinks .map((l) => l['@']) .forEach((l) => {