From: Justin Wind
Date: Wed, 11 Aug 2021 19:58:12 +0000 (-0700)
Subject: Merge branch 'v1.1-dev' as v1.1.2
X-Git-Tag: v1.1.2
X-Git-Url: http://git.squeep.com/?a=commitdiff_plain;h=409ff988982a5edfcd51c02c681187969db57d0a;hp=ac22211a9bc13cfe4dc3e66a55b1c7f9fd84268c;p=websub-hub
Merge branch 'v1.1-dev' as v1.1.2
---
diff --git a/CHANGELOG.md b/CHANGELOG.md
index aaa0b4f..71d4dfe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,16 @@ Releases and notable changes to this project are documented here.
## [Unreleased]
+## [v1.1.2] - 2021-08-11
+
+### Added
+
+- Make use of the content-type charset when parsing topic content, recoding to UTF-8 when needed.
+
+### Fixed
+
+- Feed parser could return a non-list for a single link entry; handle that case.
+
## [v1.1.1] - 2021-08-09
### Fixed
@@ -25,6 +35,8 @@ Releases and notable changes to this project are documented here.
---
-[Unreleased]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=HEAD;hp=v1.1.0
+[Unreleased]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=HEAD;hp=v1.1.2
+[v1.1.2]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=v1.1.2;hp=v1.1.1
+[v1.1.1]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=v1.1.1;hp=v1.1.0
[v1.1.0]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=v1.1.0;hp=v1.0.0
[v1.0.0]: https://git.squeep.com/?p=websub-hub;a=commitdiff;h=v1.0.0;hp=v0.0.0
diff --git a/package-lock.json b/package-lock.json
index 4a11338..fe33cd0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,6 +1,6 @@
{
"name": "websub-hub",
- "version": "1.1.1",
+ "version": "1.1.2",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@@ -1783,6 +1783,11 @@
"debug": "4"
}
},
+ "iconv": {
+ "version": "3.0.0",
+ "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz",
+ "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g=="
+ },
"ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
diff --git a/package.json b/package.json
index f1fe201..ef0eaf9 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "websub-hub",
- "version": "1.1.1",
+ "version": "1.1.2",
"description": "A WebSub Hub server implementation.",
"main": "server.js",
"scripts": {
@@ -39,6 +39,7 @@
"better-sqlite3": "^7.4.3",
"feedparser": "^2.2.10",
"htmlparser2": "^6.1.0",
+ "iconv": "^3.0.0",
"pg-promise": "^10.11.0"
},
"devDependencies": {
diff --git a/src/enum.js b/src/enum.js
index 8752bfb..96f0940 100644
--- a/src/enum.js
+++ b/src/enum.js
@@ -27,6 +27,7 @@ const Enum = common.mergeDeep(DingusEnum, {
ContentType: {
ApplicationAtom: 'application/atom+xml',
+ ApplicationOctetStream: 'application/octet-stream',
ApplicationRDF: 'application/rdf+xml',
ApplicationRSS: 'application/rss+xml',
ApplicationXML: 'application/xml',
diff --git a/src/link-helper.js b/src/link-helper.js
index 0517dec..2b6a833 100644
--- a/src/link-helper.js
+++ b/src/link-helper.js
@@ -11,6 +11,7 @@ const Enum = require('./enum');
const FeedParser = require('feedparser');
const { Readable } = require('stream');
const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
const _fileScope = common.fileScope(__filename);
@@ -45,6 +46,7 @@ class LinkHelper {
try {
links.push(...parseLinkHeader(linkHeader));
} catch (e) {
+ /* istanbul ignore else */
if (e instanceof ParseSyntaxError) {
this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
} else {
@@ -52,29 +54,39 @@ class LinkHelper {
}
}
}
- const contentType = getHeader(headers, Enum.Header.ContentType);
- if (contentType) {
- const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
- let bodyLinks = [];
- switch (contentTypeBase) {
- case Enum.ContentType.ApplicationAtom:
- case Enum.ContentType.ApplicationRDF:
- case Enum.ContentType.ApplicationRSS:
- case Enum.ContentType.ApplicationXML:
- case Enum.ContentType.TextXML: {
- bodyLinks = await this.linksFromFeedBody(url, body);
- break;
- }
- case Enum.ContentType.TextHTML:
- bodyLinks = this.linksFromHTMLBody(body);
- break;
+ const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+ const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+ if (nonUTF8Charset) {
+ const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+ try {
+ body = iconv.convert(body);
+ } catch (e) {
+ /* istanbul ignore next */
+ this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+ // But try to carry on, anyhow.
+ }
+ }
- default:
- this.logger.debug(_scope, 'no parser for content type', { contentType });
+ let bodyLinks = [];
+ switch (contentType.mediaType) {
+ case Enum.ContentType.ApplicationAtom:
+ case Enum.ContentType.ApplicationRDF:
+ case Enum.ContentType.ApplicationRSS:
+ case Enum.ContentType.ApplicationXML:
+ case Enum.ContentType.TextXML: {
+ bodyLinks = await this.linksFromFeedBody(url, body);
+ break;
}
- links.push(...bodyLinks);
+
+ case Enum.ContentType.TextHTML:
+ bodyLinks = this.linksFromHTMLBody(body);
+ break;
+
+ default:
+ this.logger.debug(_scope, 'no parser for content type', { contentType });
}
+ links.push(...bodyLinks);
// Fetch all hub relation targets from headers, resolving relative URIs.
const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
@@ -85,6 +97,30 @@ class LinkHelper {
}
+  /**
+   * Convert a Content-Type header value to normalized components.
+   * RFC7231 §3.1.1
+   * The media type and parameter names are lower-cased; parameter values
+   * keep their case, with one pair of surrounding double-quotes removed.
+   * A missing or empty header yields the application/octet-stream media type.
+   * Empty parameters (e.g. from a trailing ';') are skipped, and a parameter
+   * with no '=' is recorded with an empty-string value rather than throwing.
+   * N.B. this non-parser implementation will not work if a parameter
+   * value for some reason includes a ; or = within a quoted-string.
+   * @param {String} contentTypeHeader
+   * @returns {Object} contentType
+   * @returns {String} contentType.mediaType
+   * @returns {Object} contentType.params
+   */
+  static parseContentType(contentTypeHeader) {
+    const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
+    return {
+      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+      params: params.reduce((obj, param) => {
+        // A parameter lacking '=' would destructure value as undefined; default it.
+        const [field, value = ''] = param.split('=');
+        if (!field) {
+          // Nothing to name this parameter by, e.g. the tail of 'text/html;'.
+          return obj;
+        }
+        // Require both delimiters, so a lone '"' is not mistaken for a quoted-string.
+        const isQuoted = value.length > 1 && value.charAt(0) === '"' && value.charAt(value.length - 1) === '"';
+        obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value;
+        return obj;
+      }, {}),
+    };
+  }
+
+
/**
* Parse XML-ish feed content, extracting link elements into our own format.
* @param {String} feedurl
@@ -112,7 +148,11 @@ class LinkHelper {
});
feedParser.on('meta', (meta) => {
this.logger.debug(_scope, 'FeedParser meta', { meta });
- const feedLinks = meta['atom:link'] || [];
+ let feedLinks = meta['atom:link'] || [];
+ if (!Array.isArray(feedLinks)) {
+ // Parsing RSS seems to return a single entry for this rather than a list.
+ feedLinks = [feedLinks];
+ }
feedLinks
.map((l) => l['@'])
.forEach((l) => {
diff --git a/src/template/template-helper.js b/src/template/template-helper.js
index b48a90a..d3962f9 100644
--- a/src/template/template-helper.js
+++ b/src/template/template-helper.js
@@ -228,7 +228,7 @@ function htmlFooter() {
+
+]]>
+
+
+`;
+
+const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/';
+const rssFeedBody2 = `https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/
0
-
-
raphaelabrams
@@ -75,8 +126,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/
0
-
-
raphaelabrams
@@ -100,8 +149,6 @@ const rssFeedBody = `https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/
1
-
-
raphaelabrams
@@ -170,10 +217,12 @@ const htmlBody = `
`;
module.exports = {
- atomFeedBody,
- atomFeedUrl,
- htmlBody,
- htmlUrl,
+ atomFeedBody,
+ atomFeedUrl,
+ htmlBody,
+ htmlUrl,
rssFeedBody,
- rssFeedUrl,
+ rssFeedUrl,
+ rssFeedBody2,
+ rssFeedUrl2,
};