## [Unreleased]
+### Added
+
+- Make use of the Content-Type charset when parsing topic content, recoding to UTF-8 when needed.
+
+### Fixed
+
+- Handle the case where the feed parser returns a non-list for a single link entry.
+
## [v1.1.1] - 2021-08-09
### Fixed
"debug": "4"
}
},
+ "iconv": {
+ "version": "3.0.0",
+ "resolved": "https://registry.npmjs.org/iconv/-/iconv-3.0.0.tgz",
+ "integrity": "sha512-bKTEP55J/e+UutBE3BDBWq6KukPWh3GBYCZGbLEY9vxRDUU2F3bqvPsp/a/DEdIamgF2MvW5lF0Rj1U/7KRL+g=="
+ },
"ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
"better-sqlite3": "^7.4.3",
"feedparser": "^2.2.10",
"htmlparser2": "^6.1.0",
+ "iconv": "^3.0.0",
"pg-promise": "^10.11.0"
},
"devDependencies": {
ContentType: {
ApplicationAtom: 'application/atom+xml',
+ ApplicationOctetStream: 'application/octet-stream',
ApplicationRDF: 'application/rdf+xml',
ApplicationRSS: 'application/rss+xml',
ApplicationXML: 'application/xml',
const FeedParser = require('feedparser');
const { Readable } = require('stream');
const htmlparser2 = require('htmlparser2');
+const { Iconv } = require('iconv');
const _fileScope = common.fileScope(__filename);
try {
links.push(...parseLinkHeader(linkHeader));
} catch (e) {
+ /* istanbul ignore else */
if (e instanceof ParseSyntaxError) {
this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
} else {
}
}
}
- const contentType = getHeader(headers, Enum.Header.ContentType);
- if (contentType) {
- const [contentTypeBase, _contentTypeEncoding] = contentType.split(/; +/);
- let bodyLinks = [];
- switch (contentTypeBase) {
- case Enum.ContentType.ApplicationAtom:
- case Enum.ContentType.ApplicationRDF:
- case Enum.ContentType.ApplicationRSS:
- case Enum.ContentType.ApplicationXML:
- case Enum.ContentType.TextXML: {
- bodyLinks = await this.linksFromFeedBody(url, body);
- break;
- }
- case Enum.ContentType.TextHTML:
- bodyLinks = this.linksFromHTMLBody(body);
- break;
+ const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
+ const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
+ if (nonUTF8Charset) {
+ const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
+ try {
+ body = iconv.convert(body);
+ } catch (e) {
+ /* istanbul ignore next */
+ this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
+ // But try to carry on, anyhow.
+ }
+ }
- default:
- this.logger.debug(_scope, 'no parser for content type', { contentType });
+ let bodyLinks = [];
+ switch (contentType.mediaType) {
+ case Enum.ContentType.ApplicationAtom:
+ case Enum.ContentType.ApplicationRDF:
+ case Enum.ContentType.ApplicationRSS:
+ case Enum.ContentType.ApplicationXML:
+ case Enum.ContentType.TextXML: {
+ bodyLinks = await this.linksFromFeedBody(url, body);
+ break;
}
- links.push(...bodyLinks);
+
+ case Enum.ContentType.TextHTML:
+ bodyLinks = this.linksFromHTMLBody(body);
+ break;
+
+ default:
+ this.logger.debug(_scope, 'no parser for content type', { contentType });
}
+ links.push(...bodyLinks);
// Fetch all hub relation targets from headers, resolving relative URIs.
const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));
}
+ /**
+ * Convert a Content-Type string to normalized components.
+ * RFC7231 ยง3.1.1
+ * N.B. this non-parser implementation will not work if a parameter
+ * value for some reason includes a ; or = within a quoted-string.
+ * @param {String} contentTypeHeader
+ * @returns {Object} contentType
+ * @returns {String} contentType.mediaType
+ * @returns {Object} contentType.params
+ */
+ static parseContentType(contentTypeHeader) {
+ const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
+ return {
+ mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
+ params: params.reduce((obj, param) => {
+ const [field, value] = param.split('=');
+ const isQuoted = value.charAt(0) === '"' && value.charAt(value.length - 1) === '"';
+ obj[field.toLowerCase()] = isQuoted ? value.slice(1, value.length - 1) : value;
+ return obj;
+ }, {}),
+ };
+ }
+
+
/**
* Parse XML-ish feed content, extracting link elements into our own format.
* @param {String} feedurl
const result = await lh.validHub(url, headers, body);
assert.strictEqual(result, expected);
});
+ // Declares a non-UTF-8 charset (ASCII) in the content-type header for an
+ // HTML body that carries a rel="hub" link, and expects validHub to resolve true.
+ it('covers link in HTML body with charset translation', async function () {
+ headers = {
+ 'content-type': 'text/html; charset=ASCII',
+ };
+ body = '<html><head><link rel="hub" href="https://example.com/hub/"></head></html>';
+ const expected = true;
+ const result = await lh.validHub(url, headers, body);
+ assert.strictEqual(result, expected);
+ });
it('covers parser failure', async function () {
headers = {
link: 'Invalid Link Header',
});
}); // validHub
+ // Unit tests for the static LinkHelper.parseContentType helper: each case
+ // passes a Content-Type header value and compares the returned
+ // { mediaType, params } object using deepStrictEqual.
+ describe('parseContentType', function () {
+ // No argument at all: expects the application/octet-stream media type and no params.
+ it('handles no data', function () {
+ const expected = {
+ mediaType: 'application/octet-stream',
+ params: {},
+ };
+ const result = LinkHelper.parseContentType();
+ assert.deepStrictEqual(result, expected);
+ });
+ // A bare media type with no parameters yields an empty params object.
+ it('handles only media type', function () {
+ const expected = {
+ mediaType: 'application/json',
+ params: {},
+ };
+ const result = LinkHelper.parseContentType('application/json');
+ assert.deepStrictEqual(result, expected);
+ });
+ // A single unquoted parameter; the value's case is expected to be preserved.
+ it('handles parameters', function () {
+ const expected = {
+ mediaType: 'text/html',
+ params: {
+ charset: 'ISO-8859-4',
+ },
+ };
+ const result = LinkHelper.parseContentType('text/html; charset=ISO-8859-4');
+ assert.deepStrictEqual(result, expected);
+ });
+ // Multiple parameters, one of them quoted; the surrounding quotes are expected to be removed.
+ it('handles more parameters', function () {
+ const expected = {
+ mediaType: 'multipart/form-data',
+ params: {
+ boundary: '--123--',
+ other: 'foo',
+ },
+ };
+ const result = LinkHelper.parseContentType('multipart/form-data; boundary="--123--"; other=foo');
+ assert.deepStrictEqual(result, expected);
+ });
+ }); // parseContentType
+
describe('absoluteURI', function () {
it('success', function () {
const uri = '../rel';
it('parses rss', async function () {
const feedData = testData.rssFeedBody;
const feedUrl = testData.rssFeedUrl;
+ const expected = [
+ {
+ attributes: [
+ {
+ name: 'rel',
+ value: 'hub',
+ },
+ ],
+ target: 'https://hub.squeep.com/',
+ },
+ ];
+ const result = await lh.linksFromFeedBody(feedUrl, feedData);
+ assert.deepStrictEqual(result, expected);
+ });
+ it('parses more rss', async function () {
+ const feedData = testData.rssFeedBody2;
+ const feedUrl = testData.rssFeedUrl2;
const expected = [
{
attributes: [
'use strict';
-const rssFeedUrl = 'https://puppetcircuits.wordpress.com/feed/';
-const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
+const rssFeedUrl = 'https://squeep.com/share/';
+const rssFeedBody = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+ xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:wfw="http://wellformedweb.org/CommentAPI/" >
+ <channel>
+ <title>Things To Share</title>
+ <link>https://squeep.com/share</link>
+ <atom:link rel="hub" href="https://hub.squeep.com/" />
+ <description>Miscellaneous contents what interest some harmless mammal.</description>
+ <image>
+ <url>https://squeep.com/share/img/ratmap-128.png</url>
+ <width>128</width><height>128</height>
+ </image>
+ <pubDate>Tue, 10 Aug 2021 23:21:46 GMT</pubDate>
+ <generator>Blogofile</generator>
+ <sy:updatePeriod>weekly</sy:updatePeriod>
+ <sy:updateFrequency>1</sy:updateFrequency>
+ <item>
+ <title>arts I shall never be able to collect</title>
+ <link>https://squeep.com/share/2015/08/00000011</link>
+ <pubDate>Thu, 20 Aug 2015 20:42:37 PDT</pubDate>
+ <category><![CDATA[art]]></category>
+ <guid isPermaLink="false">00000011</guid>
+ <description>arts I shall never be able to collect</description>
+ <content:encoded><![CDATA[
+<p>
+<a href="http://www.followtheblackrabbit.com/">Beth Cavener</a> does phenomenal work.
+This is the piece I'd most want for my wall. I could spend a lifetime appreciating those expressions.
+</p>
+<div>
+ <a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="1/2">
+ <img src="https://squeep.com/share/assets/2b7354e42e91cd42e161ad90243c9d6ffa1deba1-0000"
+ style="display:inline; height:40ex;"
+ alt="1/2"
+ />
+ </a>
+
+ <a href="http://www.followtheblackrabbit.com/gallery/the-sentimental-question-2/" title="2/2">
+ <img src="https://squeep.com/share/assets/470998dab71830e0dde4cd20d17a5c96ceb19278-0000"
+ style="display:inline; height:40ex;"
+ alt="2/2"
+ />
+ </a>
+</div>
+]]></content:encoded>
+ </item>
+ </channel>
+</rss>`;
+
+const rssFeedUrl2 = 'https://puppetcircuits.wordpress.com/feed/';
+const rssFeedBody2 = `<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
<wfw:commentRss>https://puppetcircuits.wordpress.com/2012/02/10/been-a-while-and-fun-video-from-eric/feed/</wfw:commentRss>
<slash:comments>0</slash:comments>
-
-
<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&d=identicon&r=G" medium="image">
<media:title type="html">raphaelabrams</media:title>
</media:content>
<wfw:commentRss>https://puppetcircuits.wordpress.com/2011/02/22/botacon/feed/</wfw:commentRss>
<slash:comments>0</slash:comments>
-
-
<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&d=identicon&r=G" medium="image">
<media:title type="html">raphaelabrams</media:title>
</media:content>
<wfw:commentRss>https://puppetcircuits.wordpress.com/2011/01/22/oddball-hotplate/feed/</wfw:commentRss>
<slash:comments>1</slash:comments>
-
-
<media:content url="https://0.gravatar.com/avatar/c19e5ba7f3abc36656779edcc9c6b6eb?s=96&d=identicon&r=G" medium="image">
<media:title type="html">raphaelabrams</media:title>
</media:content>
<hr />`;
module.exports = {
- atomFeedBody,
- atomFeedUrl,
- htmlBody,
- htmlUrl,
+ atomFeedBody,
+ atomFeedUrl,
+ htmlBody,
+ htmlUrl,
rssFeedBody,
- rssFeedUrl,
+ rssFeedUrl,
+ rssFeedBody2,
+ rssFeedUrl2,
};