2b6a8334acce4b8c879448093fc1a522b5629380
[websub-hub] / src / link-helper.js
1 'use strict';
2
3 /**
4 * A utility class for checking link values in a topic's data and metadata.
5 * Used to determine whether this hub is a valid hub for a topic.
6 */
7
8 const { parse: parseLinkHeader, SyntaxError: ParseSyntaxError } = require('@squeep/web-linking');
9 const common = require('./common');
10 const Enum = require('./enum');
11 const FeedParser = require('feedparser');
12 const { Readable } = require('stream');
13 const htmlparser2 = require('htmlparser2');
14 const { Iconv } = require('iconv');
15
16 const _fileScope = common.fileScope(__filename);
17
/**
 * Look up a header value by name, ignoring case.
 * HTTP field names are case-insensitive, so an exact lowercase key is
 * tried first and, failing that, the keys are scanned case-insensitively.
 * @param {Object} headers map of header names to values
 * @param {String} header name of the header to fetch
 * @returns {*} the header value, or undefined if not present
 */
function getHeader(headers, header) {
  if (!headers) {
    // No headers at all; nothing to find.
    return undefined;
  }
  const wanted = header.toLowerCase();
  const direct = headers[wanted];
  if (direct !== undefined) {
    return direct;
  }
  const matchingKey = Object.keys(headers).find((key) => key.toLowerCase() === wanted);
  return matchingKey === undefined ? undefined : headers[matchingKey];
}
21
22
class LinkHelper {
  /**
   * @param {Object} logger object providing debug() and error() methods
   * @param {Object} options
   * @param {Object} options.dingus
   * @param {String} options.dingus.selfBaseUrl URL which identifies this hub
   */
  constructor(logger, options) {
    this.logger = logger;
    this.options = options;
    this.selfUrl = options.dingus.selfBaseUrl;
  }


  /**
   * Determine if this hub is listed in response data from url.
   * Links are gathered from the Link header and from the body (feed or
   * HTML, chosen by Content-Type), and every target with a hub relation
   * is compared against this hub's own URL.
   * @param {String} url
   * @param {Object} headers
   * @param {String|Buffer} body
   * @returns {Promise<Boolean>}
   */
  async validHub(url, headers, body) {
    const _scope = _fileScope('validHub');
    this.logger.debug(_scope, 'called', { headers, body: common.logTruncate(body, 100) });

    // Add Link headers first, as they take priority over link elements in body.
    const linkHeader = getHeader(headers, Enum.Header.Link);
    const links = [];
    if (linkHeader) {
      try {
        links.push(...parseLinkHeader(linkHeader));
      } catch (e) {
        /* istanbul ignore else */
        if (e instanceof ParseSyntaxError) {
          this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
        } else {
          this.logger.error(_scope, 'failed to parse link header', { error: e, linkHeader });
        }
      }
    }

    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
    const { charset } = contentType.params;
    let content = body;
    if (charset && !/utf-*8/i.test(charset)) {
      // The converter is constructed inside the try block, as an unsupported
      // charset name from the remote server makes the constructor throw.
      try {
        const iconv = new Iconv(charset, 'utf-8//translit//ignore');
        content = iconv.convert(content);
      } catch (e) {
        /* istanbul ignore next */
        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
        // But try to carry on, anyhow.
      }
    }
    if (Buffer.isBuffer(content)) {
      // The body parsers are documented as taking strings; content is UTF-8 by now.
      content = content.toString('utf8');
    }

    let bodyLinks = [];
    switch (contentType.mediaType) {
      case Enum.ContentType.ApplicationAtom:
      case Enum.ContentType.ApplicationRDF:
      case Enum.ContentType.ApplicationRSS:
      case Enum.ContentType.ApplicationXML:
      case Enum.ContentType.TextXML:
        bodyLinks = await this.linksFromFeedBody(url, content);
        break;

      case Enum.ContentType.TextHTML:
        bodyLinks = this.linksFromHTMLBody(content);
        break;

      default:
        this.logger.debug(_scope, 'no parser for content type', { contentType });
    }
    links.push(...bodyLinks);

    // Fetch all hub relation targets from headers, resolving relative URIs.
    const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));

    this.logger.debug(_scope, 'valid hubs for url', { url, hubs });

    return hubs.includes(this.selfUrl);
  }


  /**
   * Convert a Content-Type string to normalized components.
   * RFC 7231 §3.1.1
   * N.B. this non-parser implementation will not work if a parameter
   * value for some reason includes a ; within a quoted-string.
   * Parameters lacking a name or an '=' (such as the empty one left by a
   * trailing ';') are ignored.
   * @param {String} contentTypeHeader
   * @returns {Object} contentType
   * @returns {String} contentType.mediaType lowercased, defaulting to octet-stream
   * @returns {Object} contentType.params values keyed by lowercased parameter name
   */
  static parseContentType(contentTypeHeader) {
    const [ mediaType, ...params ] = (contentTypeHeader || '').split(';');
    return {
      mediaType: mediaType.trim().toLowerCase() || Enum.ContentType.ApplicationOctetStream,
      params: params.reduce((obj, param) => {
        // Split on the first '=' only, so values may themselves contain '='.
        const separatorIndex = param.indexOf('=');
        if (separatorIndex < 0) {
          return obj;
        }
        const field = param.slice(0, separatorIndex).trim().toLowerCase();
        if (!field) {
          return obj;
        }
        const value = param.slice(separatorIndex + 1).trim();
        const isQuoted = value.length >= 2 && value.startsWith('"') && value.endsWith('"');
        obj[field] = isQuoted ? value.slice(1, -1) : value;
        return obj;
      }, {}),
    };
  }


  /**
   * Convert a map of element attributes into our own link format, with
   * href as the target and every other attribute as a name/value pair.
   * @param {Object} attributes
   * @returns {Object} link
   * @returns {String} link.target
   * @returns {Object[]} link.attributes
   */
  static _linkFromAttributes(attributes) {
    return {
      target: attributes.href,
      attributes: Object.entries(attributes)
        .filter(([name]) => name !== 'href')
        .map(([name, value]) => ({ name, value })),
    };
  }


  /**
   * Parse XML-ish feed content, extracting link elements into our own format.
   * Parser errors are logged but do not reject; the result resolves when
   * the parser emits 'end'.
   * @param {String} feedurl
   * @param {String} body
   * @returns {Promise<Object[]>}
   */
  async linksFromFeedBody(feedurl, body) {
    const _scope = _fileScope('linksFromFeedBody');
    this.logger.debug(_scope, 'called', { feedurl, body: common.logTruncate(body, 100) });

    const feedParser = new FeedParser({
      feedurl,
      addmeta: false,
    });
    const bodyStream = Readable.from(body);
    const links = [];

    return new Promise((resolve) => {
      feedParser.on('error', (err) => {
        // Truncate the body here as in every other log call.
        this.logger.debug(_scope, 'FeedParser error', { err, feedurl, body: common.logTruncate(body, 100) });
      });
      feedParser.on('end', () => {
        this.logger.debug(_scope, 'FeedParser finished', { links });
        resolve(links);
      });
      feedParser.on('meta', (meta) => {
        this.logger.debug(_scope, 'FeedParser meta', { meta });
        let feedLinks = meta['atom:link'] || [];
        if (!Array.isArray(feedLinks)) {
          // Parsing RSS seems to return a single entry for this rather than a list.
          feedLinks = [feedLinks];
        }
        feedLinks
          .map((feedLink) => feedLink['@'])
          .filter((attributes) => attributes) // Skip any entry carrying no attribute map.
          .forEach((attributes) => {
            links.push(LinkHelper._linkFromAttributes(attributes));
          });
      });
      feedParser.on('readable', () => {
        let _item;
        while ((_item = feedParser.read())) {
          // Quietly consume remaining stream content
        }
      });

      bodyStream.pipe(feedParser);
    });
  }


  /**
   * Parse HTML-ish content, extracting link elements into our own format.
   * @param {String} body
   * @returns {Object[]}
   */
  linksFromHTMLBody(body) {
    const _scope = _fileScope('linksFromHTMLBody');
    this.logger.debug(_scope, 'called', { body: common.logTruncate(body, 100) });

    const links = [];
    const parser = new htmlparser2.Parser({
      onopentag(tagName, attributes) {
        if (tagName.toLowerCase() === 'link') {
          links.push(LinkHelper._linkFromAttributes(attributes));
        }
      },
    });
    parser.write(body);
    parser.end();
    return links;
  }


  /**
   * Attempt to resolve a relative target URI.
   * A uri which already parses as absolute is returned untouched; otherwise
   * it is resolved against context, and returned as-is if that also fails.
   * @param {String} uri
   * @param {String} context
   * @returns {String}
   */
  absoluteURI(uri, context) {
    const _scope = _fileScope('absoluteURI');
    try {
      new URL(uri);
    } catch (notAbsolute) {
      try {
        uri = new URL(uri, context).href;
      } catch (notResolvable) {
        this.logger.debug(_scope, 'could not resolve link URI', { uri, context });
      }
    }
    return uri;
  }


  /**
   * Return all link targets with a hub relation.
   * A rel value is a whitespace-separated list of relation types, which
   * are compared case-insensitively (RFC 8288).
   * @param {Object[]} links
   * @returns {String[]}
   */
  static locateHubTargets(links) {
    const isHubRelation = ({ name, value }) => String(name).toLowerCase() === 'rel'
      && String(value).toLowerCase().split(/\s+/).includes('hub');
    return links
      .filter((link) => link.attributes.some(isHubRelation))
      .map((link) => link.target);
  }

}
241
242 module.exports = LinkHelper;