update dependencies and devDependencies, fix lint issues
[websub-hub] / src / link-helper.js
1 'use strict';
2
3 /**
4 * A utility class for checking link values in a topic's data and metadata.
5 * Used to determine whether we are a valid hub for a topic.
6 */
7
8 const { parse: parseLinkHeader, SyntaxError: ParseSyntaxError } = require('@squeep/web-linking');
9 const common = require('./common');
10 const Enum = require('./enum');
11 const FeedParser = require('feedparser');
12 const { Readable } = require('stream');
13 const htmlparser2 = require('htmlparser2');
14 const { Iconv } = require('iconv');
15
16 const _fileScope = common.fileScope(__filename);
17
/**
 * Helper for accessing headers.
 * The lower-cased header name is tried first; if that key is absent, the
 * header object's own keys are compared case-insensitively, since HTTP
 * field names are case-insensitive.
 * @param {object} headers header object
 * @param {string} header header name
 * @returns {string|undefined} header content, or undefined if not present
 */
function getHeader(headers, header) {
  if (!headers || typeof header !== 'string') {
    return undefined;
  }
  const wanted = header.toLowerCase();
  const direct = headers[wanted];
  if (direct !== undefined) {
    return direct;
  }
  // Fall back to a scan for header objects which were not key-normalized.
  const matchingName = Object.keys(headers).find((name) => name.toLowerCase() === wanted);
  return matchingName === undefined ? undefined : headers[matchingName];
}
27
28
class LinkHelper {
  /**
   * @param {object} logger logger; its `debug` and `error` methods are used
   * @param {object} options configuration
   * @param {object} options.dingus dingus options
   * @param {string} options.dingus.selfBaseUrl URL compared against advertised hub links
   */
  constructor(logger, options) {
    this.logger = logger;
    this.options = options;
    this.selfUrl = options.dingus.selfBaseUrl;
  }


  /**
   * Determine if this hub is listed in response data from url.
   * Links are gathered from the Link header and, depending on the content
   * type, from the feed or HTML body; relative hub targets are resolved
   * against url before being compared to our own URL.
   * @param {string} url content url
   * @param {object} headers headers from accessing url
   * @param {string | Buffer} body body from accessing url
   * @returns {Promise<boolean>} url lists this hub
   */
  async validHub(url, headers, body) {
    const _scope = _fileScope('validHub');
    this.logger.debug(_scope, 'called', { headers, body: common.logTruncate(body, 100) });

    // Add Link headers first, as they take priority over link elements in body.
    const linkHeader = getHeader(headers, Enum.Header.Link);
    const links = [];
    if (linkHeader) {
      try {
        links.push(...parseLinkHeader(linkHeader));
      } catch (e) {
        /* istanbul ignore else */
        if (e instanceof ParseSyntaxError) {
          this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
        } else {
          this.logger.error(_scope, 'failed to parse link header', { error: e, linkHeader });
        }
      }
    }

    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
    const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;
    let content = body;
    if (nonUTF8Charset) {
      try {
        // The converter is constructed inside the try as well: an unsupported
        // charset name fails at construction, and that should be treated the
        // same as a failed conversion rather than aborting validation.
        const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
        content = iconv.convert(content).toString('utf8');
      } catch (e) {
        /* istanbul ignore next */
        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
        // But try to carry on, anyhow.
      }
    }

    let bodyLinks = [];
    switch (contentType.mediaType) {
      case Enum.ContentType.ApplicationAtom:
      case Enum.ContentType.ApplicationRDF:
      case Enum.ContentType.ApplicationRSS:
      case Enum.ContentType.ApplicationXML:
      case Enum.ContentType.TextXML: {
        bodyLinks = await this.linksFromFeedBody(url, content);
        break;
      }

      case Enum.ContentType.TextHTML:
        bodyLinks = this.linksFromHTMLBody(content);
        break;

      default:
        this.logger.debug(_scope, 'no parser for content type', { contentType });
    }
    links.push(...bodyLinks);

    // Fetch all hub relation targets from headers, resolving relative URIs.
    const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));

    this.logger.debug(_scope, 'valid hubs for url', { url, hubs });

    return hubs.includes(this.selfUrl);
  }


  /**
   * @typedef {object} ContentType
   * @property {string} mediaType media type
   * @property {object} params map of parameters
   */
  /**
   * Convert a Content-Type string to normalized components.
   * RFC7231 §3.1.1
   * Media type and parameter names are lower-cased; surrounding double
   * quotes are stripped from parameter values.  Empty parameters (such as
   * those produced by a trailing ';') and parameters lacking a name or an
   * '=' are ignored.
   * N.B. this non-parser implementation will not work if a parameter
   * value for some reason includes a ; within a quoted-string.
   * @param {string} contentTypeHeader content type header
   * @returns {ContentType} contentType
   */
  static parseContentType(contentTypeHeader) {
    const [ mediaType, ...params ] = (contentTypeHeader || '').split(/ *; */);
    return {
      mediaType: mediaType.trim().toLowerCase() || Enum.ContentType.ApplicationOctetStream,
      params: params.reduce((obj, param) => {
        // Only split on the first '=', so values may themselves contain '='.
        const separatorIndex = param.indexOf('=');
        if (separatorIndex < 1) {
          return obj;
        }
        const field = param.slice(0, separatorIndex).trim();
        if (!field) {
          return obj;
        }
        const value = param.slice(separatorIndex + 1).trim();
        const isQuoted = value.length > 1 && value.startsWith('"') && value.endsWith('"');
        obj[field.toLowerCase()] = isQuoted ? value.slice(1, -1) : value;
        return obj;
      }, {}),
    };
  }


  /**
   * Parse XML-ish feed content, extracting link elements into our own format.
   * The returned promise resolves when the parser emits 'end'; parser errors
   * are logged but do not reject.
   * NOTE(review): this relies on FeedParser still emitting 'end' after an
   * 'error' - confirm, otherwise the promise would stay pending.
   * @param {string} feedurl feed url
   * @param {string} body feed body
   * @returns {Promise<object[]>} array of link elements
   */
  async linksFromFeedBody(feedurl, body) {
    const _scope = _fileScope('linksFromFeedBody');
    this.logger.debug(_scope, 'called', { feedurl, body: common.logTruncate(body, 100) });

    const feedParser = new FeedParser({
      feedurl,
      addmeta: false,
    });
    const bodyStream = Readable.from(body);
    const links = [];

    return new Promise((resolve) => {
      feedParser.on('error', (err) => {
        // Truncate the body here like every other log line in this class.
        this.logger.debug(_scope, 'FeedParser error', { err, feedurl, body: common.logTruncate(body, 100) });
      });
      feedParser.on('end', () => {
        this.logger.debug(_scope, 'FeedParser finished', { links });
        resolve(links);
      });
      feedParser.on('meta', (meta) => {
        this.logger.debug(_scope, 'FeedParser meta', { meta });
        let feedLinks = meta['atom:link'] || [];
        if (!Array.isArray(feedLinks)) {
          // Parsing RSS seems to return a single entry for this rather than a list.
          feedLinks = [feedLinks];
        }
        for (const feedLink of feedLinks) {
          // Element attributes are read from the '@' property; skip any entry
          // lacking one instead of throwing inside the event handler.
          const linkAttributes = feedLink && feedLink['@'];
          if (!linkAttributes || typeof linkAttributes !== 'object') {
            continue;
          }
          links.push({
            target: linkAttributes.href,
            attributes: Object.entries(linkAttributes)
              .filter(([name]) => name !== 'href')
              .map(([name, value]) => ({ name, value })),
          });
        }
      });
      feedParser.on('readable', () => {
        let _item;
        while ((_item = feedParser.read())) {
          // Quietly consume remaining stream content
        }
      });

      bodyStream.pipe(feedParser);
    });
  }


  /**
   * Parse HTML-ish content, extracting link elements into our own format.
   * Every <link> element becomes one entry, with its href as the target and
   * all of its other attributes as name/value pairs.
   * @param {string} body html body
   * @returns {object[]} array of link elements
   */
  linksFromHTMLBody(body) {
    const _scope = _fileScope('linksFromHTMLBody');
    this.logger.debug(_scope, 'called', { body: common.logTruncate(body, 100) });

    const links = [];
    const parser = new htmlparser2.Parser({
      onopentag(tagName, attributes) {
        if (tagName.toLowerCase() === 'link') {
          const link = {
            target: attributes.href,
            attributes: Object.entries(attributes)
              .filter(([name]) => name !== 'href')
              .map(([name, value]) => ({ name, value })),
          };
          links.push(link);
        }
      },
    });
    parser.write(body);
    parser.end();
    return links;
  }


  /**
   * Attempt to resolve a relative target URI.
   * A uri which already parses as an absolute URL is returned unchanged;
   * otherwise it is resolved against context.  If that also fails, the
   * original uri is returned as-is.
   * @param {string} uri target
   * @param {string} context base
   * @returns {string} uri
   */
  absoluteURI(uri, context) {
    const _scope = _fileScope('absoluteURI');
    try {
      new URL(uri);
    } catch (e) { // eslint-disable-line no-unused-vars
      try {
        uri = new URL(uri, context).href;
      } catch (e) { // eslint-disable-line no-unused-vars
        this.logger.debug(_scope, 'could not resolve link URI', { uri, context });
      }
    }
    return uri;
  }


  /**
   * Return all link targets with a hub relation.
   * The rel attribute is treated as a whitespace-separated list of relation
   * types and compared case-insensitively.  Links without a string target
   * (e.g. a link element lacking an href) are omitted.
   * @param {object[]} links array of link objects
   * @returns {string[]} array of hub targets
   */
  static locateHubTargets(links) {
    const isHubRelation = (attr) => {
      if (!attr || String(attr.name).toLowerCase() !== 'rel') {
        return false;
      }
      return String(attr.value).toLowerCase().split(/\s+/).includes('hub');
    };
    return links
      .filter((link) => link && Array.isArray(link.attributes) && link.attributes.some(isHubRelation))
      .map((link) => link.target)
      .filter((target) => typeof target === 'string');
  }

}
251
252 module.exports = LinkHelper;