fix feed parser links to always be list
[websub-hub] / src / link-helper.js
1 'use strict';
2
3 /**
4 * A utility class for checking link values in a topic's data and metadata.
5 * Used to determine if we are a valid hub for topic.
6 */
7
8 const { parse: parseLinkHeader, SyntaxError: ParseSyntaxError } = require('@squeep/web-linking');
9 const common = require('./common');
10 const Enum = require('./enum');
11 const FeedParser = require('feedparser');
12 const { Readable } = require('stream');
13 const htmlparser2 = require('htmlparser2');
14
15 const _fileScope = common.fileScope(__filename);
16
/**
 * Look up a header value, using the lower-cased form of the header name as the key.
 * @param {Object} headers header values keyed by lower-case header name
 * @param {String} header header name, in any letter case
 * @returns {*} the stored value, or undefined when no such key exists
 */
function getHeader(headers, header) {
  const lookupKey = header.toLowerCase();
  return headers[lookupKey];
}
20
21
/**
 * Checks the link values found in a topic's response headers and body,
 * to determine whether this hub is advertised as a hub for that topic.
 */
class LinkHelper {
  /**
   * @param {Object} logger object providing debug() and error() methods
   * @param {Object} options
   * @param {Object} options.dingus
   * @param {String} options.dingus.selfBaseUrl URL compared against discovered hub links
   */
  constructor(logger, options) {
    this.logger = logger;
    this.options = options;
    this.selfUrl = options.dingus.selfBaseUrl;
  }


  /**
   * Determine if this hub is listed in response data from url.
   * Links are gathered from the Link header and then from the body, with the
   * body parser chosen by Content-Type; hub-relation targets are resolved
   * against url and compared to this hub's own URL.
   * @param {String} url
   * @param {Object} headers
   * @param {String|Buffer} body
   * @returns {Promise<Boolean>}
   */
  async validHub(url, headers, body) {
    const _scope = _fileScope('validHub');
    this.logger.debug(_scope, 'called', { headers, body: common.logTruncate(body, 100) });

    // Add Link headers first, as they take priority over link elements in body.
    const linkHeader = getHeader(headers, Enum.Header.Link);
    const links = [];
    if (linkHeader) {
      try {
        links.push(...parseLinkHeader(linkHeader));
      } catch (e) {
        // A malformed header is the publisher's problem; anything else is ours.
        if (e instanceof ParseSyntaxError) {
          this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
        } else {
          this.logger.error(_scope, 'failed to parse link header', { error: e, linkHeader });
        }
      }
    }

    const contentType = getHeader(headers, Enum.Header.ContentType);
    if (contentType) {
      // Parameters follow the media type after ';', with or without
      // surrounding whitespace (e.g. 'text/html;charset=utf-8').
      const contentTypeBase = contentType.split(';')[0].trim();
      let bodyLinks = [];
      switch (contentTypeBase) {
        case Enum.ContentType.ApplicationAtom:
        case Enum.ContentType.ApplicationRDF:
        case Enum.ContentType.ApplicationRSS:
        case Enum.ContentType.ApplicationXML:
        case Enum.ContentType.TextXML: {
          bodyLinks = await this.linksFromFeedBody(url, body);
          break;
        }

        case Enum.ContentType.TextHTML:
          bodyLinks = this.linksFromHTMLBody(body);
          break;

        default:
          this.logger.debug(_scope, 'no parser for content type', { contentType });
      }
      links.push(...bodyLinks);
    }

    // Fetch all hub relation targets, resolving relative URIs.
    // Links lacking a usable target (e.g. a link element without href) are
    // dropped, as resolving undefined would yield a bogus '<base>/undefined'.
    const hubs = LinkHelper.locateHubTargets(links)
      .filter((target) => typeof target === 'string')
      .map((target) => this.absoluteURI(target, url));

    this.logger.debug(_scope, 'valid hubs for url', { url, hubs });

    return hubs.includes(this.selfUrl);
  }


  /**
   * Parse XML-ish feed content, extracting link elements into our own format.
   * Parser errors are only logged; the returned promise settles with whatever
   * links were collected once the parser emits 'end'.
   * @param {String} feedurl
   * @param {String} body
   * @returns {Promise<Object[]>} list of { target, attributes: [{ name, value }] }
   */
  async linksFromFeedBody(feedurl, body) {
    const _scope = _fileScope('linksFromFeedBody');
    this.logger.debug(_scope, 'called', { feedurl, body: common.logTruncate(body, 100) });

    const feedParser = new FeedParser({
      feedurl,
      addmeta: false,
    });
    const bodyStream = Readable.from(body);
    const links = [];

    return new Promise((resolve) => {
      feedParser.on('error', (err) => {
        this.logger.debug(_scope, 'FeedParser error', { err, feedurl, body });
      });
      feedParser.on('end', () => {
        this.logger.debug(_scope, 'FeedParser finished', { links });
        resolve(links);
      });
      feedParser.on('meta', (meta) => {
        this.logger.debug(_scope, 'FeedParser meta', { meta });
        let feedLinks = meta['atom:link'] || [];
        if (!Array.isArray(feedLinks)) {
          // Parsing RSS seems to return a single entry for this rather than a list.
          feedLinks = [feedLinks];
        }
        feedLinks
          .map((entry) => entry && entry['@'])
          // Skip entries without an attribute map rather than throwing
          // from inside the stream's event handler.
          .filter((attrs) => attrs && typeof attrs === 'object')
          .forEach((attrs) => {
            links.push({
              target: attrs.href,
              attributes: Object.entries(attrs)
                .filter(([name]) => name !== 'href')
                .map(([name, value]) => ({ name, value })),
            });
          });
      });
      feedParser.on('readable', () => {
        let _item;
        while ((_item = feedParser.read())) {
          // Quietly consume remaining stream content
        }
      });

      bodyStream.pipe(feedParser);
    });
  }


  /**
   * Parse HTML-ish content, extracting link elements into our own format.
   * @param {String} body
   * @returns {Object[]} list of { target, attributes: [{ name, value }] }
   */
  linksFromHTMLBody(body) {
    const _scope = _fileScope('linksFromHTMLBody');
    this.logger.debug(_scope, 'called', { body: common.logTruncate(body, 100) });

    const links = [];
    const parser = new htmlparser2.Parser({
      onopentag(tagName, attributes) {
        if (tagName.toLowerCase() === 'link') {
          // href becomes the target; every other attribute is carried along.
          links.push({
            target: attributes.href,
            attributes: Object.entries(attributes)
              .filter(([name]) => name !== 'href')
              .map(([name, value]) => ({ name, value })),
          });
        }
      },
    });
    parser.write(body);
    parser.end();
    return links;
  }


  /**
   * Attempt to resolve a relative target URI.
   * A uri which already parses as absolute is returned unchanged; otherwise
   * it is resolved against context. If that also fails, the failure is
   * logged and the original uri is returned.
   * @param {String} uri
   * @param {String} context
   * @returns {String}
   */
  absoluteURI(uri, context) {
    const _scope = _fileScope('absoluteURI');
    try {
      new URL(uri);
    } catch (e) {
      // Not absolute on its own; try it relative to the context URL.
      try {
        uri = new URL(uri, context).href;
      } catch (e2) {
        this.logger.debug(_scope, 'could not resolve link URI', { uri, context });
      }
    }
    return uri;
  }


  /**
   * Return all link targets with a hub relation.
   * The rel attribute is treated as a whitespace-separated list of relation
   * types, matched case-insensitively. Links without an attribute list are
   * skipped.
   * @param {Object[]} links list of { target, attributes: [{ name, value }] }
   * @returns {String[]}
   */
  static locateHubTargets(links) {
    const isHubRelation = (attr) => String(attr.name).toLowerCase() === 'rel'
      && String(attr.value).toLowerCase().split(/\s+/).includes('hub');

    return links
      .filter((link) => Array.isArray(link.attributes) && link.attributes.some(isHubRelation))
      .map((link) => link.target);
  }

}
205
206 module.exports = LinkHelper;