/**
 * A utility class for checking link values in a topic's data and metadata.
 * Used to determine whether we are a valid hub for a topic.
 */
// Link header parser and the error type it raises on malformed header syntax.
const { parse: parseLinkHeader, SyntaxError: ParseSyntaxError } = require('@squeep/web-linking');
const common = require('./common');
const Enum = require('./enum');
const FeedParser = require('feedparser');
const { Readable } = require('stream');
const htmlparser2 = require('htmlparser2');
const { Iconv } = require('iconv');

// Builds the per-function scope labels passed as the first argument of log calls.
const _fileScope = common.fileScope(__filename);
/**
 * Helper for accessing headers.
 * The lookup key is the lower-cased header name, so the headers object is
 * expected to be keyed by lower-case names.
 * @param {object} headers header object
 * @param {string} header header name, in any case
 * @returns {string|undefined} header content, or undefined when not present
 */
function getHeader(headers, header) {
  return headers[header.toLowerCase()];
}
/**
 * Inspects the links a topic advertises (Link headers plus link elements in
 * the fetched content) to decide whether this hub is among the topic's hubs.
 */
class LinkHelper {
  /**
   * @param {object} logger logger providing debug() and error()
   * @param {object} options configuration
   * @param {object} options.dingus dingus settings
   * @param {string} options.dingus.selfBaseUrl URL compared against advertised hub targets
   */
  constructor(logger, options) {
    this.logger = logger;
    this.options = options;
    this.selfUrl = options.dingus.selfBaseUrl;
  }


  /**
   * Determine if this hub is listed in response data from url.
   * @param {string} url content url
   * @param {object} headers headers from accessing url
   * @param {string | Buffer} body body from accessing url
   * @returns {Promise<boolean>} url lists this hub
   */
  async validHub(url, headers, body) {
    const _scope = _fileScope('validHub');
    this.logger.debug(_scope, 'called', { headers, body: common.logTruncate(body, 100) });

    // Add Link headers first, as they take priority over link elements in body.
    const linkHeader = getHeader(headers, Enum.Header.Link);
    const links = [];
    if (linkHeader) {
      try {
        links.push(...parseLinkHeader(linkHeader));
      } catch (e) {
        /* istanbul ignore else */
        if (e instanceof ParseSyntaxError) {
          this.logger.debug(_scope, 'failed to parse link header, bad syntax', { error: e, linkHeader });
        } else {
          this.logger.error(_scope, 'failed to parse link header', { error: e, linkHeader });
        }
      }
    }

    const contentType = LinkHelper.parseContentType(getHeader(headers, Enum.Header.ContentType));
    // Falsy when no charset was declared or when the declared charset is some spelling of UTF-8.
    const nonUTF8Charset = !/utf-*8/i.test(contentType.params.charset) && contentType.params.charset;

    // Work on a local copy rather than reassigning the caller-visible parameter.
    let content = body;
    if (nonUTF8Charset) {
      try {
        // The converter is constructed inside the try so that an unsupported
        // charset label is handled like any other conversion failure.
        const iconv = new Iconv(nonUTF8Charset, 'utf-8//translit//ignore');
        content = iconv.convert(content).toString('utf8');
      } catch (e) {
        /* istanbul ignore next */
        this.logger.error(_scope, 'iconv conversion error', { error: e, contentType, url });
        // But try to carry on, anyhow.
      }
    }

    let bodyLinks = [];
    switch (contentType.mediaType) {
      case Enum.ContentType.ApplicationAtom:
      case Enum.ContentType.ApplicationRDF:
      case Enum.ContentType.ApplicationRSS:
      case Enum.ContentType.ApplicationXML:
      case Enum.ContentType.TextXML: {
        bodyLinks = await this.linksFromFeedBody(url, content);
        break;
      }

      case Enum.ContentType.TextHTML:
        bodyLinks = this.linksFromHTMLBody(content);
        break;

      default:
        this.logger.debug(_scope, 'no parser for content type', { contentType });
    }
    links.push(...bodyLinks);

    // Fetch all hub relation targets from headers and body, resolving relative URIs.
    const hubs = LinkHelper.locateHubTargets(links).map((link) => this.absoluteURI(link, url));

    this.logger.debug(_scope, 'valid hubs for url', { url, hubs });

    return hubs.includes(this.selfUrl);
  }


  /**
   * @typedef {object} ContentType
   * @property {string} mediaType media type
   * @property {object} params map of parameters
   */

  /**
   * Convert a Content-Type string to normalized components.
   * The media type and parameter names are lower-cased; surrounding double
   * quotes are stripped from parameter values. Segments that are empty or
   * carry no '=' (such as the one left by a trailing ';') are ignored.
   * A missing or empty header yields Enum.ContentType.ApplicationOctetStream.
   * N.B. this non-parser implementation will not work if a parameter
   * value for some reason includes a ; within a quoted-string.
   * @param {string} contentTypeHeader content type header
   * @returns {ContentType} contentType
   */
  static parseContentType(contentTypeHeader) {
    const [ mediaType, ...params ] = (contentTypeHeader || '').trim().split(/\s*;\s*/);
    return {
      mediaType: mediaType.toLowerCase() || Enum.ContentType.ApplicationOctetStream,
      params: params.reduce((obj, param) => {
        // Split on the first '=' only, so values may themselves contain '='.
        const separator = param.indexOf('=');
        if (separator < 1) {
          // Empty segment or missing parameter name: nothing to record.
          return obj;
        }
        const field = param.slice(0, separator).trim().toLowerCase();
        const value = param.slice(separator + 1).trim();
        const isQuoted = value.length > 1 && value.startsWith('"') && value.endsWith('"');
        obj[field] = isQuoted ? value.slice(1, -1) : value;
        return obj;
      }, {}),
    };
  }


  /**
   * Parse XML-ish feed content, extracting link elements into our own format.
   * The returned promise settles when the parser emits 'end'; parser errors
   * are logged at debug level and do not reject.
   * @param {string} feedurl feed url
   * @param {string} body feed body
   * @returns {Promise<object[]>} array of link elements
   */
  async linksFromFeedBody(feedurl, body) {
    const _scope = _fileScope('linksFromFeedBody');
    this.logger.debug(_scope, 'called', { feedurl, body: common.logTruncate(body, 100) });

    // NOTE(review): parser options were not visible in the reviewed text and
    // are reconstructed here — confirm against the project history.
    const feedParser = new FeedParser({
      feedurl,
      addmeta: false,
    });
    const bodyStream = Readable.from(body);
    const links = [];

    return new Promise((resolve) => {
      feedParser.on('error', (err) => {
        this.logger.debug(_scope, 'FeedParser error', { err, feedurl, body });
      });
      feedParser.on('end', () => {
        this.logger.debug(_scope, 'FeedParser finished', { links });
        resolve(links);
      });
      feedParser.on('meta', (meta) => {
        this.logger.debug(_scope, 'FeedParser meta', { meta });
        let feedLinks = meta['atom:link'] || [];
        if (!Array.isArray(feedLinks)) {
          // Parsing RSS seems to return a single entry for this rather than a list.
          feedLinks = [feedLinks];
        }
        feedLinks
          // NOTE(review): element attributes are presumed to live under the
          // '@' key of each entry — confirm against feedparser documentation.
          .map((entry) => entry && entry['@'])
          // Skip entries without an attribute map instead of throwing inside the handler.
          .filter(Boolean)
          .forEach((l) => {
            links.push({
              target: l.href,
              attributes: Object.entries(l)
                .filter(([name]) => name !== 'href')
                .map(([name, value]) => ({ name, value })),
            });
          });
      });
      feedParser.on('readable', () => {
        while (feedParser.read() !== null) {
          // Quietly consume remaining stream content
        }
      });

      bodyStream.pipe(feedParser);
    });
  }


  /**
   * Parse HTML-ish content, extracting link elements into our own format.
   * @param {string} body html body
   * @returns {object[]} array of link elements
   */
  linksFromHTMLBody(body) {
    const _scope = _fileScope('linksFromHTMLBody');
    this.logger.debug(_scope, 'called', { body: common.logTruncate(body, 100) });

    const links = [];
    const parser = new htmlparser2.Parser({
      onopentag(tagName, attributes) {
        if (tagName.toLowerCase() === 'link') {
          links.push({
            target: attributes.href,
            attributes: Object.entries(attributes)
              .filter(([name]) => name !== 'href')
              .map(([name, value]) => ({ name, value })),
          });
        }
      },
    });
    parser.write(body);
    parser.end();
    return links;
  }


  /**
   * Attempt to resolve a relative target URI.
   * A uri that already parses as absolute is returned untouched; otherwise it
   * is resolved against context. When that also fails, the failure is logged
   * and the uri is returned as given.
   * @param {string} uri target
   * @param {string} context base
   * @returns {string} uri
   */
  absoluteURI(uri, context) {
    const _scope = _fileScope('absoluteURI');
    try {
      new URL(uri); // throws unless uri is already absolute
      return uri;
    } catch (e) { // eslint-disable-line no-unused-vars
      // Not absolute; fall through to resolving against the context.
    }
    try {
      return new URL(uri, context).href;
    } catch (e) { // eslint-disable-line no-unused-vars
      this.logger.debug(_scope, 'could not resolve link URI', { uri, context });
    }
    return uri;
  }


  /**
   * Return all link targets with a hub relation.
   * A link qualifies when one of its 'rel' attributes lists 'hub' among its
   * whitespace-separated relation types, compared case-insensitively.
   * @param {object[]} links array of link objects
   * @returns {string[]} array of hub targets
   */
  static locateHubTargets(links) {
    const isHubRelation = (attr) => String(attr.name).toLowerCase() === 'rel'
      && String(attr.value).toLowerCase().split(/\s+/).includes('hub');
    return links
      .filter((link) => link.attributes.some(isHubRelation))
      .map((link) => link.target);
  }

}
module.exports = LinkHelper;