git.squeep.com Git - akkoma/blob - lib/pleroma/html.ex

   1 # Pleroma: A lightweight social networking server
   2 # Copyright © 2017-2019 Pleroma Authors <https://pleroma.social/>
   3 # SPDX-License-Identifier: AGPL-3.0-only
   4
   defmodule Pleroma.HTML do
     @moduledoc """
     HTML sanitizing helpers built on `HtmlSanitizeEx.Scrubber`, plus cached
     variants that store their results in the `:scrubber_cache` Cachex cache.
     """

     alias HtmlSanitizeEx.Scrubber

     # `nil` is an atom, so it has to be matched ahead of the `is_atom/1` clause.
     # Without this clause an unset policy turns into `[nil]`, and `filter_tags/2`
     # then bounces between its `nil` and list clauses without ever terminating.
     defp get_scrubbers(nil), do: [Pleroma.HTML.Scrubber.Default]
     defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber]
     defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers
     defp get_scrubbers(_), do: [Pleroma.HTML.Scrubber.Default]

     @doc """
     Returns the scrubbers configured under `[:markup, :scrub_policy]` as a list.

     A single module is wrapped in a list and a list is returned as is. A missing
     (`nil`) value, or one that is neither an atom nor a list, yields
     `[Pleroma.HTML.Scrubber.Default]`.
     """
     def get_scrubbers do
       [:markup, :scrub_policy]
       |> Pleroma.Config.get()
       |> get_scrubbers()
     end

     @doc """
     Sanitizes `html` with the given scrubber(s).

       * `nil` - uses the scrubbers returned by `get_scrubbers/0`
       * a list - applies every scrubber in order, feeding each result to the next
       * anything else - handed to `HtmlSanitizeEx.Scrubber.scrub/2` as one scrubber
     """
     def filter_tags(html, nil), do: filter_tags(html, get_scrubbers())

     def filter_tags(html, scrubbers) when is_list(scrubbers) do
       Enum.reduce(scrubbers, html, fn scrubber, acc -> filter_tags(acc, scrubber) end)
     end

     def filter_tags(html, scrubber), do: Scrubber.scrub(html, scrubber)

     @doc "Sanitizes `html` with the configured scrubbers; same as `filter_tags(html, nil)`."
     def filter_tags(html), do: filter_tags(html, nil)

     @doc "Scrubs `html` with the `HtmlSanitizeEx.Scrubber.StripTags` policy."
     def strip_tags(html), do: Scrubber.scrub(html, Scrubber.StripTags)

     @doc """
     Scrubs `content` for `activity`, going through the `:scrubber_cache` cache.

     The cache key is `key`, followed by the concatenated scrubber names, `"|"`
     and `activity.id`. The cache fallback looks up the activity's object with
     `Pleroma.Object.normalize/1` and delegates to `ensure_scrubbed_html/4`,
     passing the object's `"fake"` data entry (or `false` when it is unset).

     `callback` receives the scrubbed content and its return value is what gets
     stored; it defaults to the identity function.
     """
     def get_cached_scrubbed_html_for_activity(
           content,
           scrubbers,
           activity,
           key \\ "",
           callback \\ fn x -> x end
         ) do
       key = "#{key}#{generate_scrubber_signature(scrubbers)}|#{activity.id}"

       Cachex.fetch!(:scrubber_cache, key, fn _key ->
         object = Pleroma.Object.normalize(activity)
         ensure_scrubbed_html(content, scrubbers, object.data["fake"] || false, callback)
       end)
     end

     @doc """
     Same as `get_cached_scrubbed_html_for_activity/5` using the
     `HtmlSanitizeEx.Scrubber.StripTags` scrubber; the stripped text is
     additionally passed through `HtmlEntities.decode/1`.
     """
     def get_cached_stripped_html_for_activity(content, activity, key) do
       get_cached_scrubbed_html_for_activity(
         content,
         Scrubber.StripTags,
         activity,
         key,
         &HtmlEntities.decode/1
       )
     end

     @doc """
     Scrubs `content` with `scrubbers`, applies `callback` to the result and tags
     it for use as a cache fallback result.

     Returns `{:ignore, content}` when `fake` is truthy and `{:commit, content}`
     otherwise.
     """
     def ensure_scrubbed_html(content, scrubbers, fake, callback) do
       content =
         content
         |> filter_tags(scrubbers)
         |> callback.()

       if fake do
         {:ignore, content}
       else
         {:commit, content}
       end
     end

     # Builds the scrubber part of a cache key: the string forms of all scrubbers
     # concatenated without a separator.
     defp generate_scrubber_signature(scrubber) when is_atom(scrubber) do
       generate_scrubber_signature([scrubber])
     end

     defp generate_scrubber_signature(scrubbers) do
       Enum.map_join(scrubbers, "", &to_string/1)
     end

     @doc """
     Finds the first link in `content` that is not a mention.

     Returns `{:error, "No content"}` when `content` is `nil`. Otherwise the
     lookup goes through `:scrubber_cache` under the key `"URL|<object.id>"`; the
     fallback drops `a.mention` elements, takes the first `href` of the remaining
     `a` elements (`nil` when there is none) and commits `{:ok, url}`.
     """
     def extract_first_external_url(_, nil), do: {:error, "No content"}

     def extract_first_external_url(object, content) do
       key = "URL|#{object.id}"

       Cachex.fetch!(:scrubber_cache, key, fn _key ->
         result =
           content
           |> Floki.filter_out("a.mention")
           |> Floki.attribute("a", "href")
           |> Enum.at(0)

         {:commit, {:ok, result}}
       end)
     end
   end
 100
 101 defmodule Pleroma.HTML.Scrubber.TwitterText do
 102   @moduledoc """
 103   An HTML scrubbing policy which limits to twitter-style text.  Only
 104   paragraphs, breaks and links are allowed through the filter.
 105   """
 106
       # NOTE(review): besides the tags named in the moduledoc, the rules below
       # also allow `span` and, depending on configuration, `img`.
       #
       # Both attributes are module attributes, so they are evaluated while this
       # module is being compiled; the rules generated below reflect the
       # configuration that was present at that point.
 107   @markup Application.get_env(:pleroma, :markup)
 108   @valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], [])
 109
 110   require HtmlSanitizeEx.Scrubber.Meta
 111   alias HtmlSanitizeEx.Scrubber.Meta
 112
       # The policy is declared entirely through HtmlSanitizeEx.Scrubber.Meta
       # macros. NOTE(review): the declaration order presumably determines the
       # order of the generated clauses - keep it intact when editing.
 113   Meta.remove_cdata_sections_before_scrub()
 114   Meta.strip_comments()
 115
 116   # links
       # The URI-carrying attributes of `a` are checked against @valid_schemes.
 117   Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes)
 118
       # `class` on `a` is limited to exactly these values.
 119   Meta.allow_tag_with_this_attribute_values("a", "class", [
 120     "hashtag",
 121     "u-url",
 122     "mention",
 123     "u-url mention",
 124     "mention u-url"
 125   ])
 126
       # `rel` on `a` is limited to exactly these values.
 127   Meta.allow_tag_with_this_attribute_values("a", "rel", [
 128     "tag",
 129     "nofollow",
 130     "noopener",
 131     "noreferrer"
 132   ])
 133
 134   Meta.allow_tag_with_these_attributes("a", ["name", "title"])
 135
 136   # paragraphs and linebreaks
 137   Meta.allow_tag_with_these_attributes("br", [])
 138   Meta.allow_tag_with_these_attributes("p", [])
 139
 140   # microformats
 141   Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"])
 142   Meta.allow_tag_with_these_attributes("span", [])
 143
 144   # allow inline images for custom emoji
 145   @allow_inline_images Keyword.get(@markup, :allow_inline_images)
 146
       # The img rules are only generated when :allow_inline_images in the :markup
       # configuration was truthy at compile time.
 147   if @allow_inline_images do
 148     # restrict img tags to http/https only, because of MediaProxy.
 149     Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"])
 150
 151     Meta.allow_tag_with_these_attributes("img", [
 152       "width",
 153       "height",
 154       "class",
 155       "title",
 156       "alt"
 157     ])
 158   end
 159
       # NOTE(review): going by its name this emits the catch-all rules, so it
       # presumably has to remain the last declaration - confirm against
       # HtmlSanitizeEx.Scrubber.Meta.
 160   Meta.strip_everything_not_covered()
 161 end
 162
 defmodule Pleroma.HTML.Scrubber.Default do
   @moduledoc """
   The default HTML scrubbing policy.

   Allows links, `abbr`, `span` and a set of basic formatting tags (`b`,
   `blockquote`, `br`, `code`, `del`, `em`, `i`, `li`, `ol`, `p`, `pre`,
   `strong`, `u`, `ul`). Inline images, tables, headings (`h1`-`h5`) and `font`
   tags are allowed only when the matching `:allow_inline_images`,
   `:allow_tables`, `:allow_headings` or `:allow_fonts` key of the `:markup`
   configuration is truthy when this module is compiled.
   """

   require HtmlSanitizeEx.Scrubber.Meta
   alias HtmlSanitizeEx.Scrubber.Meta
   # credo:disable-for-previous-line
   # No idea how to fix this one…

   # Module attributes are evaluated at compile time, so the rules generated
   # below reflect the configuration present when this module was built.
   @markup Application.get_env(:pleroma, :markup)
   @valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], [])

   Meta.remove_cdata_sections_before_scrub()
   Meta.strip_comments()

   # links: URI-carrying attributes are checked against @valid_schemes
   Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes)

   Meta.allow_tag_with_this_attribute_values("a", "class", [
     "hashtag",
     "u-url",
     "mention",
     "u-url mention",
     "mention u-url"
   ])

   Meta.allow_tag_with_this_attribute_values("a", "rel", [
     "tag",
     "nofollow",
     "noopener",
     "noreferrer"
   ])

   Meta.allow_tag_with_these_attributes("a", ["name", "title"])

   Meta.allow_tag_with_these_attributes("abbr", ["title"])

   # basic formatting tags, all without attributes
   Meta.allow_tag_with_these_attributes("b", [])
   Meta.allow_tag_with_these_attributes("blockquote", [])
   Meta.allow_tag_with_these_attributes("br", [])
   Meta.allow_tag_with_these_attributes("code", [])
   Meta.allow_tag_with_these_attributes("del", [])
   Meta.allow_tag_with_these_attributes("em", [])
   Meta.allow_tag_with_these_attributes("i", [])
   Meta.allow_tag_with_these_attributes("li", [])
   Meta.allow_tag_with_these_attributes("ol", [])
   Meta.allow_tag_with_these_attributes("p", [])
   Meta.allow_tag_with_these_attributes("pre", [])
   Meta.allow_tag_with_these_attributes("strong", [])
   Meta.allow_tag_with_these_attributes("u", [])
   Meta.allow_tag_with_these_attributes("ul", [])

   # microformats
   Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"])
   Meta.allow_tag_with_these_attributes("span", [])

   @allow_inline_images Keyword.get(@markup, :allow_inline_images)

   if @allow_inline_images do
     # restrict img tags to http/https only, because of MediaProxy.
     Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"])

     Meta.allow_tag_with_these_attributes("img", [
       "width",
       "height",
       "class",
       "title",
       "alt"
     ])
   end

   @allow_tables Keyword.get(@markup, :allow_tables)

   if @allow_tables do
     Meta.allow_tag_with_these_attributes("table", [])
     Meta.allow_tag_with_these_attributes("tbody", [])
     Meta.allow_tag_with_these_attributes("td", [])
     Meta.allow_tag_with_these_attributes("th", [])
     Meta.allow_tag_with_these_attributes("thead", [])
     Meta.allow_tag_with_these_attributes("tr", [])
   end

   @allow_headings Keyword.get(@markup, :allow_headings)

   if @allow_headings do
     Meta.allow_tag_with_these_attributes("h1", [])
     Meta.allow_tag_with_these_attributes("h2", [])
     Meta.allow_tag_with_these_attributes("h3", [])
     Meta.allow_tag_with_these_attributes("h4", [])
     Meta.allow_tag_with_these_attributes("h5", [])
   end

   @allow_fonts Keyword.get(@markup, :allow_fonts)

   if @allow_fonts do
     Meta.allow_tag_with_these_attributes("font", ["face"])
   end

   # NOTE(review): going by its name this emits the catch-all rules, so it
   # presumably has to remain the last declaration - confirm against
   # HtmlSanitizeEx.Scrubber.Meta.
   Meta.strip_everything_not_covered()
 end
 260
 defmodule Pleroma.HTML.Transform.MediaProxy do
   @moduledoc "Transforms inline image URIs to use MediaProxy."

   alias Pleroma.Web.MediaProxy

   # No preprocessing is done; the markup is returned untouched.
   def before_scrub(html), do: html

   # Only `src` values on `img` tags that begin with "http" are rewritten;
   # every other attribute is returned unchanged by the clause below.
   def scrub_attribute("img", {"src", "http" <> _rest = url}) do
     {"src", MediaProxy.url(url)}
   end

   def scrub_attribute(_tag, attribute), do: attribute

   # Image nodes get each attribute run through `scrub_attribute/2`; `nil`
   # results are dropped.
   def scrub({"img", attributes, children}) do
     rewritten =
       attributes
       |> Enum.map(&scrub_attribute("img", &1))
       |> Enum.reject(&is_nil/1)

     {"img", rewritten, children}
   end

   # Comments are replaced by an empty string.
   def scrub({:comment, _children}), do: ""

   # Any other three-element node passes through as is, two-element nodes are
   # reduced to their children, and everything else is returned untouched.
   def scrub({tag, attributes, children}), do: {tag, attributes, children}
   def scrub({_tag, children}), do: children
   def scrub(text), do: text
 end