git.squeep.com Git - akkoma/blob - lib/pleroma/html.ex

   1 # Pleroma: A lightweight social networking server
   2 # Copyright © 2017-2019 Pleroma Authors <https://pleroma.social/>
   3 # SPDX-License-Identifier: AGPL-3.0-only
   4
   defmodule Pleroma.HTML do
     @moduledoc """
     Helpers for scrubbing HTML through the configured scrubbing policies,
     caching scrubbed output per activity, and pulling the first non-mention
     link out of a piece of content.
     """

     alias HtmlSanitizeEx.Scrubber

     # Normalises the configured scrub policy into a list of scrubber modules.
     #
     # `nil` is itself an atom, so it has to be matched before the `is_atom/1`
     # clause. Otherwise an unset policy turns into `[nil]`, and `filter_tags/2`
     # bounces between its `nil` clause and `get_scrubbers/0` without ever
     # terminating.
     defp get_scrubbers(nil), do: [Pleroma.HTML.Scrubber.Default]
     defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber]
     defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers
     defp get_scrubbers(_), do: [Pleroma.HTML.Scrubber.Default]

     @doc """
     Returns the list of scrubbers configured under `[:markup, :scrub_policy]`.

     A single module is wrapped in a list and a list is returned unchanged. A
     missing (`nil`) or otherwise unsupported value falls back to
     `Pleroma.HTML.Scrubber.Default`.
     """
     def get_scrubbers do
       get_scrubbers(Pleroma.Config.get([:markup, :scrub_policy]))
     end

     @doc """
     Scrubs `html` with the given scrubber, or with every scrubber of a list,
     applied in order.

     Passing `nil` applies the configured scrubbers (see `get_scrubbers/0`).
     """
     def filter_tags(html, nil) do
       filter_tags(html, get_scrubbers())
     end

     def filter_tags(html, scrubbers) when is_list(scrubbers) do
       # Each scrubber receives the output of the previous one.
       Enum.reduce(scrubbers, html, fn scrubber, scrubbed ->
         filter_tags(scrubbed, scrubber)
       end)
     end

     def filter_tags(html, scrubber), do: Scrubber.scrub(html, scrubber)

     @doc "Scrubs `html` with the configured scrubbers."
     def filter_tags(html), do: filter_tags(html, nil)

     @doc "Scrubs `html` with `HtmlSanitizeEx.Scrubber.StripTags`."
     def strip_tags(html), do: Scrubber.scrub(html, Scrubber.StripTags)

     @doc """
     Returns the scrubbed form of `content` for `activity`, going through the
     `:scrubber_cache` cache.

     The cache key is built from `key`, a signature of `scrubbers` and the
     activity id. On a cache miss the activity is normalized to its object, the
     content is scrubbed and passed through `callback`, and the object's
     `"fake"` flag is forwarded to `ensure_scrubbed_html/4`.
     """
     def get_cached_scrubbed_html_for_activity(
           content,
           scrubbers,
           activity,
           key \\ "",
           callback \\ fn x -> x end
         ) do
       key = "#{key}#{generate_scrubber_signature(scrubbers)}|#{activity.id}"

       Cachex.fetch!(:scrubber_cache, key, fn _key ->
         object = Pleroma.Object.normalize(activity)
         ensure_scrubbed_html(content, scrubbers, object.data["fake"] || false, callback)
       end)
     end

     @doc """
     Same as `get_cached_scrubbed_html_for_activity/5`, but strips all tags with
     `HtmlSanitizeEx.Scrubber.StripTags` and decodes HTML entities afterwards.
     """
     def get_cached_stripped_html_for_activity(content, activity, key) do
       get_cached_scrubbed_html_for_activity(
         content,
         Scrubber.StripTags,
         activity,
         key,
         &HtmlEntities.decode/1
       )
     end

     @doc """
     Scrubs `content` with `scrubbers`, applies `callback` to the result and
     tags it for use as a cache fallback value.

     Returns `{:ignore, content}` when `fake` is truthy and `{:commit, content}`
     otherwise. With Cachex fallbacks, `:commit` results are written to the
     cache and `:ignore` results are returned without being stored.
     """
     def ensure_scrubbed_html(
           content,
           scrubbers,
           fake,
           callback
         ) do
       content = callback.(filter_tags(content, scrubbers))

       if fake do
         {:ignore, content}
       else
         {:commit, content}
       end
     end

     # Builds the scrubber part of a cache key by concatenating the string form
     # of every scrubber; a single scrubber is treated as a one-element list.
     defp generate_scrubber_signature(scrubber) when is_atom(scrubber) do
       generate_scrubber_signature([scrubber])
     end

     defp generate_scrubber_signature(scrubbers) do
       Enum.map_join(scrubbers, "", &to_string/1)
     end

     @doc """
     Extracts the first link target from `content`, cached under `"URL|<object id>"`.

     Returns `{:error, "No content"}` when `content` is `nil`. Otherwise returns
     `{:ok, href}`, where `href` is the first `href` attribute found on an `a`
     element once `a.mention` elements have been filtered out, or `nil` when
     there is none.
     """
     def extract_first_external_url(_, nil), do: {:error, "No content"}

     def extract_first_external_url(object, content) do
       key = "URL|#{object.id}"

       Cachex.fetch!(:scrubber_cache, key, fn _key ->
         result =
           content
           |> Floki.filter_out("a.mention")
           |> Floki.attribute("a", "href")
           |> Enum.at(0)

         {:commit, {:ok, result}}
       end)
     end
   end
 100
 defmodule Pleroma.HTML.Scrubber.TwitterText do
   @moduledoc """
   An HTML scrubbing policy which limits to twitter-style text.  Only
   paragraphs, breaks and links are allowed through the filter.

   `span` elements (for microformats) are let through as well, and `img`
   elements are let through when `:allow_inline_images` is enabled in the
   `:markup` configuration. The configuration is read while this module is
   compiled.
   """

   # Defaults to an empty keyword list so that a missing `:markup` section
   # disables the optional tags instead of making `Keyword.get/2` raise while
   # the module is being compiled.
   @markup Application.get_env(:pleroma, :markup, [])
   @valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], [])

   require HtmlSanitizeEx.Scrubber.Meta
   alias HtmlSanitizeEx.Scrubber.Meta

   Meta.remove_cdata_sections_before_scrub()
   Meta.strip_comments()

   # links
   Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes)

   Meta.allow_tag_with_this_attribute_values("a", "class", [
     "hashtag",
     "u-url",
     "mention",
     "u-url mention",
     "mention u-url"
   ])

   Meta.allow_tag_with_this_attribute_values("a", "rel", [
     "tag",
     "nofollow",
     "noopener",
     "noreferrer"
   ])

   Meta.allow_tag_with_these_attributes("a", ["name", "title"])

   # paragraphs and linebreaks
   Meta.allow_tag_with_these_attributes("br", [])
   Meta.allow_tag_with_these_attributes("p", [])

   # microformats
   Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"])
   Meta.allow_tag_with_these_attributes("span", [])

   # allow inline images for custom emoji
   @allow_inline_images Keyword.get(@markup, :allow_inline_images)

   if @allow_inline_images do
     # restrict img tags to http/https only, because of MediaProxy.
     Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"])

     Meta.allow_tag_with_these_attributes("img", [
       "width",
       "height",
       "title",
       "alt"
     ])
   end

   # Must stay last: everything not allowed above is stripped.
   Meta.strip_everything_not_covered()
 end
 161
 defmodule Pleroma.HTML.Scrubber.Default do
   @moduledoc """
   The default HTML scrubbing policy.

   Links, common inline and block text formatting tags, lists and `span`
   elements are allowed through. Inline images, tables, headings and `font`
   tags are allowed only when the matching `:markup` options
   (`:allow_inline_images`, `:allow_tables`, `:allow_headings`, `:allow_fonts`)
   are enabled; the configuration is read while this module is compiled.
   Everything else is stripped.
   """

   require HtmlSanitizeEx.Scrubber.Meta
   alias HtmlSanitizeEx.Scrubber.Meta
   # credo:disable-for-previous-line
   # No idea how to fix this one…

   # Defaults to an empty keyword list so that a missing `:markup` section
   # disables the optional tags instead of making `Keyword.get/2` raise while
   # the module is being compiled.
   @markup Application.get_env(:pleroma, :markup, [])
   @valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], [])

   Meta.remove_cdata_sections_before_scrub()
   Meta.strip_comments()

   # links
   Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes)

   Meta.allow_tag_with_this_attribute_values("a", "class", [
     "hashtag",
     "u-url",
     "mention",
     "u-url mention",
     "mention u-url"
   ])

   Meta.allow_tag_with_this_attribute_values("a", "rel", [
     "tag",
     "nofollow",
     "noopener",
     "noreferrer"
   ])

   Meta.allow_tag_with_these_attributes("a", ["name", "title"])

   Meta.allow_tag_with_these_attributes("abbr", ["title"])

   # text formatting, lists and block elements (no attributes allowed)
   Meta.allow_tag_with_these_attributes("b", [])
   Meta.allow_tag_with_these_attributes("blockquote", [])
   Meta.allow_tag_with_these_attributes("br", [])
   Meta.allow_tag_with_these_attributes("code", [])
   Meta.allow_tag_with_these_attributes("del", [])
   Meta.allow_tag_with_these_attributes("em", [])
   Meta.allow_tag_with_these_attributes("i", [])
   Meta.allow_tag_with_these_attributes("li", [])
   Meta.allow_tag_with_these_attributes("ol", [])
   Meta.allow_tag_with_these_attributes("p", [])
   Meta.allow_tag_with_these_attributes("pre", [])
   Meta.allow_tag_with_these_attributes("strong", [])
   Meta.allow_tag_with_these_attributes("u", [])
   Meta.allow_tag_with_these_attributes("ul", [])

   # microformats
   Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"])
   Meta.allow_tag_with_these_attributes("span", [])

   @allow_inline_images Keyword.get(@markup, :allow_inline_images)

   if @allow_inline_images do
     # restrict img tags to http/https only, because of MediaProxy.
     Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"])

     Meta.allow_tag_with_these_attributes("img", [
       "width",
       "height",
       "title",
       "alt"
     ])
   end

   @allow_tables Keyword.get(@markup, :allow_tables)

   if @allow_tables do
     Meta.allow_tag_with_these_attributes("table", [])
     Meta.allow_tag_with_these_attributes("tbody", [])
     Meta.allow_tag_with_these_attributes("td", [])
     Meta.allow_tag_with_these_attributes("th", [])
     Meta.allow_tag_with_these_attributes("thead", [])
     Meta.allow_tag_with_these_attributes("tr", [])
   end

   @allow_headings Keyword.get(@markup, :allow_headings)

   if @allow_headings do
     Meta.allow_tag_with_these_attributes("h1", [])
     Meta.allow_tag_with_these_attributes("h2", [])
     Meta.allow_tag_with_these_attributes("h3", [])
     Meta.allow_tag_with_these_attributes("h4", [])
     Meta.allow_tag_with_these_attributes("h5", [])
   end

   @allow_fonts Keyword.get(@markup, :allow_fonts)

   if @allow_fonts do
     Meta.allow_tag_with_these_attributes("font", ["face"])
   end

   # Must stay last: everything not allowed above is stripped.
   Meta.strip_everything_not_covered()
 end
 258
 defmodule Pleroma.HTML.Transform.MediaProxy do
   @moduledoc "Transforms inline image URIs to use MediaProxy."

   alias Pleroma.Web.MediaProxy

   @doc "Returns `html` unchanged; this transform does no pre-processing."
   def before_scrub(html), do: html

   @doc """
   Rewrites the `src` attribute of an `img` tag through `MediaProxy.url/1` when
   its value starts with `"http"`. Any other attribute is returned as given.
   """
   def scrub_attribute("img", {"src", "http" <> _rest = url}) do
     {"src", MediaProxy.url(url)}
   end

   def scrub_attribute(_tag, attribute), do: attribute

   @doc """
   Transforms a single parsed node.

   `img` nodes get each attribute passed through `scrub_attribute/2`, dropping
   any attribute for which it returns `nil`. Comment nodes become an empty
   string, other three-element nodes are returned as they are, other
   two-element tuples are replaced by their second element, and anything else
   is returned unchanged.
   """
   def scrub({"img", attributes, children}) do
     rewritten =
       Enum.flat_map(attributes, fn attribute ->
         case scrub_attribute("img", attribute) do
           nil -> []
           kept -> [kept]
         end
       end)

     {"img", rewritten, children}
   end

   def scrub({:comment, _children}), do: ""

   def scrub({_tag, _attributes, _children} = node), do: node
   def scrub({_tag, children}), do: children
   def scrub(text), do: text
 end