[#3213] Hashtag-filtering functions in ActivityPub. Mix task for migrating hashtags...
[akkoma] / lib / pleroma / web / rich_media / parser.ex
1 # Pleroma: A lightweight social networking server
2 # Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
3 # SPDX-License-Identifier: AGPL-3.0-only
4
5 defmodule Pleroma.Web.RichMedia.Parser do
6 require Logger
7
# Returns the parser modules configured under `[:rich_media, :parsers]`.
defp parsers, do: Pleroma.Config.get([:rich_media, :parsers])
11
# A missing URL is rejected up front, before any fetching or caching happens.
def parse(nil) do
  {:error, "No URL provided"}
end
13
14 if Pleroma.Config.get(:env) == :test do
# Test-environment variant: skips the cache and parses the URL directly.
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do
  parse_url(url)
end
17 else
# Non-test variant: looks the card up in the cache (parsing on a miss), then
# adjusts the cache entry's expiration based on the parsed data. Any result
# other than the expected `{:ok, _}` from either step is returned unchanged.
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do
  case get_cached_or_parse(url) do
    {:ok, card} ->
      case set_ttl_based_on_image(card, url) do
        {:ok, _} -> {:ok, card}
        ttl_failure -> ttl_failure
      end

    lookup_failure ->
      lookup_failure
  end
end
25
# Fetches the parse result for `url` from `:rich_media_cache`, parsing the URL
# when there is no cached entry. Both successful and failed parse results are
# committed to the cache.
defp get_cached_or_parse(url) do
  # Errors are logged inside the fetch callback rather than next to the TTL
  # handling below: otherwise every process that was waiting for this card to
  # be generated would log the same error. Ideally the TTL would be set here
  # too (so it is written once instead of once per waiter), but a TTL cannot
  # be set on an entry that does not exist yet, and Cachex does not support
  # returning a TTL from the fetch callback.
  parse_and_commit = fn ->
    outcome = parse_url(url)

    case outcome do
      {:ok, _} -> :ok
      {:error, reason} -> log_error(url, reason)
    end

    {:commit, outcome}
  end

  case Cachex.fetch(:rich_media_cache, url, parse_and_commit) do
    {status, {:ok, _data} = success} when status in [:commit, :ok] ->
      success

    {:commit, {:error, reason} = failure} ->
      # Only a freshly committed failure gets its backoff TTL set.
      set_error_ttl(url, reason)
      failure

    {:ok, {:error, _reason} = failure} ->
      failure

    {:error, cachex_reason} ->
      {:error, {:cachex_error, cachex_reason}}
  end
end
58
# Applies the failure backoff TTL to the cached error entry for `url`.
#
# Oversized bodies and content-type errors are unlikely to change with time,
# so no TTL is set for them. Every other failure expires after
# `[:rich_media, :failure_backoff]` (default 60_000).
defp set_error_ttl(url, reason) do
  case reason do
    :body_too_large ->
      :ok

    {:content_type, _} ->
      :ok

    _ ->
      backoff = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
      Cachex.expire(:rich_media_cache, url, backoff)
      :ok
  end
end
70
# Logs a rich media failure for `url`. Invalid or incomplete metadata is
# logged at debug level; every other reason is logged as a warning.
defp log_error(url, reason) do
  case reason do
    {:invalid_metadata, data} ->
      Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)

    _ ->
      Logger.warn(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
  end
end
78 end
79
@doc """
Sets the expiration of the cached rich media entry for `url` based on the
expiration time of its image.

The expiration time is obtained from the modules listed under
`[:rich_media, :ttl_setters]`, which adopt the
`Pleroma.Web.RichMedia.Parser.TTL` behaviour. A setter must return
`{:ok, unix_time}` for the expiration to be applied; any other result leaves
the cache entry untouched. The value is multiplied by 1000 before being passed
to `Cachex.expire_at/3`, so it is presumably expected in seconds.

Returns:

  * `{:ok, ttl}` - the expiration was set; `ttl` is the multiplied value
  * `{:error, :no_key}` - there is no cache entry for `url`
  * `{:ok, :noop}` - no setter produced a numeric TTL

## Example

    defmodule MyModule do
      @behaviour Pleroma.Web.RichMedia.Parser.TTL

      def ttl(data, _url) do
        image_url = Map.get(data, :image)
        # parse the image url and return `{:ok, unix_time}`, where
        # `unix_time` is the expiration time of the image
        parse_ttl_from_url(image_url)
      end
    end

Define the module in the config

    config :pleroma, :rich_media,
      ttl_setters: [MyModule]
"""
@spec set_ttl_based_on_image(map(), String.t()) ::
        {:ok, integer() | :noop} | {:error, :no_key}
def set_ttl_based_on_image(data, url) do
  case get_ttl_from_image(data, url) do
    {:ok, ttl} when is_number(ttl) ->
      ttl = ttl * 1000

      case Cachex.expire_at(:rich_media_cache, url, ttl) do
        {:ok, true} -> {:ok, ttl}
        {:ok, false} -> {:error, :no_key}
      end

    _ ->
      # Setter errors and non-numeric TTLs are deliberately not treated as failures.
      {:ok, :noop}
  end
end
118
# Asks each module configured under `[:rich_media, :ttl_setters]` for a TTL,
# in order. The result of the last consulted setter is returned; the first
# setter whose result is not `{:ok, _}` stops the chain and its result is
# returned as-is. With no setters configured the result is `{:ok, nil}`.
defp get_ttl_from_image(data, url) do
  setters = Pleroma.Config.get([:rich_media, :ttl_setters])

  Enum.reduce_while(setters, {:ok, nil}, fn setter, _previous ->
    case setter.ttl(data, url) do
      {:ok, _ttl} = found -> {:cont, found}
      failure -> {:halt, failure}
    end
  end)
end
130
# Fetches `url`, parses the response body as an HTML document and extracts
# rich media metadata from it. The URL itself is stored under the "url" key
# of the metadata. If fetching or HTML parsing does not yield the expected
# `{:ok, _}` shape, that result is returned unchanged.
def parse_url(url) do
  with {:ok, %Tesla.Env{body: body}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
       {:ok, document} <- Floki.parse_document(body) do
    metadata = maybe_parse(document)
    with_url = Map.put(metadata, "url", url)
    cleaned = clean_parsed_data(with_url)

    check_parsed_data(cleaned)
  end
end
141
# Runs the configured parsers over `html` in order and returns the first
# result that is not an empty map. Returns `%{}` when every parser comes back
# empty.
defp maybe_parse(html) do
  Enum.reduce_while(parsers(), %{}, fn parser_module, empty ->
    parsed = parser_module.parse(html, empty)

    if parsed == %{} do
      {:cont, empty}
    else
      {:halt, parsed}
    end
  end)
end
150
# Accepts the parsed metadata only when it has a non-empty binary "title";
# anything else is reported as invalid metadata.
defp check_parsed_data(data) do
  case data do
    %{"title" => title} when is_binary(title) and title != "" ->
      {:ok, data}

    _ ->
      {:error, {:invalid_metadata, data}}
  end
end
159
# Drops every key/value pair that cannot be encoded as JSON on its own, and
# returns the remaining pairs as a map.
defp clean_parsed_data(data) do
  encodable_pairs =
    Enum.filter(data, fn {key, val} ->
      case Jason.encode(%{key => val}) do
        {:ok, _json} -> true
        _ -> false
      end
    end)

  Map.new(encodable_pairs)
end
167 end