add the rich media ttl based on image exp time
authorSachin Joshi <satchin.joshi@gmail.com>
Tue, 16 Jul 2019 16:52:36 +0000 (22:37 +0545)
committerSachin Joshi <satchin.joshi@gmail.com>
Tue, 16 Jul 2019 18:35:34 +0000 (00:20 +0545)
CHANGELOG.md
config/config.exs
docs/config/howto_set_richmedia_cache_ttl_based_on_image.md [new file with mode: 0644]
lib/pleroma/web/rich_media/parser.ex
lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex [new file with mode: 0644]
test/fixtures/rich_media/amz.html [new file with mode: 0644]
test/web/rich_media/aws_signed_url_test.exs [new file with mode: 0644]

index f3630a1c575ba23a0212be67e6df21bc1c9255b2..4e58b0a9f407de1966ff307a250008741d1aa89a 100644 (file)
@@ -45,6 +45,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Configuration: Filter.AnonymizeFilename added ability to retain file extension with custom text
 - Admin API: changed json structure for saving config settings.
 - RichMedia: parsers and their order are configured in `rich_media` config.
+- RichMedia: add the rich media ttl based on image expiration time.
 
 ## [1.0.1] - 2019-07-14
 ### Security
index 7d539f994fdd43e596135b8064e539439ae9c960..aa5bd0da9564d6705aedc2bc9b4b74320f6684f2 100644 (file)
@@ -344,7 +344,8 @@ config :pleroma, :rich_media,
     Pleroma.Web.RichMedia.Parsers.TwitterCard,
     Pleroma.Web.RichMedia.Parsers.OGP,
     Pleroma.Web.RichMedia.Parsers.OEmbed
-  ]
+  ],
+  ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl]
 
 config :pleroma, :media_proxy,
   enabled: false,
diff --git a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md
new file mode 100644 (file)
index 0000000..489f9ec
--- /dev/null
@@ -0,0 +1,32 @@
+# How to set rich media cache ttl based on image ttl
+## Explanation
+
+Rich media entries are cached without a ttl, but a rich media entry may contain an image url that expires, such as an AWS signed url.
+In such cases the old (expired) image url is returned from the rich media cache.
+
+To avoid this situation we can define a module that sets the cache ttl based on the image.
+
+The module must have a `run` function and it should be registered in the config.
+
+### Example
+
+```exs
+defmodule MyModule do
+  def run(data, url) do
+    image_url = Map.get(data, :image)
+    # do some parsing in the url and get the ttl of the image
+    # ttl is unix time
+    ttl = parse_ttl_from_url(image_url)  
+    Cachex.expire_at(:rich_media_cache, url, ttl * 1000)
+  end
+end
+```
+
+And update the config
+
+```exs
+config :pleroma, :rich_media,
+  ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl, MyModule]
+```
+
+> For reference, there is a ttl setter for AWS signed URLs, `Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl`, which is enabled by default.
index 0d25233388a67ff977bbee8622bab2393ab944e9..ba8dc6f2a4b0812a9d3a715ce36deefdd921b215 100644 (file)
@@ -24,6 +24,7 @@ defmodule Pleroma.Web.RichMedia.Parser do
         Cachex.fetch!(:rich_media_cache, url, fn _ ->
           {:commit, parse_url(url)}
         end)
+        |> set_ttl_based_on_image(url)
       rescue
         e ->
           {:error, "Cachex error: #{inspect(e)}"}
@@ -31,6 +32,46 @@ defmodule Pleroma.Web.RichMedia.Parser do
     end
   end
 
+  @doc """
+  Sets the rich media cache ttl based on the expiration time of the image.
+
+  For an `{:ok, data}` result whose cache entry has no ttl yet, every module
+  listed under `ttl_setters` gets its `run(data, url)` called. Any other value
+  is returned untouched.
+
+  ## Example
+
+      defmodule MyModule do
+        def run(data, url) do
+          image_url = Map.get(data, :image)
+          # do some parsing in the url and get the ttl of the image
+          # ttl is unix time (seconds), the cache is given milliseconds
+          ttl = parse_ttl_from_url(image_url)
+          Cachex.expire_at(:rich_media_cache, url, ttl * 1000)
+        end
+      end
+
+  Define the module in the config
+
+      config :pleroma, :rich_media,
+        ttl_setters: [MyModule]
+  """
+  def set_ttl_based_on_image({:ok, data}, url) do
+    # `{:ok, nil}` means no expiration is set on the entry, so the setters
+    # only run until one of them has given the entry a ttl.
+    with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url) do
+      # `List.wrap/1` keeps an unset (`nil`) `ttl_setters` key from raising.
+      [:rich_media, :ttl_setters]
+      |> Pleroma.Config.get()
+      |> List.wrap()
+      |> Enum.each(& &1.run(data, url))
+    end
+
+    {:ok, data}
+  end
+
+  def set_ttl_based_on_image(data, _url), do: data
+
   defp parse_url(url) do
     try do
       {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
diff --git a/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex
new file mode 100644 (file)
index 0000000..d571079
--- /dev/null
@@ -0,0 +1,54 @@
+defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
+  @moduledoc """
+  Rich media ttl setter for images that are served through AWS signed urls.
+
+  The entry expires `X-Amz-Expires` seconds after the url's `X-Amz-Date`.
+  """
+
+  @doc """
+  Expires the `url` entry of `:rich_media_cache` together with the signed url
+  found under the `:image` key of `data`.
+
+  Returns the result of `Cachex.expire_at/3`, or `nil` when the image is not an
+  AWS signed url or its date / expiry cannot be read.
+  """
+  def run(data, url) do
+    with image when is_binary(image) <- Map.get(data, :image),
+         %URI{} = uri <- URI.parse(image),
+         true <- aws_signed_url?(uri),
+         {:ok, expires_at} <- uri.query |> query_params() |> expiration_timestamp() do
+      # `expires_at` is in seconds, it is passed to Cachex in milliseconds
+      Cachex.expire_at(:rich_media_cache, url, expires_at * 1000)
+    else
+      _ -> nil
+    end
+  end
+
+  # `URI.parse/1` leaves `host` / `query` as `nil` for urls without them (e.g.
+  # a relative image path), so both are checked before `String.contains?/2`.
+  defp aws_signed_url?(%URI{host: host, query: query}) do
+    is_binary(host) and String.contains?(host, "amazonaws.com") and
+      is_binary(query) and String.contains?(query, "X-Amz-Expires")
+  end
+
+  # Turns "k=v&k2=v2" into a map. Only the first `=` of a pair separates the key
+  # from the value, and pairs without a `=` are dropped instead of raising.
+  defp query_params(query) do
+    for pair <- String.split(query, "&"),
+        [key, value] <- [String.split(pair, "=", parts: 2)],
+        into: %{},
+        do: {key, value}
+  end
+
+  # Unix time (in seconds) at which the signed url stops being valid.
+  defp expiration_timestamp(%{"X-Amz-Date" => date, "X-Amz-Expires" => expires}) do
+    with {:ok, signed_at} <- Timex.parse(date, "{ISO:Basic:Z}"),
+         {seconds, ""} <- Integer.parse(expires) do
+      {:ok, Timex.to_unix(signed_at) + seconds}
+    else
+      _ -> :error
+    end
+  end
+
+  defp expiration_timestamp(_params), do: :error
+end
diff --git a/test/fixtures/rich_media/amz.html b/test/fixtures/rich_media/amz.html
new file mode 100644 (file)
index 0000000..d4f8bd1
--- /dev/null
@@ -0,0 +1,5 @@
+<meta name="twitter:card" content="summary" />
+<meta name="twitter:site" content="@flickr" />
+<meta name="twitter:title" content="Small Island Developing States Photo Submission" />
+<meta name="twitter:description" content="View the album on Flickr." />
+<meta name="twitter:image" content="https://pleroma.s3.ap-southeast-1.amazonaws.com/sachin%20%281%29%20_a%20-%25%2Aasdasd%20BNN%20bnnn%20.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIBLWWK6RGDQXDLJQ%2F20190716%2Fap-southeast-1%2Fs3%2Faws4_request&X-Amz-Date=20190716T175105Z&X-Amz-Expires=300000&X-Amz-Signature=04ffd6b98634f4b1bbabc62e0fac4879093cd54a6eed24fe8eb38e8369526bbf&X-Amz-SignedHeaders=host" />
diff --git a/test/web/rich_media/aws_signed_url_test.exs b/test/web/rich_media/aws_signed_url_test.exs
new file mode 100644 (file)
index 0000000..75bf6c6
--- /dev/null
@@ -0,0 +1,37 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2019 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.TTL.AwsSignedUrlTest do
+  use ExUnit.Case, async: true
+
+  alias Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl
+
+  test "amazon signed url is parsed and correct ttl is set for rich media" do
+    page_url = "https://pleroma.social/amz"
+    # lifetime of the signature, in seconds
+    expires_in = 30
+
+    # the url is signed "now", so the entry should expire `expires_in` seconds from now
+    {:ok, signed_at} =
+      Timex.format(DateTime.truncate(Timex.now(), :second), "{ISO:Basic:Z}")
+
+    image_url =
+      "https://pleroma.s3.ap-southeast-1.amazonaws.com/sachin%20%281%29%20_a%20-%25%2Aasdasd%20BNN%20bnnn%20.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIBLWWK6RGDQXDLJQ%2F20190716%2Fap-southeast-1%2Fs3%2Faws4_request&X-Amz-Date=#{signed_at}&X-Amz-Expires=#{expires_in}&X-Amz-Signature=04ffd6b98634f4b1bbabc62e0fac4879093cd54a6eed24fe8eb38e8369526bbf&X-Amz-SignedHeaders=host"
+
+    metadata = %{
+      image: image_url,
+      locale: "en_US",
+      site_name: "Pleroma",
+      title: "PLeroma",
+      url: page_url
+    }
+
+    Cachex.put(:rich_media_cache, page_url, metadata)
+    assert {:ok, _} = AwsSignedUrl.run(metadata, page_url)
+    {:ok, remaining_ms} = Cachex.ttl(:rich_media_cache, page_url)
+
+    # writing to and reading from the cache takes time, so allow 1 second of drift
+    assert_in_delta(expires_in * 1000, remaining_ms, 1000)
+  end
+end