Merge branch 'delete_orphaned_activities' into develop
authorFloatingGhost <hannah@coffee-and-dreams.uk>
Sun, 26 Feb 2023 22:11:30 +0000 (22:11 +0000)
committerFloatingGhost <hannah@coffee-and-dreams.uk>
Sun, 26 Feb 2023 22:11:30 +0000 (22:11 +0000)
CHANGELOG.md
docs/docs/administration/CLI_tasks/database.md
lib/mix/tasks/pleroma/database.ex
test/mix/tasks/pleroma/database_test.exs

index a611b3c069f5bc5d200afdf651e2d91cc11c0b95..ef288366e2ae8239e7a500f8bec548222ae84055 100644 (file)
@@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Changed
 - Restoring the database from a dump now goes much faster without need for work-arounds
 
+### Added
+- Extend the mix task `prune_objects` with the option `--prune-orphaned-activities` to also prune orphaned activities, allowing even more database space to be reclaimed
+
 ## 2023.02
 
 ### Added
index 915139cf7db0e5829551fba1ee78b140556dcb8b..3d7424d1c38ce9d85078a2ff0a9feabefbbf4393 100644 (file)
@@ -21,7 +21,6 @@ Replaces embedded objects with references to them in the `objects` table. Only n
     mix pleroma.database remove_embedded_objects [option ...]
     ```
 
-
 ### Options
 - `--vacuum` - run `VACUUM FULL` after the embedded objects are replaced with their references
 
@@ -29,8 +28,11 @@ Replaces embedded objects with references to them in the `objects` table. Only n
 
 This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases.
 
+!!! note
+    The disk space will only be reclaimed after a proper vacuum. By default PostgreSQL does this for you on a regular basis, but if your instance has been running for a long time and many rows have been deleted, it may be advantageous to use `VACUUM FULL` (e.g. by using the `--vacuum` option).
+
 !!! danger
-    The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free.
+    You may run out of disk space during the execution of the task or vacuuming if you don't have about a third of the database size free. Vacuum causes a substantial increase in I/O traffic, and may lead to a degraded experience while it is running.
 
 === "OTP"
 
@@ -46,9 +48,10 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
 
 ### Options
 
-- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...)
-- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote
-- `--vacuum` - run `VACUUM FULL` after the objects are pruned
+- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity).
+- `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote.
+- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... Pruning them can significantly reduce the database size.
+- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning.
 
 ## Create a conversation for all existing DMs
 
@@ -96,6 +99,9 @@ Can be safely re-run
 
 ## Vacuum the database
 
+!!! note
+    By default PostgreSQL has an autovacuum daemon running. While the tasks described here can help in some cases, they shouldn't be needed on a regular basis. See [the PostgreSQL docs on vacuuming](https://www.postgresql.org/docs/current/sql-vacuum.html) for more information on this.
+
 ### Analyze
 
 Running an `analyze` vacuum job can improve performance by updating statistics used by the query planner. **It is safe to cancel this.**
index be59e2271e6b748aacb2da44d71138a07ee2eb00..726a22d411a81e65fb52b4c564f8f6ff7e824ef3 100644 (file)
@@ -69,7 +69,8 @@ defmodule Mix.Tasks.Pleroma.Database do
         strict: [
           vacuum: :boolean,
           keep_threads: :boolean,
-          keep_non_public: :boolean
+          keep_non_public: :boolean,
+          prune_orphaned_activities: :boolean
         ]
       )
 
@@ -94,6 +95,21 @@ defmodule Mix.Tasks.Pleroma.Database do
         log_message
       end
 
+    log_message =
+      if Keyword.get(options, :prune_orphaned_activities) do
+        log_message <> ", pruning orphaned activities"
+      else
+        log_message
+      end
+
+    log_message =
+      if Keyword.get(options, :vacuum) do
+        log_message <>
+          ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
+      else
+        log_message
+      end
+
     Logger.info(log_message)
 
     if Keyword.get(options, :keep_threads) do
@@ -155,14 +171,49 @@ defmodule Mix.Tasks.Pleroma.Database do
     end
     |> Repo.delete_all(timeout: :infinity)
 
-    prune_hashtags_query = """
+    if Keyword.get(options, :prune_orphaned_activities) do
+      # Prune activities who link to a single object
+      """
+      delete from public.activities
+      where id in (
+        select a.id from public.activities a
+        left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
+        left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
+        left join public.users u  on a.data ->> 'object' = u.ap_id
+        where not a.local
+        and jsonb_typeof(a."data" -> 'object') = 'string'
+        and o.id is null
+        and a2.id is null
+        and u.id is null
+      )
+      """
+      |> Repo.query([], timeout: :infinity)
+
+      # Prune activities who link to an array of objects
+      """
+      delete from public.activities
+      where id in (
+        select a.id from public.activities a
+        join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
+        left join public.objects o on j.value = o.data ->> 'id'
+        left join public.activities a2 on j.value = a2.data ->> 'id'
+        left join public.users u  on j.value = u.ap_id
+        group by a.id
+        having max(o.data ->> 'id') is null
+        and max(a2.data ->> 'id') is null
+        and max(u.ap_id) is null
+      )
+      """
+      |> Repo.query([], timeout: :infinity)
+    end
+
+    """
     DELETE FROM hashtags AS ht
     WHERE NOT EXISTS (
       SELECT 1 FROM hashtags_objects hto
       WHERE ht.id = hto.hashtag_id)
     """
-
-    Repo.query(prune_hashtags_query)
+    |> Repo.query()
 
     if Keyword.get(options, :vacuum) do
       Maintenance.vacuum("full")
index 447a4404e26620061098885467a4e8ba1bd29765..9edb2c115b18be950e69d60add3f3c08b9a12459 100644 (file)
@@ -353,6 +353,186 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
 
       assert length(Repo.all(Object)) == 1
     end
+
+    test "We don't have unexpected tables which may contain objects that are referenced by activities" do
+      # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
+      # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we 
+      # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities.
+      # So when someone adds (or removes) a table, this test will fail.
+      # Either the table contains objects which can be referenced from the activities table
+      # => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object.
+      # Or it doesn't contain objects which can be referenced from the activities table
+      # => in that case you can add/remove the table to/from this (sorted) list.
+
+      assert Repo.query!(
+               "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';"
+             ).rows
+             |> Enum.sort() == [
+               ["activities"],
+               ["announcement_read_relationships"],
+               ["announcements"],
+               ["apps"],
+               ["backups"],
+               ["bookmarks"],
+               ["chat_message_references"],
+               ["chats"],
+               ["config"],
+               ["conversation_participation_recipient_ships"],
+               ["conversation_participations"],
+               ["conversations"],
+               ["counter_cache"],
+               ["data_migration_failed_ids"],
+               ["data_migrations"],
+               ["deliveries"],
+               ["filters"],
+               ["following_relationships"],
+               ["hashtags"],
+               ["hashtags_objects"],
+               ["instances"],
+               ["lists"],
+               ["markers"],
+               ["mfa_tokens"],
+               ["moderation_log"],
+               ["notifications"],
+               ["oauth_authorizations"],
+               ["oauth_tokens"],
+               ["oban_jobs"],
+               ["oban_peers"],
+               ["objects"],
+               ["password_reset_tokens"],
+               ["push_subscriptions"],
+               ["registrations"],
+               ["report_notes"],
+               ["scheduled_activities"],
+               ["schema_migrations"],
+               ["thread_mutes"],
+               ["user_follows_hashtag"],
+               ["user_frontend_setting_profiles"],
+               ["user_invite_tokens"],
+               ["user_notes"],
+               ["user_relationships"],
+               ["users"]
+             ]
+    end
+
+    test "it prunes orphaned activities with the --prune-orphaned-activities" do
+      %Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"}
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_with_activity",
+          "object" => "remote_activity_with_object"
+        }
+      })
+      |> Repo.insert()
+
+      %User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{"id" => "remote_activity_with_actor", "object" => "actor"}
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_without_existing_referenced_object",
+          "object" => "non_existing"
+        }
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: true,
+        data: %{"id" => "local_activity_with_actor", "object" => "non_existing"}
+      })
+      |> Repo.insert()
+
+      assert length(Repo.all(Activity)) == 5
+      Mix.Tasks.Pleroma.Database.run(["prune_objects"])
+      assert length(Repo.all(Activity)) == 5
+      Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
+      activities = Repo.all(Activity)
+
+      assert "remote_activity_without_existing_referenced_object" not in Enum.map(
+               activities,
+               fn a -> a.data["id"] end
+             )
+
+      assert length(activities) == 4
+    end
+
+    test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do
+      %Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert()
+      %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_existing_object",
+          "object" => ["non_ existing_object", "existing_object"]
+        }
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_existing_actor",
+          "object" => ["non_ existing_object", "existing_actor"]
+        }
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_existing_activity",
+          "object" => ["non_ existing_object", "remote_activity_existing_actor"]
+        }
+      })
+      |> Repo.insert()
+
+      %Activity{}
+      |> Map.merge(%{
+        local: false,
+        data: %{
+          "id" => "remote_activity_without_existing_referenced_object",
+          "object" => ["owo", "whats_this"]
+        }
+      })
+      |> Repo.insert()
+
+      assert length(Repo.all(Activity)) == 4
+      Mix.Tasks.Pleroma.Database.run(["prune_objects"])
+      assert length(Repo.all(Activity)) == 4
+      Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
+      activities = Repo.all(Activity)
+      assert length(activities) == 3
+
+      assert "remote_activity_without_existing_referenced_object" not in Enum.map(
+               activities,
+               fn a -> a.data["id"] end
+             )
+
+      assert length(activities) == 3
+    end
   end
 
   describe "running update_users_following_followers_counts" do