prune_objects can prune orphaned activities
authorilja <git@ilja.space>
Sat, 7 Jan 2023 19:52:02 +0000 (20:52 +0100)
committerilja <git@ilja.space>
Sun, 26 Feb 2023 13:41:50 +0000 (14:41 +0100)
We add an option to also prune remote activities whose referenced objects no longer exist.
Right now, we only check activities that reference a single object, not an array or embedded object.

lib/mix/tasks/pleroma/database.ex
test/mix/tasks/pleroma/database_test.exs

index be59e2271e6b748aacb2da44d71138a07ee2eb00..0f428ca0340e6881639bc5d8d5ac88dbfae23945 100644 (file)
@@ -69,7 +69,8 @@ defmodule Mix.Tasks.Pleroma.Database do
         strict: [
           vacuum: :boolean,
           keep_threads: :boolean,
-          keep_non_public: :boolean
+          keep_non_public: :boolean,
+          prune_orphaned_activities: :boolean
         ]
       )
 
@@ -94,6 +95,21 @@ defmodule Mix.Tasks.Pleroma.Database do
         log_message
       end
 
+    # Extend the summary with a note for each optional step that was requested;
+    # the finished line is what gets logged right below.
+    log_message =
+      if options[:prune_orphaned_activities],
+        do: log_message <> ", pruning orphaned activities",
+        else: log_message
+
+    vacuum_note =
+      ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
+
+    log_message =
+      if options[:vacuum],
+        do: log_message <> vacuum_note,
+        else: log_message
+
     Logger.info(log_message)
 
     if Keyword.get(options, :keep_threads) do
@@ -155,6 +171,32 @@ defmodule Mix.Tasks.Pleroma.Database do
     end
     |> Repo.delete_all(timeout: :infinity)
 
+    # Optional step: delete remote activities whose "object" string matches no row in
+    # objects, activities or users. The result of Repo.query/3 is not inspected here, so
+    # a timed-out query would fail silently; like the delete_all above, the statement
+    # therefore runs with `timeout: :infinity` instead of the default query timeout.
+    if Keyword.get(options, :prune_orphaned_activities) do
+      """
+      delete from public.activities
+      where id in (
+      select a.id from public.activities a 
+      left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
+      left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
+      left join public.users u  on a.data ->> 'object' = u.ap_id
+      -- Only clean up remote activities
+      where not a.local
+      -- For now we only focus on activities with direct links to objects
+      --     e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects)
+      and jsonb_typeof(a."data" -> 'object') = 'string'
+      -- Find Activities that don't have existing objects
+      and o.id is null 
+      and a2.id is null
+      and u.id is null
+      )
+      """
+      |> Repo.query([], timeout: :infinity)
+    end
+
     prune_hashtags_query = """
     DELETE FROM hashtags AS ht
     WHERE NOT EXISTS (
index 447a4404e26620061098885467a4e8ba1bd29765..7f5cd91a9c89da36b6812cf914bd27dbdf377e08 100644 (file)
@@ -353,6 +353,134 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
 
       assert length(Repo.all(Object)) == 1
     end
+
+    test "We don't have unexpected tables which can contain objects that are referenced by activities" do
+      # Orphaned activities are detected by looking up what they reference in the
+      # 'objects', 'activities' and 'users' tables. A new table that can also hold
+      # referenced objects would need matching logic in the 'prune_objects' task,
+      # or its activities would be deleted by mistake. This list therefore fails on
+      # any added or removed table: either adapt the task, or, if the table cannot
+      # be referenced from activities, update this (sorted) list.
+
+      tables =
+        Repo.query!(
+          "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';"
+        ).rows
+
+      assert Enum.sort(tables) == [
+               ["activities"],
+               ["announcement_read_relationships"],
+               ["announcements"],
+               ["apps"],
+               ["backups"],
+               ["bookmarks"],
+               ["chat_message_references"],
+               ["chats"],
+               ["config"],
+               ["conversation_participation_recipient_ships"],
+               ["conversation_participations"],
+               ["conversations"],
+               ["counter_cache"],
+               ["data_migration_failed_ids"],
+               ["data_migrations"],
+               ["deliveries"],
+               ["filters"],
+               ["following_relationships"],
+               ["hashtags"],
+               ["hashtags_objects"],
+               ["instances"],
+               ["lists"],
+               ["markers"],
+               ["mfa_tokens"],
+               ["moderation_log"],
+               ["notifications"],
+               ["oauth_authorizations"],
+               ["oauth_tokens"],
+               ["oban_jobs"],
+               ["oban_peers"],
+               ["objects"],
+               ["password_reset_tokens"],
+               ["push_subscriptions"],
+               ["registrations"],
+               ["report_notes"],
+               ["scheduled_activities"],
+               ["schema_migrations"],
+               ["thread_mutes"],
+               ["user_follows_hashtag"],
+               ["user_frontend_setting_profiles"],
+               ["user_invite_tokens"],
+               ["user_notes"],
+               ["user_relationships"],
+               ["users"]
+             ]
+    end
+
+    test "it prunes orphaned activities with the --prune-orphaned-activities" do
+      # Fixture overview (five activities in total):
+      #
+      #   remote_activity_with_object     remote, "object" is an existing Object
+      #   remote_activity_with_activity   remote, "object" is an existing Activity
+      #   remote_activity_with_actor      remote, "object" is an existing Actor
+      #   remote_activity_without_...     remote, "object" exists nowhere (the orphan)
+      #   local_activity_with_actor       local, "object" exists nowhere
+      #
+      # Only the orphan may be deleted, and only when the flag is passed.
+
+      # Remote activity pointing at an existing Object
+      Repo.insert(%Object{data: %{"id" => "object_for_activity"}})
+
+      Repo.insert(%Activity{
+        local: false,
+        data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"}
+      })
+
+      # Remote activity pointing at an existing activity
+      Repo.insert(%Activity{
+        local: false,
+        data: %{
+          "id" => "remote_activity_with_activity",
+          "object" => "remote_activity_with_object"
+        }
+      })
+
+      # Remote activity pointing at an existing Actor
+      Repo.insert(%User{ap_id: "actor"})
+
+      Repo.insert(%Activity{
+        local: false,
+        data: %{"id" => "remote_activity_with_actor", "object" => "actor"}
+      })
+
+      # Remote activity pointing at nothing that exists: the orphan
+      Repo.insert(%Activity{
+        local: false,
+        data: %{
+          "id" => "remote_activity_without_existing_referenced_object",
+          "object" => "non_existing"
+        }
+      })
+
+      # Local activity pointing at nothing that exists; only remote ones get pruned
+      Repo.insert(%Activity{
+        local: true,
+        data: %{"id" => "local_activity_with_actor", "object" => "non_existing"}
+      })
+
+      # Without the flag the task must leave every activity in place, the remote
+      # orphan included.
+      assert length(Repo.all(Activity)) == 5
+      Mix.Tasks.Pleroma.Database.run(["prune_objects"])
+      assert length(Repo.all(Activity)) == 5
+
+      # With the flag exactly one activity disappears, and it is the remote
+      # orphan.
+      Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
+      remaining = Repo.all(Activity)
+      remaining_ids = Enum.map(remaining, fn activity -> activity.data["id"] end)
+
+      refute "remote_activity_without_existing_referenced_object" in remaining_ids
+      assert length(remaining) == 4
+    end
   end
 
   describe "running update_users_following_followers_counts" do