Add documentation for ES search
authorFloatingGhost <hannah@coffee-and-dreams.uk>
Thu, 30 Jun 2022 16:36:57 +0000 (17:36 +0100)
committerFloatingGhost <hannah@coffee-and-dreams.uk>
Thu, 30 Jun 2022 16:36:57 +0000 (17:36 +0100)
config/description.exs
docs/configuration/search.md
lib/mix/tasks/pleroma/search/elasticsearch.ex [new file with mode: 0644]
lib/mix/tasks/pleroma/search/meilisearch.ex [new file with mode: 0644]

index 2d068556f7f78480cd4a326a8dec67998836f91f..ac3faa346e9617975bc63738d2b203ced246ae97 100644 (file)
@@ -3472,5 +3472,90 @@ config :pleroma, :config_description, [
         suggestion: [100_000]
       }
     ]
+  },
+  %{
+    group: :pleroma,
+    key: Pleroma.Search.Elasticsearch.Cluster,
+    type: :group,
+    description: "Elasticsearch settings.",
+    children: [
+      %{
+        key: :url,
+        type: :string,
+        description: "Elasticsearch URL.",
+        suggestion: ["http://127.0.0.1:9200/"]
+      },
+      %{
+        key: :username,
+        type: :string,
+        description: "Username to connect to ES. Set to nil if your cluster is unauthenticated.",
+        suggestion: ["elastic"]
+      },
+      %{
+        key: :password,
+        type: :string,
+        description: "Password to connect to ES. Set to nil if your cluster is unauthenticated.",
+        suggestion: ["changeme"]
+      },
+      %{
+        key: :api,
+        type: :module,
+        description:
+          "The API module used by Elasticsearch. Should always be Elasticsearch.API.HTTP",
+        suggestion: [Elasticsearch.API.HTTP]
+      },
+      %{
+        key: :json_library,
+        type: :module,
+        description:
+          "The JSON module used to encode/decode when communicating with Elasticsearch",
+        suggestion: [Jason]
+      },
+      %{
+        key: :indexes,
+        type: :map,
+        description: "The indices to set up in Elasticsearch",
+        children: [
+          %{
+            key: :activities,
+            type: :map,
+            description: "Config for the index to use for activities",
+            children: [
+              %{
+                key: :settings,
+                type: :string,
+                description:
+                  "Path to the file containing index settings for the activities index. Should contain a mapping.",
+                suggestion: ["priv/es-mappings/activity.json"]
+              },
+              %{
+                key: :store,
+                type: :module,
+                description: "The internal store module",
+                suggestion: [Pleroma.Search.Elasticsearch.Store]
+              },
+              %{
+                key: :sources,
+                type: {:list, :module},
+                description: "The internal types to use for this index",
+                suggestion: [[Pleroma.Activity]]
+              },
+              %{
+                key: :bulk_page_size,
+                type: :int,
+                description: "Size for bulk put requests, mostly used on building the index",
+                suggestion: [5000]
+              },
+              %{
+                key: :bulk_wait_interval,
+                type: :int,
+                description: "Time to wait between bulk put requests (in ms)",
+                suggestion: [15_000]
+              }
+            ]
+          }
+        ]
+      }
+    ]
   }
 ]
index f131948a72f1dda86a6ef4d7d63e27fa875d21bd..7c1093ab9c2d9828cb30debe69fa7f0c18f96a03 100644 (file)
@@ -121,3 +121,43 @@ This will clear **all** the posts from the search index. Note, that deleted post
 there is no need to actually clear the whole index, unless you want **all** of it gone. That said, the index does not hold any information
 that cannot be re-created from the database, it should also generally be a lot smaller than the size of your database. Still, the size
 depends on the amount of text in posts.
+
+## Elasticsearch
+
+As with Meilisearch, this can be rather memory-hungry, but it is very good at what it does.
+
+To use [elasticsearch](https://www.elastic.co/), set the search module to `Pleroma.Search.Elasticsearch`:
+
+> config :pleroma, Pleroma.Search, module: Pleroma.Search.Elasticsearch
+
+You then need to set the URL and authentication credentials if relevant.
+
+> config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
+>    url: "http://127.0.0.1:9200/",
+>    username: "elastic",
+>    password: "changeme"
+
+### Initial indexing
+
+After setting up the configuration, you'll want to index all of your already existing posts. Only public posts are indexed. You'll only
+have to do it one time, but it might take a while, depending on the amount of posts your instance has seen.
+
+The sequence of actions is as follows:
+
+1. First, change the configuration to use `Pleroma.Search.Elasticsearch` as the search backend
+2. Restart your instance, at this point it can be used while the search indexing is running, though search won't return anything
+3. Start the initial indexing process (as described below with `index`),
+   and wait until the task says it sent everything from the database to index
+4. Wait until the index task exits
+
+To start the initial indexing, run the appropriate command for your installation type:
+
+=== "OTP"
+```sh
+./bin/pleroma_ctl search.elasticsearch index activities --cluster Pleroma.Search.Elasticsearch.Cluster
+```
+
+=== "From Source"
+```sh
+mix elasticsearch.build activities --cluster Pleroma.Search.Elasticsearch.Cluster
+```
\ No newline at end of file
diff --git a/lib/mix/tasks/pleroma/search/elasticsearch.ex b/lib/mix/tasks/pleroma/search/elasticsearch.ex
new file mode 100644 (file)
index 0000000..1d7d7a2
--- /dev/null
@@ -0,0 +1,9 @@
+defmodule Mix.Tasks.Pleroma.Search.Elasticsearch do
+  alias Mix.Tasks.Elasticsearch.Build
+  import Mix.Pleroma
+
+  def run(["index" | args]) do
+    start_pleroma()
+    Build.run(args)
+  end
+end
diff --git a/lib/mix/tasks/pleroma/search/meilisearch.ex b/lib/mix/tasks/pleroma/search/meilisearch.ex
new file mode 100644 (file)
index 0000000..d4a83c3
--- /dev/null
@@ -0,0 +1,144 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
+  require Pleroma.Constants
+
+  import Mix.Pleroma
+  import Ecto.Query
+
+  import Pleroma.Search.Meilisearch,
+    only: [meili_post: 2, meili_put: 2, meili_get: 1, meili_delete!: 1]
+
+  def run(["index"]) do
+    start_pleroma()
+
+    meili_version =
+      (
+        {:ok, result} = meili_get("/version")
+
+        result["pkgVersion"]
+      )
+
+    # The ranking rule syntax was changed but nothing about that is mentioned in the changelog
+    if not Version.match?(meili_version, ">= 0.25.0") do
+      raise "Meilisearch <0.25.0 not supported"
+    end
+
+    {:ok, _} =
+      meili_post(
+        "/indexes/objects/settings/ranking-rules",
+        [
+          "published:desc",
+          "words",
+          "exactness",
+          "proximity",
+          "typo",
+          "attribute",
+          "sort"
+        ]
+      )
+
+    {:ok, _} =
+      meili_post(
+        "/indexes/objects/settings/searchable-attributes",
+        [
+          "content"
+        ]
+      )
+
+    IO.puts("Created indices. Starting to insert posts.")
+
+    chunk_size = Pleroma.Config.get([Pleroma.Search.Meilisearch, :initial_indexing_chunk_size])
+
+    Pleroma.Repo.transaction(
+      fn ->
+        query =
+          from(Pleroma.Object,
+            # Only index public and unlisted posts which are notes and have some text
+            where:
+              fragment("data->>'type' = 'Note'") and
+                (fragment("data->'to' \\? ?", ^Pleroma.Constants.as_public()) or
+                   fragment("data->'cc' \\? ?", ^Pleroma.Constants.as_public())),
+            order_by: [desc: fragment("data->'published'")]
+          )
+
+        count = query |> Pleroma.Repo.aggregate(:count, :data)
+        IO.puts("Entries to index: #{count}")
+
+        Pleroma.Repo.stream(
+          query,
+          timeout: :infinity
+        )
+        |> Stream.map(&Pleroma.Search.Meilisearch.object_to_search_data/1)
+        |> Stream.filter(fn o -> not is_nil(o) end)
+        |> Stream.chunk_every(chunk_size)
+        |> Stream.transform(0, fn objects, acc ->
+          new_acc = acc + Enum.count(objects)
+
+          # Reset to the beginning of the line and rewrite it
+          IO.write("\r")
+          IO.write("Indexed #{new_acc} entries")
+
+          {[objects], new_acc}
+        end)
+        |> Stream.each(fn objects ->
+          result =
+            meili_put(
+              "/indexes/objects/documents",
+              objects
+            )
+
+          with {:ok, res} <- result do
+            if not Map.has_key?(res, "uid") do
+              IO.puts("\nFailed to index: #{inspect(result)}")
+            end
+          else
+            e -> IO.puts("\nFailed to index due to network error: #{inspect(e)}")
+          end
+        end)
+        |> Stream.run()
+      end,
+      timeout: :infinity
+    )
+
+    IO.write("\n")
+  end
+
+  def run(["clear"]) do
+    start_pleroma()
+
+    meili_delete!("/indexes/objects/documents")
+  end
+
+  def run(["show-keys", master_key]) do
+    start_pleroma()
+
+    endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url])
+
+    {:ok, result} =
+      Pleroma.HTTP.get(
+        Path.join(endpoint, "/keys"),
+        [{"Authorization", "Bearer #{master_key}"}]
+      )
+
+    decoded = Jason.decode!(result.body)
+
+    if decoded["results"] do
+      Enum.each(decoded["results"], fn %{"description" => desc, "key" => key} ->
+        IO.puts("#{desc}: #{key}")
+      end)
+    else
+      IO.puts("Error fetching the keys, check the master key is correct: #{inspect(decoded)}")
+    end
+  end
+
+  def run(["stats"]) do
+    start_pleroma()
+
+    {:ok, result} = meili_get("/indexes/objects/stats")
+    IO.puts("Number of entries: #{result["numberOfDocuments"]}")
+    IO.puts("Indexing? #{result["isIndexing"]}")
+  end
+end