Use content instead of source and scrub it
[akkoma] / lib / mix / tasks / pleroma / search / meilisearch.ex
1 # Pleroma: A lightweight social networking server
2 # Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/>
3 # SPDX-License-Identifier: AGPL-3.0-only
4
defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
  @moduledoc """
  Mix task that maintains the Meilisearch `objects` index.

  Supported invocations (one `run/1` clause each):

    * `index` - sets the ranking rules of the `objects` index, then uploads
      every public `Note` object that has non-empty content.
    * `clear` - deletes the `objects` index.

  The Meilisearch base URL is read from the
  `[Pleroma.Search.Meilisearch, :url]` configuration key.
  """

  require Logger
  require Pleroma.Constants

  import Mix.Pleroma
  import Ecto.Query

  # Number of database rows sent to Meilisearch per HTTP request.
  @chunk_size 100_000

  # Every request body built here is JSON, so say so explicitly.
  @json_headers [{"Content-Type", "application/json"}]

  @doc """
  Runs the task.

  `["index"]` configures the ranking rules and streams all matching objects
  to Meilisearch in chunks, printing a running total after each accepted
  chunk. `["clear"]` deletes the `objects` index.

  Both clauses raise a `MatchError` when the HTTP request itself fails
  (anything other than `{:ok, _}`).
  """
  def run(["index"]) do
    start_pleroma()

    endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url])

    {:ok, _} =
      Pleroma.HTTP.post(
        "#{endpoint}/indexes/objects/settings/ranking-rules",
        Jason.encode!([
          "desc(published)",
          "typo",
          "words",
          "proximity",
          "attribute",
          "wordsPosition",
          "exactness"
        ]),
        @json_headers
      )

    # Repo.stream/2 must be enumerated inside a transaction.
    Pleroma.Repo.transaction(
      fn ->
        public_notes_query()
        |> Pleroma.Repo.stream(timeout: :infinity)
        |> Stream.chunk_every(@chunk_size)
        |> Enum.reduce(0, fn objects, indexed ->
          index_chunk(endpoint, objects, indexed)
        end)

        :ok
      end,
      timeout: :infinity
    )
  end

  def run(["clear"]) do
    start_pleroma()

    endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url])

    {:ok, _} = Pleroma.HTTP.request(:delete, "#{endpoint}/indexes/objects", "", [], [])
  end

  # Builds the query selecting the objects to index, newest first.
  defp public_notes_query do
    from(Pleroma.Object,
      # Only index public posts which are notes and have some text
      where:
        fragment("data->>'type' = 'Note'") and
          fragment("LENGTH(data->>'content') > 0") and
          fragment("data->'to' \\? ?", ^Pleroma.Constants.as_public()),
      order_by: [desc: fragment("data->'published'")]
    )
  end

  # Converts one chunk of objects to documents, uploads them and returns the
  # updated running total of indexed entries.
  defp index_chunk(endpoint, objects, indexed) do
    documents = Enum.flat_map(objects, &to_documents/1)

    skipped = length(objects) - length(documents)

    if skipped > 0 do
      IO.puts("Skipped #{skipped} entries without a valid published date or content")
    end

    # Only advance (and report) the counter once Meilisearch accepted the chunk.
    case upload_documents(endpoint, documents) do
      :ok ->
        total = indexed + length(documents)
        IO.puts("Indexed #{total} entries")
        total

      :error ->
        indexed
    end
  end

  # Returns a one-element list holding the search document for `object`, or an
  # empty list when the object cannot be converted. A single bad row must not
  # abort the whole indexing run.
  defp to_documents(%{id: id, data: data}) do
    with published when is_binary(published) <- data["published"],
         {:ok, published_at, _offset} <- DateTime.from_iso8601(published),
         {:ok, content} <- FastSanitize.strip_tags(data["content"]) do
      [
        %{
          id: id,
          content: content,
          ap: data["id"],
          published: DateTime.to_unix(published_at)
        }
      ]
    else
      _ -> []
    end
  end

  # Posts `documents` to the index. Returns `:ok` when the response body is a
  # JSON object containing "updateId", `:error` (after printing the response)
  # otherwise.
  defp upload_documents(_endpoint, []), do: :ok

  defp upload_documents(endpoint, documents) do
    {:ok, result} =
      Pleroma.HTTP.post(
        "#{endpoint}/indexes/objects/documents",
        Jason.encode!(documents),
        @json_headers
      )

    case Jason.decode(result.body) do
      {:ok, %{"updateId" => _}} ->
        :ok

      _ ->
        # The response is not a string; it has to go through inspect/1.
        IO.puts("Failed to index: #{inspect(result)}")
        :error
    end
  end
end