From ede414094f7b196d3ff129b8a23ba461ef80d29f Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Sun, 11 Feb 2024 16:11:52 -0500 Subject: RichMedia refactor Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance was restarted it had to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had image links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. 
Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receive a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache entry with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved. 
--- lib/pleroma/html.ex | 20 ++- lib/pleroma/web/activity_pub/activity_pub.ex | 6 +- lib/pleroma/web/activity_pub/side_effects.ex | 4 +- .../mastodon_api/controllers/status_controller.ex | 7 +- lib/pleroma/web/mastodon_api/views/status_view.ex | 26 ++-- .../views/chat/message_reference_view.ex | 13 +- lib/pleroma/web/rich_media/backfill.ex | 101 +++++++++++++ lib/pleroma/web/rich_media/card.ex | 157 +++++++++++++++++++++ lib/pleroma/web/rich_media/helpers.ex | 70 ++------- lib/pleroma/web/rich_media/parser.ex | 114 +-------------- lib/pleroma/web/rich_media/parser/ttl.ex | 12 ++ .../web/rich_media/parser/ttl/aws_signed_url.ex | 2 +- .../workers/rich_media_expiration_worker.ex | 15 ++ 13 files changed, 336 insertions(+), 211 deletions(-) create mode 100644 lib/pleroma/web/rich_media/backfill.ex create mode 100644 lib/pleroma/web/rich_media/card.ex create mode 100644 lib/pleroma/workers/rich_media_expiration_worker.ex (limited to 'lib') diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex index 84ff2f129..4de7cbb76 100644 --- a/lib/pleroma/html.ex +++ b/lib/pleroma/html.ex @@ -65,20 +65,16 @@ defmodule Pleroma.HTML do end end - @spec extract_first_external_url_from_object(Pleroma.Object.t()) :: - {:ok, String.t()} | {:error, :no_content} + @spec extract_first_external_url_from_object(Pleroma.Object.t()) :: String.t() | nil def extract_first_external_url_from_object(%{data: %{"content" => content}}) when is_binary(content) do - url = - content - |> Floki.parse_fragment!() - |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") - |> Enum.take(1) - |> Floki.attribute("href") - |> Enum.at(0) - - {:ok, url} + content + |> Floki.parse_fragment!() + |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") + |> Enum.take(1) + |> Floki.attribute("href") + |> Enum.at(0) end - def extract_first_external_url_from_object(_), do: {:error, :no_content} + def extract_first_external_url_from_object(_), do: nil end diff --git 
a/lib/pleroma/web/activity_pub/activity_pub.ex b/lib/pleroma/web/activity_pub/activity_pub.ex index 2017c696d..a1fccc705 100644 --- a/lib/pleroma/web/activity_pub/activity_pub.ex +++ b/lib/pleroma/web/activity_pub/activity_pub.ex @@ -147,9 +147,7 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do # Splice in the child object if we have one. activity = Maps.put_if_present(activity, :object, object) - ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn -> - Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end) - end) + Pleroma.Web.RichMedia.Card.get_by_activity(activity) # Add local posts to search index if local, do: Pleroma.Search.add_to_index(activity) @@ -177,7 +175,7 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do id: "pleroma:fakeid" } - Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) + Pleroma.Web.RichMedia.Card.get_by_activity(activity) {:ok, activity} {:remote_limit_pass, _} -> diff --git a/lib/pleroma/web/activity_pub/side_effects.ex b/lib/pleroma/web/activity_pub/side_effects.ex index 5cb8a9700..7421b8ed8 100644 --- a/lib/pleroma/web/activity_pub/side_effects.ex +++ b/lib/pleroma/web/activity_pub/side_effects.ex @@ -227,9 +227,7 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do end end - ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn -> - Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end) - end) + Pleroma.Web.RichMedia.Card.get_by_activity(activity) Pleroma.Search.add_to_index(Map.put(activity, :object, object)) diff --git a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex index 4f6de8a00..b7dc00a44 100644 --- a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex +++ b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex @@ -25,6 +25,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do alias Pleroma.Web.OAuth.Token alias 
Pleroma.Web.Plugs.OAuthScopesPlug alias Pleroma.Web.Plugs.RateLimiter + alias Pleroma.Web.RichMedia.Card plug(Pleroma.Web.ApiSpec.CastAndValidate, replace_params: false) @@ -480,9 +481,9 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do _ ) do with %Activity{} = activity <- Activity.get_by_id(status_id), - true <- Visibility.visible_for_user?(activity, user) do - data = Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) - render(conn, "card.json", data) + true <- Visibility.visible_for_user?(activity, user), + %Card{} = card_data <- Card.get_by_activity(activity) do + render(conn, "card.json", card_data) else _ -> render_error(conn, :not_found, "Record not found") end diff --git a/lib/pleroma/web/mastodon_api/views/status_view.ex b/lib/pleroma/web/mastodon_api/views/status_view.ex index e464f60dc..77af69eef 100644 --- a/lib/pleroma/web/mastodon_api/views/status_view.ex +++ b/lib/pleroma/web/mastodon_api/views/status_view.ex @@ -21,6 +21,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do alias Pleroma.Web.MastodonAPI.StatusView alias Pleroma.Web.MediaProxy alias Pleroma.Web.PleromaAPI.EmojiReactionController + alias Pleroma.Web.RichMedia.Card import Pleroma.Web.ActivityPub.Visibility, only: [get_visibility: 1, visible_for_user?: 2] @@ -29,9 +30,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do # pagination is restricted to 40 activities at a time defp fetch_rich_media_for_activities(activities) do Enum.each(activities, fn activity -> - spawn(fn -> - Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) - end) + spawn(fn -> Card.get_by_activity(activity) end) end) end @@ -113,9 +112,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do # To do: check AdminAPIControllerTest on the reasons behind nil activities in the list activities = Enum.filter(opts.activities, & &1) - # Start fetching rich media before doing anything else, so that later calls to get the cards - # only block for timeout in the worst case, as opposed to - # 
length(activities_with_links) * timeout + # Start prefetching rich media before doing anything else fetch_rich_media_for_activities(activities) replied_to_activities = get_replied_to_activities(activities) quoted_activities = get_quoted_activities(activities) @@ -364,7 +361,11 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do summary = object.data["summary"] || "" - card = render("card.json", Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)) + card = + case Card.get_by_activity(activity) do + %Card{} = result -> render("card.json", result) + _ -> nil + end url = if user.local do @@ -567,15 +568,8 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do } end - def render("card.json", %{rich_media: rich_media, page_url: page_url}) do - page_url_data = URI.parse(page_url) - - page_url_data = - if is_binary(rich_media["url"]) do - URI.merge(page_url_data, URI.parse(rich_media["url"])) - else - page_url_data - end + def render("card.json", %Card{fields: rich_media}) do + page_url_data = URI.parse(rich_media["url"]) page_url = page_url_data |> to_string diff --git a/lib/pleroma/web/pleroma_api/views/chat/message_reference_view.ex b/lib/pleroma/web/pleroma_api/views/chat/message_reference_view.ex index 241bf0010..a1c88d075 100644 --- a/lib/pleroma/web/pleroma_api/views/chat/message_reference_view.ex +++ b/lib/pleroma/web/pleroma_api/views/chat/message_reference_view.ex @@ -9,6 +9,7 @@ defmodule Pleroma.Web.PleromaAPI.Chat.MessageReferenceView do alias Pleroma.User alias Pleroma.Web.CommonAPI.Utils alias Pleroma.Web.MastodonAPI.StatusView + alias Pleroma.Web.RichMedia.Card @cachex Pleroma.Config.get([:cachex, :provider], Cachex) @@ -23,6 +24,12 @@ defmodule Pleroma.Web.PleromaAPI.Chat.MessageReferenceView do } } ) do + card = + case Card.get_by_object(object) do + %Card{} = card_data -> StatusView.render("card.json", card_data) + _ -> nil + end + %{ id: id |> to_string(), content: chat_message["content"], @@ -34,11 +41,7 @@ defmodule 
Pleroma.Web.PleromaAPI.Chat.MessageReferenceView do chat_message["attachment"] && StatusView.render("attachment.json", attachment: chat_message["attachment"]), unread: unread, - card: - StatusView.render( - "card.json", - Pleroma.Web.RichMedia.Helpers.fetch_data_for_object(object) - ) + card: card } |> put_idempotency_key() end diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex new file mode 100644 index 000000000..112028901 --- /dev/null +++ b/lib/pleroma/web/rich_media/backfill.ex @@ -0,0 +1,101 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Backfill.Task do + alias Pleroma.Web.RichMedia.Backfill + + def run(args) do + Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args], + name: {:global, {:rich_media, args.url_hash}} + ) + end +end + +defmodule Pleroma.Web.RichMedia.Backfill do + alias Pleroma.Web.RichMedia.Card + alias Pleroma.Web.RichMedia.Parser + alias Pleroma.Web.RichMedia.Parser.TTL + alias Pleroma.Workers.RichMediaExpirationWorker + + require Logger + + @backfiller Pleroma.Config.get([__MODULE__, :provider], Pleroma.Web.RichMedia.Backfill.Task) + @cachex Pleroma.Config.get([:cachex, :provider], Cachex) + @max_attempts 3 + @retry 5_000 + + def start(%{url: url} = args) when is_binary(url) do + url_hash = Card.url_to_hash(url) + + args = + args + |> Map.put(:attempt, 1) + |> Map.put(:url_hash, url_hash) + + @backfiller.run(args) + end + + def run(%{url: url, url_hash: url_hash, attempt: attempt} = args) + when attempt <= @max_attempts do + case Parser.parse(url) do + {:ok, fields} -> + {:ok, card} = Card.create(url, fields) + + maybe_schedule_expiration(url, fields) + + if Map.has_key?(args, :activity_id) do + stream_update(args) + end + + warm_cache(url_hash, card) + + {:error, {:invalid_metadata, fields}} -> + Logger.debug("Rich media incomplete or invalid metadata for 
#{url}: #{inspect(fields)}") + negative_cache(url_hash) + + {:error, :body_too_large} -> + Logger.error("Rich media error for #{url}: :body_too_large") + negative_cache(url_hash) + + {:error, {:content_type, type}} -> + Logger.debug("Rich media error for #{url}: :content_type is #{type}") + negative_cache(url_hash) + + e -> + Logger.debug("Rich media error for #{url}: #{inspect(e)}") + + :timer.sleep(@retry * attempt) + + run(%{args | attempt: attempt + 1}) + end + end + + def run(%{url: url, url_hash: url_hash}) do + Logger.debug("Rich media failure for #{url}") + + negative_cache(url_hash, :timer.minutes(15)) + end + + defp maybe_schedule_expiration(url, fields) do + case TTL.get_from_image(fields, url) do + ttl when is_number(ttl) -> + timestamp = DateTime.from_unix!(ttl) + + RichMediaExpirationWorker.new(%{"url" => url}, scheduled_at: timestamp) + |> Oban.insert() + + _ -> + :ok + end + end + + defp stream_update(%{activity_id: activity_id}) do + Pleroma.Activity.get_by_id(activity_id) + |> Pleroma.Activity.normalize() + |> Pleroma.Web.ActivityPub.ActivityPub.stream_out() + end + + defp warm_cache(key, val), do: @cachex.put(:rich_media_cache, key, val) + defp negative_cache(key, ttl \\ nil), do: @cachex.put(:rich_media_cache, key, nil, ttl: ttl) +end diff --git a/lib/pleroma/web/rich_media/card.ex b/lib/pleroma/web/rich_media/card.ex new file mode 100644 index 000000000..2d36f2b62 --- /dev/null +++ b/lib/pleroma/web/rich_media/card.ex @@ -0,0 +1,157 @@ +defmodule Pleroma.Web.RichMedia.Card do + use Ecto.Schema + import Ecto.Changeset + import Ecto.Query + + alias Pleroma.Activity + alias Pleroma.HTML + alias Pleroma.Object + alias Pleroma.Repo + alias Pleroma.Web.RichMedia.Backfill + alias Pleroma.Web.RichMedia.Parser + + @cachex Pleroma.Config.get([:cachex, :provider], Cachex) + @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) + + @type t :: %__MODULE__{} + + schema "rich_media_card" do + field(:url_hash, :binary) + 
field(:fields, :map) + + timestamps() + end + + @doc false + def changeset(card, attrs) do + card + |> cast(attrs, [:url_hash, :fields]) + |> validate_required([:url_hash, :fields]) + |> unique_constraint(:url_hash) + end + + @spec create(String.t(), map()) :: {:ok, t()} + def create(url, fields) do + url_hash = url_to_hash(url) + + fields = Map.put_new(fields, "url", url) + + %__MODULE__{} + |> changeset(%{url_hash: url_hash, fields: fields}) + |> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash) + end + + @spec delete(String.t()) :: {:ok, Ecto.Schema.t()} | {:error, Ecto.Changeset.t()} | :ok + def delete(url) do + url_hash = url_to_hash(url) + @cachex.del(:rich_media_cache, url_hash) + + case get_by_url(url) do + %__MODULE{} = card -> Repo.delete(card) + nil -> :ok + end + end + + @spec get_by_url(String.t() | nil) :: t() | nil | :error + def get_by_url(url) when is_binary(url) do + if @config_impl.get([:rich_media, :enabled]) do + url_hash = url_to_hash(url) + + @cachex.fetch!(:rich_media_cache, url_hash, fn _ -> + result = + __MODULE__ + |> where(url_hash: ^url_hash) + |> Repo.one() + + case result do + %__MODULE__{} = card -> {:commit, card} + _ -> {:ignore, nil} + end + end) + else + :error + end + end + + def get_by_url(nil), do: nil + + @spec get_or_backfill_by_url(String.t(), map()) :: t() | nil + def get_or_backfill_by_url(url, backfill_opts \\ %{}) do + case get_by_url(url) do + %__MODULE__{} = card -> + card + + nil -> + backfill_opts = Map.put(backfill_opts, :url, url) + + Backfill.start(backfill_opts) + + nil + + :error -> + nil + end + end + + @spec get_by_object(Object.t()) :: t() | nil | :error + def get_by_object(object) do + case HTML.extract_first_external_url_from_object(object) do + nil -> nil + url -> get_or_backfill_by_url(url) + end + end + + @spec get_by_activity(Activity.t()) :: t() | nil | :error + # Fake/Draft activity + def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do + with %Object{} = 
object <- Object.normalize(activity, fetch: false), + url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do + case get_by_url(url) do + # Cache hit + %__MODULE__{} = card -> + card + + # Cache miss, but fetch for rendering the Draft + _ -> + with {:ok, fields} <- Parser.parse(url), + {:ok, card} <- create(url, fields) do + card + else + _ -> nil + end + end + else + _ -> + nil + end + end + + def get_by_activity(activity) do + with %Object{} = object <- Object.normalize(activity, fetch: false), + {_, nil} <- {:cached, get_cached_url(object, activity.id)} do + nil + else + {:cached, url} -> + get_or_backfill_by_url(url, %{activity_id: activity.id}) + + _ -> + :error + end + end + + @spec url_to_hash(String.t()) :: String.t() + def url_to_hash(url) do + :crypto.hash(:sha256, url) |> Base.encode16(case: :lower) + end + + defp get_cached_url(object, activity_id) do + key = "URL|#{activity_id}" + + @cachex.fetch!(:scrubber_cache, key, fn _ -> + url = HTML.extract_first_external_url_from_object(object) + Activity.HTML.add_cache_key_for(activity_id, key) + + {:commit, url} + end) + end +end diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index a711dc436..00af140ae 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -3,65 +3,13 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Helpers do - alias Pleroma.Activity - alias Pleroma.HTML - alias Pleroma.Object - alias Pleroma.Web.RichMedia.Parser - - @cachex Pleroma.Config.get([:cachex, :provider], Cachex) - - @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) - - @options [ - pool: :media, - max_body: 2_000_000, - recv_timeout: 2_000 - ] - - def fetch_data_for_object(object) do - with true <- @config_impl.get([:rich_media, :enabled]), - {:ok, page_url} <- - HTML.extract_first_external_url_from_object(object), - {:ok, rich_media} <- 
Parser.parse(page_url) do - %{page_url: page_url, rich_media: rich_media} - else - _ -> %{} - end - end - - def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do - with true <- @config_impl.get([:rich_media, :enabled]), - %Object{} = object <- Object.normalize(activity, fetch: false) do - if object.data["fake"] do - fetch_data_for_object(object) - else - key = "URL|#{activity.id}" - - @cachex.fetch!(:scrubber_cache, key, fn _ -> - result = fetch_data_for_object(object) - - cond do - match?(%{page_url: _, rich_media: _}, result) -> - Activity.HTML.add_cache_key_for(activity.id, key) - {:commit, result} - - true -> - {:ignore, %{}} - end - end) - end - else - _ -> %{} - end - end - - def fetch_data_for_activity(_), do: %{} + alias Pleroma.Config def rich_media_get(url) do headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}] head_check = - case Pleroma.HTTP.head(url, headers, @options) do + case Pleroma.HTTP.head(url, headers, http_options()) do # If the HEAD request didn't reach the server for whatever reason, # we assume the GET that comes right after won't either {:error, _} = e -> @@ -76,7 +24,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do :ok end - with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options) + with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options()) end defp check_content_type(headers) do @@ -92,12 +40,13 @@ defmodule Pleroma.Web.RichMedia.Helpers do end end - @max_body @options[:max_body] defp check_content_length(headers) do + max_body = Keyword.get(http_options(), :max_body) + case List.keyfind(headers, "content-length", 0) do {_, maybe_content_length} -> case Integer.parse(maybe_content_length) do - {content_length, ""} when content_length <= @max_body -> :ok + {content_length, ""} when content_length <= max_body -> :ok {_, ""} -> {:error, :body_too_large} _ -> :ok end @@ -106,4 +55,11 @@ defmodule Pleroma.Web.RichMedia.Helpers do :ok end end + + defp http_options() do 
+ [ + pool: :media, + max_body: Config.get([:rich_media, :max_body], 2_000_000) + ] + end end diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index a73fbc4b9..37cf29029 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -5,134 +5,28 @@ defmodule Pleroma.Web.RichMedia.Parser do require Logger - @cachex Pleroma.Config.get([:cachex, :provider], Cachex) @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) defp parsers do Pleroma.Config.get([:rich_media, :parsers]) end - def parse(nil), do: {:error, "No URL provided"} + def parse(nil), do: nil @spec parse(String.t()) :: {:ok, map()} | {:error, any()} def parse(url) do with :ok <- validate_page_url(url), - {:ok, data} <- get_cached_or_parse(url), - {:ok, _} <- set_ttl_based_on_image(data, url) do + {:ok, data} <- parse_url(url) do + data = Map.put(data, "url", url) {:ok, data} end end - defp get_cached_or_parse(url) do - case @cachex.fetch(:rich_media_cache, url, fn -> - case parse_url(url) do - {:ok, _} = res -> - {:commit, res} - - {:error, reason} = e -> - # Unfortunately we have to log errors here, instead of doing that - # along with ttl setting at the bottom. Otherwise we can get log spam - # if more than one process was waiting for the rich media card - # while it was generated. Ideally we would set ttl here as well, - # so we don't override it number_of_waiters_on_generation - # times, but one, obviously, can't set ttl for not-yet-created entry - # and Cachex doesn't support returning ttl from the fetch callback. 
- log_error(url, reason) - {:commit, e} - end - end) do - {action, res} when action in [:commit, :ok] -> - case res do - {:ok, _data} = res -> - res - - {:error, reason} = e -> - if action == :commit, do: set_error_ttl(url, reason) - e - end - - {:error, e} -> - {:error, {:cachex_error, e}} - end - end - - defp set_error_ttl(_url, :body_too_large), do: :ok - defp set_error_ttl(_url, {:content_type, _}), do: :ok - - # The TTL is not set for the errors above, since they are unlikely to change - # with time - - defp set_error_ttl(url, _reason) do - ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000) - @cachex.expire(:rich_media_cache, url, ttl) - :ok - end - - defp log_error(url, {:invalid_metadata, data}) do - Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end) - end - - defp log_error(url, reason) do - Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end) - end - - @doc """ - Set the rich media cache based on the expiration time of image. 
- - Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL` - - ## Example - - defmodule MyModule do - @behaviour Pleroma.Web.RichMedia.Parser.TTL - def ttl(data, url) do - image_url = Map.get(data, :image) - # do some parsing in the url and get the ttl of the image - # and return ttl is unix time - parse_ttl_from_url(image_url) - end - end - - Define the module in the config - - config :pleroma, :rich_media, - ttl_setters: [MyModule] - """ - @spec set_ttl_based_on_image(map(), String.t()) :: - {:ok, integer() | :noop} | {:error, :no_key} - def set_ttl_based_on_image(data, url) do - case get_ttl_from_image(data, url) do - ttl when is_number(ttl) -> - ttl = ttl * 1000 - - case @cachex.expire_at(:rich_media_cache, url, ttl) do - {:ok, true} -> {:ok, ttl} - {:ok, false} -> {:error, :no_key} - end - - _ -> - {:ok, :noop} - end - end - - defp get_ttl_from_image(data, url) do - [:rich_media, :ttl_setters] - |> Pleroma.Config.get() - |> Enum.reduce({:ok, nil}, fn - module, {:ok, _ttl} -> - module.ttl(data, url) - - _, error -> - error - end) - end - - def parse_url(url) do + defp parse_url(url) do with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url), {:ok, html} <- Floki.parse_document(html) do html |> maybe_parse() - |> Map.put("url", url) |> clean_parsed_data() |> check_parsed_data() end diff --git a/lib/pleroma/web/rich_media/parser/ttl.ex b/lib/pleroma/web/rich_media/parser/ttl.ex index b51298bd8..d69bb0d07 100644 --- a/lib/pleroma/web/rich_media/parser/ttl.ex +++ b/lib/pleroma/web/rich_media/parser/ttl.ex @@ -4,4 +4,16 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL do @callback ttl(map(), String.t()) :: integer() | nil + + def get_from_image(data, url) do + [:rich_media, :ttl_setters] + |> Pleroma.Config.get() + |> Enum.reduce({:ok, nil}, fn + module, {:ok, _ttl} -> + module.ttl(data, url) + + _, error -> + error + end) + end end diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex 
b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex index a0d567c42..22e72e22e 100644 --- a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex +++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex @@ -7,7 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do @impl true def ttl(data, _url) do - image = Map.get(data, :image) + image = Map.get(data, "image") if aws_signed_url?(image) do image diff --git a/lib/pleroma/workers/rich_media_expiration_worker.ex b/lib/pleroma/workers/rich_media_expiration_worker.ex new file mode 100644 index 000000000..d7ae497a7 --- /dev/null +++ b/lib/pleroma/workers/rich_media_expiration_worker.ex @@ -0,0 +1,15 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Workers.RichMediaExpirationWorker do + alias Pleroma.Web.RichMedia.Card + + use Oban.Worker, + queue: :rich_media_expiration + + @impl Oban.Worker + def perform(%Job{args: %{"url" => url} = _args}) do + Card.delete(url) + end +end -- cgit v1.2.3 From df0734fcbf7adcd98e9bce38cc7aa18345aaf78d Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Sun, 11 Feb 2024 16:53:21 -0500 Subject: Increase the :max_body for Rich Media to 5MB Websites are increasingly getting more bloated with tricks like inlining content (e.g., CNN.com) which puts pages at or above 5MB. This value may still be too low. 
--- lib/pleroma/web/rich_media/helpers.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index 00af140ae..2c65d9647 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -59,7 +59,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do defp http_options() do [ pool: :media, - max_body: Config.get([:rich_media, :max_body], 2_000_000) + max_body: Config.get([:rich_media, :max_body], 5_000_000) ] end end -- cgit v1.2.3 From d21aa1a77cbda323ae2e82ea7910e076b6011571 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Sun, 18 Feb 2024 22:24:27 -0500 Subject: Respect the TTL returned in OpenGraph tags --- lib/pleroma/web/rich_media/backfill.ex | 4 ++-- lib/pleroma/web/rich_media/parser/ttl.ex | 15 ++++++++------- .../web/rich_media/parser/ttl/aws_signed_url.ex | 2 +- lib/pleroma/web/rich_media/parser/ttl/opengraph.ex | 19 +++++++++++++++++++ 4 files changed, 30 insertions(+), 10 deletions(-) create mode 100644 lib/pleroma/web/rich_media/parser/ttl/opengraph.ex (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex index 112028901..386e2023a 100644 --- a/lib/pleroma/web/rich_media/backfill.ex +++ b/lib/pleroma/web/rich_media/backfill.ex @@ -78,8 +78,8 @@ defmodule Pleroma.Web.RichMedia.Backfill do end defp maybe_schedule_expiration(url, fields) do - case TTL.get_from_image(fields, url) do - ttl when is_number(ttl) -> + case TTL.process(fields, url) do + {:ok, ttl} when is_number(ttl) -> timestamp = DateTime.from_unix!(ttl) RichMediaExpirationWorker.new(%{"url" => url}, scheduled_at: timestamp) diff --git a/lib/pleroma/web/rich_media/parser/ttl.ex b/lib/pleroma/web/rich_media/parser/ttl.ex index d69bb0d07..7e56375ff 100644 --- a/lib/pleroma/web/rich_media/parser/ttl.ex +++ b/lib/pleroma/web/rich_media/parser/ttl.ex @@ -5,15 +5,16 @@ defmodule 
Pleroma.Web.RichMedia.Parser.TTL do @callback ttl(map(), String.t()) :: integer() | nil - def get_from_image(data, url) do + @spec process(map(), String.t()) :: {:ok, integer() | nil} + def process(data, url) do [:rich_media, :ttl_setters] |> Pleroma.Config.get() - |> Enum.reduce({:ok, nil}, fn - module, {:ok, _ttl} -> - module.ttl(data, url) - - _, error -> - error + |> Enum.reduce_while({:ok, nil}, fn + module, acc -> + case module.ttl(data, url) do + ttl when is_number(ttl) -> {:halt, {:ok, ttl}} + _ -> {:cont, acc} + end end) end end diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex index 22e72e22e..d6bf50fa5 100644 --- a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex +++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex @@ -15,7 +15,7 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do |> format_query_params() |> get_expiration_timestamp() else - {:error, "Not aws signed url #{inspect(image)}"} + nil end end diff --git a/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex b/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex new file mode 100644 index 000000000..fc99244c3 --- /dev/null +++ b/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex @@ -0,0 +1,19 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser.TTL.Opengraph do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + + @impl true + def ttl(%{"ttl" => ttl_string}, _url) do + with ttl <- String.to_integer(ttl_string) do + now = DateTime.utc_now() |> DateTime.to_unix() + now + ttl + else + _ -> nil + end + end + + def ttl(_, _), do: nil +end -- cgit v1.2.3 From 5a5a193877dd890db5682d1809e02d4908d11144 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Thu, 7 Mar 2024 14:19:03 -0500 Subject: Fix broken Rich Media parsing when the image URL is a relative path --- 
lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex index d6bf50fa5..948c727e1 100644 --- a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex +++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex @@ -22,7 +22,8 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do defp aws_signed_url?(image) when is_binary(image) and image != "" do %URI{host: host, query: query} = URI.parse(image) - String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires") + is_binary(host) and String.contains?(host, "amazonaws.com") and + String.contains?(query, "X-Amz-Expires") end defp aws_signed_url?(_), do: nil -- cgit v1.2.3 From 19002fd6c11760898daf0b5ed648d6ba58d84b97 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 8 May 2024 01:44:58 +0000 Subject: Mastodon API: Remove deprecated GET /api/v1/statuses/:id/card endpoint Removed back in 2019 https://github.com/mastodon/mastodon/pull/11213 --- .../web/mastodon_api/controllers/status_controller.ex | 17 ----------------- lib/pleroma/web/router.ex | 1 - 2 files changed, 18 deletions(-) (limited to 'lib') diff --git a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex index b7dc00a44..83e1bee54 100644 --- a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex +++ b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex @@ -25,7 +25,6 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do alias Pleroma.Web.OAuth.Token alias Pleroma.Web.Plugs.OAuthScopesPlug alias Pleroma.Web.Plugs.RateLimiter - alias Pleroma.Web.RichMedia.Card plug(Pleroma.Web.ApiSpec.CastAndValidate, replace_params: false) @@ -39,7 +38,6 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do when action in [ :index, :show, - :card, 
:context, :show_history, :show_source @@ -474,21 +472,6 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do end end - @doc "GET /api/v1/statuses/:id/card" - @deprecated "https://github.com/tootsuite/mastodon/pull/11213" - def card( - %{assigns: %{user: user}, private: %{open_api_spex: %{params: %{id: status_id}}}} = conn, - _ - ) do - with %Activity{} = activity <- Activity.get_by_id(status_id), - true <- Visibility.visible_for_user?(activity, user), - %Card{} = card_data <- Card.get_by_activity(activity) do - render(conn, "card.json", card_data) - else - _ -> render_error(conn, :not_found, "Record not found") - end - end - @doc "GET /api/v1/statuses/:id/favourited_by" def favourited_by( %{assigns: %{user: user}, private: %{open_api_spex: %{params: %{id: id}}}} = conn, diff --git a/lib/pleroma/web/router.ex b/lib/pleroma/web/router.ex index 4fe0cb02f..86d6da883 100644 --- a/lib/pleroma/web/router.ex +++ b/lib/pleroma/web/router.ex @@ -768,7 +768,6 @@ defmodule Pleroma.Web.Router do get("/statuses", StatusController, :index) get("/statuses/:id", StatusController, :show) get("/statuses/:id/context", StatusController, :context) - get("/statuses/:id/card", StatusController, :card) get("/statuses/:id/favourited_by", StatusController, :favourited_by) get("/statuses/:id/reblogged_by", StatusController, :reblogged_by) get("/statuses/:id/history", StatusController, :show_history) -- cgit v1.2.3 From 9b9a32bf74a87047b3c12b468f3351950c334995 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 7 May 2024 21:55:39 -0400 Subject: Fix compile warning warning: "else" clauses will never match because all patterns in "with" will always match lib/pleroma/web/rich_media/parser/ttl/opengraph.ex:10 --- lib/pleroma/web/rich_media/parser/ttl/opengraph.ex | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex b/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex index fc99244c3..b06889669 100644 
--- a/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex +++ b/lib/pleroma/web/rich_media/parser/ttl/opengraph.ex @@ -6,11 +6,12 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.Opengraph do @behaviour Pleroma.Web.RichMedia.Parser.TTL @impl true - def ttl(%{"ttl" => ttl_string}, _url) do - with ttl <- String.to_integer(ttl_string) do + def ttl(%{"ttl" => ttl_string}, _url) when is_binary(ttl_string) do + try do + ttl = String.to_integer(ttl_string) now = DateTime.utc_now() |> DateTime.to_unix() now + ttl - else + rescue _ -> nil end end -- cgit v1.2.3 From 37c35daba66b3534d8724266118053c11057f2b3 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 7 May 2024 22:10:49 -0400 Subject: Credo --- lib/pleroma/web/rich_media/backfill.ex | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex index 386e2023a..4ec50e132 100644 --- a/lib/pleroma/web/rich_media/backfill.ex +++ b/lib/pleroma/web/rich_media/backfill.ex @@ -2,16 +2,6 @@ # Copyright © 2017-2022 Pleroma Authors # SPDX-License-Identifier: AGPL-3.0-only -defmodule Pleroma.Web.RichMedia.Backfill.Task do - alias Pleroma.Web.RichMedia.Backfill - - def run(args) do - Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args], - name: {:global, {:rich_media, args.url_hash}} - ) - end -end - defmodule Pleroma.Web.RichMedia.Backfill do alias Pleroma.Web.RichMedia.Card alias Pleroma.Web.RichMedia.Parser @@ -99,3 +89,13 @@ defmodule Pleroma.Web.RichMedia.Backfill do defp warm_cache(key, val), do: @cachex.put(:rich_media_cache, key, val) defp negative_cache(key, ttl \\ nil), do: @cachex.put(:rich_media_cache, key, nil, ttl: ttl) end + +defmodule Pleroma.Web.RichMedia.Backfill.Task do + alias Pleroma.Web.RichMedia.Backfill + + def run(args) do + Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args], + name: {:global, {:rich_media, args.url_hash}} + ) + end +end 
-- cgit v1.2.3 From 9a83301ff8a3cbc187bb869b01ce6dcab63d93a7 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 7 May 2024 22:11:19 -0400 Subject: Credo --- lib/pleroma/web/rich_media/helpers.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index 2c65d9647..119994458 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -56,7 +56,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do end end - defp http_options() do + defp http_options do [ pool: :media, max_body: Config.get([:rich_media, :max_body], 5_000_000) -- cgit v1.2.3 From 54c2bab25f965e2e3fa8d118bf5135c335eca2c0 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 7 May 2024 22:27:18 -0400 Subject: Fix module struct matching --- lib/pleroma/web/rich_media/card.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/pleroma/web/rich_media/card.ex b/lib/pleroma/web/rich_media/card.ex index 2d36f2b62..36a1ae44a 100644 --- a/lib/pleroma/web/rich_media/card.ex +++ b/lib/pleroma/web/rich_media/card.ex @@ -47,7 +47,7 @@ defmodule Pleroma.Web.RichMedia.Card do @cachex.del(:rich_media_cache, url_hash) case get_by_url(url) do - %__MODULE{} = card -> Repo.delete(card) + %__MODULE__{} = card -> Repo.delete(card) nil -> :ok end end -- cgit v1.2.3