From 6b7b443ff95587b33f4b666e68ed82dc6fb485a5 Mon Sep 17 00:00:00 2001
From: Mark Felder
Date: Tue, 6 Feb 2024 14:34:59 -0500
Subject: Pleroma.Web.RichMedia.Parser: Remove test-specific codepaths

Also consolidate Tesla mocks into the HttpRequestMock module.

Tests were not exercising the real codepaths. The Rich Media Preview
only works with https, but most of these tests were only mocking http.
---
 lib/pleroma/caching.ex               |   3 +
 lib/pleroma/web/rich_media/parser.ex | 107 +++++++++++++++++------------------
 2 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'lib')

diff --git a/lib/pleroma/caching.ex b/lib/pleroma/caching.ex
index eb0588708..796a465af 100644
--- a/lib/pleroma/caching.ex
+++ b/lib/pleroma/caching.ex
@@ -8,10 +8,13 @@ defmodule Pleroma.Caching do
   @callback put(Cachex.cache(), any(), any(), Keyword.t()) :: {Cachex.status(), boolean()}
   @callback put(Cachex.cache(), any(), any()) :: {Cachex.status(), boolean()}
   @callback fetch!(Cachex.cache(), any(), function() | nil) :: any()
+  @callback fetch(Cachex.cache(), any(), function() | nil) ::
+              {atom(), any()} | {atom(), any(), any()}
   # @callback del(Cachex.cache(), any(), Keyword.t()) :: {Cachex.status(), boolean()}
   @callback del(Cachex.cache(), any()) :: {Cachex.status(), boolean()}
   @callback stream!(Cachex.cache(), any()) :: Enumerable.t()
   @callback expire_at(Cachex.cache(), binary(), number()) :: {Cachex.status(), boolean()}
+  @callback expire(Cachex.cache(), binary(), number()) :: {Cachex.status(), boolean()}
   @callback exists?(Cachex.cache(), any()) :: {Cachex.status(), boolean()}
   @callback execute!(Cachex.cache(), function()) :: any()
   @callback get_and_update(Cachex.cache(), any(), function()) ::
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
index 837182d8a..a791b2bab 100644
--- a/lib/pleroma/web/rich_media/parser.ex
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -13,70 +13,65 @@ defmodule Pleroma.Web.RichMedia.Parser do
 
   def parse(nil), do: {:error, "No URL provided"}
 
-  if Pleroma.Config.get(:env) == :test do
-    @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
-    def parse(url), do: parse_url(url)
-  else
-    @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
-    def parse(url) do
-      with {:ok, data} <- get_cached_or_parse(url),
-           {:ok, _} <- set_ttl_based_on_image(data, url) do
-        {:ok, data}
-      end
+  @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
+  def parse(url) do
+    with {:ok, data} <- get_cached_or_parse(url),
+         {:ok, _} <- set_ttl_based_on_image(data, url) do
+      {:ok, data}
     end
+  end
 
-    defp get_cached_or_parse(url) do
-      case @cachex.fetch(:rich_media_cache, url, fn ->
-             case parse_url(url) do
-               {:ok, _} = res ->
-                 {:commit, res}
-
-               {:error, reason} = e ->
-                 # Unfortunately we have to log errors here, instead of doing that
-                 # along with ttl setting at the bottom. Otherwise we can get log spam
-                 # if more than one process was waiting for the rich media card
-                 # while it was generated. Ideally we would set ttl here as well,
-                 # so we don't override it number_of_waiters_on_generation
-                 # times, but one, obviously, can't set ttl for not-yet-created entry
-                 # and Cachex doesn't support returning ttl from the fetch callback.
-                 log_error(url, reason)
-                 {:commit, e}
-             end
-           end) do
-        {action, res} when action in [:commit, :ok] ->
-          case res do
-            {:ok, _data} = res ->
-              res
-
-            {:error, reason} = e ->
-              if action == :commit, do: set_error_ttl(url, reason)
-              e
-          end
-
-        {:error, e} ->
-          {:error, {:cachex_error, e}}
-      end
+  defp get_cached_or_parse(url) do
+    case @cachex.fetch(:rich_media_cache, url, fn ->
+           case parse_url(url) do
+             {:ok, _} = res ->
+               {:commit, res}
+
+             {:error, reason} = e ->
+               # Unfortunately we have to log errors here, instead of doing that
+               # along with ttl setting at the bottom. Otherwise we can get log spam
+               # if more than one process was waiting for the rich media card
+               # while it was generated. Ideally we would set ttl here as well,
+               # so we don't override it number_of_waiters_on_generation
+               # times, but one, obviously, can't set ttl for not-yet-created entry
+               # and Cachex doesn't support returning ttl from the fetch callback.
+               log_error(url, reason)
+               {:commit, e}
+           end
+         end) do
+      {action, res} when action in [:commit, :ok] ->
+        case res do
+          {:ok, _data} = res ->
+            res
+
+          {:error, reason} = e ->
+            if action == :commit, do: set_error_ttl(url, reason)
+            e
+        end
+
+      {:error, e} ->
+        {:error, {:cachex_error, e}}
     end
+  end
 
-    defp set_error_ttl(_url, :body_too_large), do: :ok
-    defp set_error_ttl(_url, {:content_type, _}), do: :ok
+  defp set_error_ttl(_url, :body_too_large), do: :ok
+  defp set_error_ttl(_url, {:content_type, _}), do: :ok
 
-    # The TTL is not set for the errors above, since they are unlikely to change
-    # with time
+  # The TTL is not set for the errors above, since they are unlikely to change
+  # with time
 
-    defp set_error_ttl(url, _reason) do
-      ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
-      @cachex.expire(:rich_media_cache, url, ttl)
-      :ok
-    end
+  defp set_error_ttl(url, _reason) do
+    ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
+    @cachex.expire(:rich_media_cache, url, ttl)
+    :ok
+  end
 
-    defp log_error(url, {:invalid_metadata, data}) do
-      Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
-    end
+  defp log_error(url, {:invalid_metadata, data}) do
+    Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
+  end
 
-    defp log_error(url, reason) do
-      Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
-    end
+  defp log_error(url, reason) do
+    Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
   end
 
   @doc """
-- 
cgit v1.2.3


From 9f2319e50dc0516bde4bfa3b117ec4792e553bd2 Mon Sep 17 00:00:00 2001
From: Mark Felder
Date: Tue, 6 Feb 2024 16:54:52 -0500
Subject: RichMedia.Helpers: move the validate_page_url/1 function to the Parser module

This will ensure that the page validation happens in Parser.parse/1 so
it can be called from anywhere and still filter invalid URLs.
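
As a rough sketch of what this buys callers (hypothetical URLs; the return
values assume parse/1 keeps the with expression shown in the diff below, which
has no else clause and so falls through with :error when validation fails, and
assume the https fetch itself succeeds):

    alias Pleroma.Web.RichMedia.Parser

    # An https URL on a host not covered by :ignore_hosts/:ignore_tld passes
    # validation and proceeds to the cached fetch-and-parse path.
    {:ok, _card} = Parser.parse("https://example.com/article.html")

    # Non-https URLs, bare IP addresses, and ignored hosts/TLDs are rejected
    # inside Parser.parse/1 itself, before any HTTP request is made.
    :error = Parser.parse("http://example.com/article.html")
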
---
 lib/pleroma/web/rich_media/helpers.ex | 43 --------------------------------
 lib/pleroma/web/rich_media/parser.ex  | 46 ++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 44 deletions(-)

(limited to 'lib')

diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
index 1501776d9..a711dc436 100644
--- a/lib/pleroma/web/rich_media/helpers.ex
+++ b/lib/pleroma/web/rich_media/helpers.ex
@@ -18,53 +18,10 @@ defmodule Pleroma.Web.RichMedia.Helpers do
     recv_timeout: 2_000
   ]
 
-  @spec validate_page_url(URI.t() | binary()) :: :ok | :error
-  defp validate_page_url(page_url) when is_binary(page_url) do
-    validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
-
-    page_url
-    |> Linkify.Parser.url?(validate_tld: validate_tld)
-    |> parse_uri(page_url)
-  end
-
-  defp validate_page_url(%URI{host: host, scheme: "https"}) do
-    cond do
-      Linkify.Parser.ip?(host) ->
-        :error
-
-      host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
-        :error
-
-      get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
-        :error
-
-      true ->
-        :ok
-    end
-  end
-
-  defp validate_page_url(_), do: :error
-
-  defp parse_uri(true, url) do
-    url
-    |> URI.parse()
-    |> validate_page_url
-  end
-
-  defp parse_uri(_, _), do: :error
-
-  defp get_tld(host) do
-    host
-    |> String.split(".")
-    |> Enum.reverse()
-    |> hd
-  end
-
   def fetch_data_for_object(object) do
     with true <- @config_impl.get([:rich_media, :enabled]),
          {:ok, page_url} <- HTML.extract_first_external_url_from_object(object),
-         :ok <- validate_page_url(page_url),
          {:ok, rich_media} <- Parser.parse(page_url) do
       %{page_url: page_url, rich_media: rich_media}
     else
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
index a791b2bab..a73fbc4b9 100644
--- a/lib/pleroma/web/rich_media/parser.ex
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -6,6 +6,7 @@ defmodule Pleroma.Web.RichMedia.Parser do
   require Logger
 
   @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+  @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
 
   defp parsers do
     Pleroma.Config.get([:rich_media, :parsers])
@@ -15,7 +16,8 @@ defmodule Pleroma.Web.RichMedia.Parser do
 
   @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
   def parse(url) do
-    with {:ok, data} <- get_cached_or_parse(url),
+    with :ok <- validate_page_url(url),
+         {:ok, data} <- get_cached_or_parse(url),
          {:ok, _} <- set_ttl_based_on_image(data, url) do
       {:ok, data}
     end
@@ -161,4 +163,46 @@ defmodule Pleroma.Web.RichMedia.Parser do
     end)
     |> Map.new()
   end
+
+  @spec validate_page_url(URI.t() | binary()) :: :ok | :error
+  defp validate_page_url(page_url) when is_binary(page_url) do
+    validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
+
+    page_url
+    |> Linkify.Parser.url?(validate_tld: validate_tld)
+    |> parse_uri(page_url)
+  end
+
+  defp validate_page_url(%URI{host: host, scheme: "https"}) do
+    cond do
+      Linkify.Parser.ip?(host) ->
+        :error
+
+      host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
+        :error
+
+      get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
+        :error
+
+      true ->
+        :ok
+    end
+  end
+
+  defp validate_page_url(_), do: :error
+
+  defp parse_uri(true, url) do
+    url
+    |> URI.parse()
+    |> validate_page_url
+  end
+
+  defp parse_uri(_, _), do: :error
+
+  defp get_tld(host) do
+    host
+    |> String.split(".")
+    |> Enum.reverse()
+    |> hd
+  end
 end
-- 
cgit v1.2.3
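
A note on the first patch in this series: with the test-only parse/1 clause
gone, tests go through the real client codepath, so every rich-media URL used
in tests needs an https mock clause in HttpRequestMock. A minimal sketch of
such a clause, assuming HttpRequestMock keeps its usual pattern of get/4
clauses matched on the URL and that an OGP fixture such as
test/fixtures/rich_media/ogp.html is available (both are assumptions here,
not part of the diffs above):

    # test/support/http_request_mock.ex (sketch, not the literal change)
    defmodule HttpRequestMock do
      # ... existing clauses ...

      # The Rich Media Preview only works for https URLs, so the mock has to
      # answer for an https address rather than http.
      def get("https://example.com/ogp", _query, _body, _headers) do
        {:ok,
         %Tesla.Env{
           status: 200,
           headers: [{"content-type", "text/html"}],
           body: File.read!("test/fixtures/rich_media/ogp.html")
         }}
      end
    end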
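
The fetch/3 and expire/3 callbacks added to Pleroma.Caching exist so that a
configured cache mock can cover the calls the parser now always makes, and the
compile-time config_impl added in the second patch serves the same purpose for
configuration reads. Together they let the test environment substitute mocks
through configuration rather than through a compile-time :test branch in the
parser. A rough sketch of that wiring with Mox, where the mock module names
and the config behaviour are assumptions for illustration:

    # config/test.exs (sketch)
    config :pleroma, :cachex, provider: Pleroma.CachexMock

    config :pleroma, Pleroma.Web.RichMedia.Parser,
      config_impl: Pleroma.StaticStubbedConfigMock

    # test/support/mocks.ex (sketch) -- each mock implements the behaviour the
    # runtime module is looked up against.
    Mox.defmock(Pleroma.CachexMock, for: Pleroma.Caching)
    Mox.defmock(Pleroma.StaticStubbedConfigMock, for: Pleroma.Config.Getting)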