From 3097dbebe934fada543d8649ec24eafc5ac390d3 Mon Sep 17 00:00:00 2001
From: Thomas Sileo <t@a4.io>
Date: Thu, 15 Dec 2022 22:14:24 +0100
Subject: [PATCH] Improve Webfinger

---
 app/webfinger.py | 88 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 14 deletions(-)

diff --git a/app/webfinger.py b/app/webfinger.py
index e0bd854..d58bab3 100644
--- a/app/webfinger.py
+++ b/app/webfinger.py
@@ -1,3 +1,4 @@
+import xml.etree.ElementTree as ET
 from typing import Any
 from urllib.parse import urlparse
 
@@ -8,33 +9,85 @@ from app import config
 from app.utils.url import check_url
 
 
+async def get_webfinger_via_host_meta(host: str) -> str | None:
+    """Return the WebFinger URL template from `host`'s host-meta `lrdd` link.
+
+    Returns None if missing, unreachable or malformed; other HTTP status errors raise.
+    """
+    resp: httpx.Response | None = None
+    async with httpx.AsyncClient() as client:
+        for i, proto in enumerate(["https", "http"]):  # ordered: https first
+            try:
+                url = f"{proto}://{host}/.well-known/host-meta"
+                check_url(url)
+                resp = await client.get(
+                    url,
+                    headers={"User-Agent": config.USER_AGENT},
+                    follow_redirects=True,
+                )
+                resp.raise_for_status()
+                break
+            except httpx.HTTPStatusError as http_error:
+                logger.exception("HTTP error")
+                resp = None  # never parse an error response below
+                if http_error.response.status_code in [403, 404, 410]:
+                    continue
+                raise
+            except httpx.HTTPError:
+                logger.exception("req failed")
+                # If we tried https first and the domain is "http only"
+                if i == 0:
+                    continue
+                break
+
+    if resp is None:
+        return None
+    try:
+        tree = ET.fromstring(resp.text)
+    except ET.ParseError:
+        logger.exception("invalid host-meta XML")
+        return None
+    maybe_link = tree.find(
+        "./{http://docs.oasis-open.org/ns/xri/xrd-1.0}Link[@rel='lrdd']"
+    )
+    return maybe_link.attrib.get("template") if maybe_link is not None else None
+
+
 async def webfinger(
     resource: str,
+    webfinger_url: str | None = None,
 ) -> dict[str, Any] | None:  # noqa: C901
     """Mastodon-like WebFinger resolution to retrieve the activity stream Actor URL."""
     resource = resource.strip()
     logger.info(f"performing webfinger resolution for {resource}")
-    protos = ["https", "http"]
-    if resource.startswith("http://"):
-        protos.reverse()
-        host = urlparse(resource).netloc
-    elif resource.startswith("https://"):
-        host = urlparse(resource).netloc
+    urls = []
+    host = None
+    if webfinger_url:
+        urls = [webfinger_url]
     else:
-        if resource.startswith("acct:"):
-            resource = resource[5:]
-        if resource.startswith("@"):
-            resource = resource[1:]
-        _, host = resource.split("@", 1)
-        resource = "acct:" + resource
+        if resource.startswith("http://"):
+            host = urlparse(resource).netloc
+            url = f"http://{host}/.well-known/webfinger"
+        elif resource.startswith("https://"):
+            host = urlparse(resource).netloc
+            url = f"https://{host}/.well-known/webfinger"
+        else:
+            protos = ["https", "http"]
+            _, host = resource.split("@", 1)
+            urls = [f"{proto}://{host}/.well-known/webfinger" for proto in protos]
+
+    if resource.startswith("acct:"):
+        resource = resource[5:]
+    if resource.startswith("@"):
+        resource = resource[1:]
+    resource = "acct:" + resource
 
     is_404 = False
 
     resp: httpx.Response | None = None
     async with httpx.AsyncClient() as client:
-        for i, proto in enumerate(protos):
+        for i, url in enumerate(urls):
             try:
-                url = f"{proto}://{host}/.well-known/webfinger"
                 check_url(url)
                 resp = await client.get(
                     url,
@@ -58,7 +111,14 @@ async def webfinger(
                 if i == 0:
                     continue
                 break
+
     if is_404:
+        if not webfinger_url and host:
+            if webfinger_url := (await get_webfinger_via_host_meta(host)):
+                return await webfinger(
+                    resource,
+                    webfinger_url=webfinger_url,
+                )
         return None
 
     if resp: