From 3097dbebe934fada543d8649ec24eafc5ac390d3 Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Thu, 15 Dec 2022 22:14:24 +0100 Subject: [PATCH] Improve Webfinger --- app/webfinger.py | 88 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/app/webfinger.py b/app/webfinger.py index e0bd854..d58bab3 100644 --- a/app/webfinger.py +++ b/app/webfinger.py @@ -1,3 +1,4 @@ +import xml.etree.ElementTree as ET from typing import Any from urllib.parse import urlparse @@ -8,33 +9,85 @@ from app import config from app.utils.url import check_url +async def get_webfinger_via_host_meta(host: str) -> str | None: + resp: httpx.Response | None = None + is_404 = False + async with httpx.AsyncClient() as client: + for i, proto in enumerate({"http", "https"}): + try: + url = f"{proto}://{host}/.well-known/host-meta" + check_url(url) + resp = await client.get( + url, + headers={ + "User-Agent": config.USER_AGENT, + }, + follow_redirects=True, + ) + resp.raise_for_status() + break + except httpx.HTTPStatusError as http_error: + logger.exception("HTTP error") + if http_error.response.status_code in [403, 404, 410]: + is_404 = True + continue + raise + except httpx.HTTPError: + logger.exception("req failed") + # If we tried https first and the domain is "http only" + if i == 0: + continue + break + + if is_404: + return None + + if resp: + tree = ET.fromstring(resp.text) + maybe_link = tree.find( + "./{http://docs.oasis-open.org/ns/xri/xrd-1.0}Link[@rel='lrdd']" + ) + if maybe_link is not None: + return maybe_link.attrib.get("template") + + return None + + async def webfinger( resource: str, + webfinger_url: str | None = None, ) -> dict[str, Any] | None: # noqa: C901 """Mastodon-like WebFinger resolution to retrieve the activity stream Actor URL.""" resource = resource.strip() logger.info(f"performing webfinger resolution for {resource}") - protos = ["https", "http"] - if resource.startswith("http://"): - protos.reverse() - host = urlparse(resource).netloc - elif resource.startswith("https://"): - host = urlparse(resource).netloc + urls = [] + host = None + if webfinger_url: + urls = [webfinger_url] else: - if resource.startswith("acct:"): - resource = resource[5:] - if resource.startswith("@"): - resource = resource[1:] - _, host = resource.split("@", 1) - resource = "acct:" + resource + if resource.startswith("http://"): + host = urlparse(resource).netloc + url = f"http://{host}/.well-known/webfinger" + elif resource.startswith("https://"): + host = urlparse(resource).netloc + url = f"https://{host}/.well-known/webfinger" + else: + protos = ["https", "http"] + _, host = resource.split("@", 1) + urls = [f"{proto}://{host}/.well-known/webfinger" for proto in protos] + + if resource.startswith("acct:"): + resource = resource[5:] + if resource.startswith("@"): + resource = resource[1:] + resource = "acct:" + resource is_404 = False resp: httpx.Response | None = None async with httpx.AsyncClient() as client: - for i, proto in enumerate(protos): + for i, url in enumerate(urls): try: - url = f"{proto}://{host}/.well-known/webfinger" check_url(url) resp = await client.get( url, @@ -58,7 +111,14 @@ async def webfinger( if i == 0: continue break + if is_404: + if not webfinger_url and host: + if webfinger_url := (await get_webfinger_via_host_meta(host)): + return await webfinger( + resource, + webfinger_url=webfinger_url, + ) return None if resp: