import mimetypes
import re
from typing import Any
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup  # type: ignore
from loguru import logger
from pydantic import BaseModel

from app import ap_object
from app import config
from app.actor import LOCAL_ACTOR
from app.actor import fetch_actor
from app.database import AsyncSession
from app.models import InboxObject
from app.models import OutboxObject
from app.utils.url import is_url_valid
from app.utils.url import make_abs


class OpenGraphMeta(BaseModel):
    """OpenGraph metadata scraped from an external page."""

    # Page URL; replaced by `og:url` when the page declares one.
    url: str
    # `<title>` text; replaced by `og:title` when the page declares one.
    title: str
    # `og:image`, if any.
    image: str | None
    # `og:description`, if any.
    description: str | None
    # Network location of the page URL; replaced by `og:site_name` if present.
    site_name: str


def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    """Extract OpenGraph metadata from an HTML document.

    Args:
        url: URL the document was fetched from; used as the default `url`,
            as the source of the default `site_name`, and as the base passed
            to `make_abs` for the `url` and `image` fields.
        html: Raw HTML of the page.

    Returns:
        The parsed metadata, or None when the page has neither a `<title>`
        element nor a non-empty `og:title` property.
    """
    soup = BeautifulSoup(html, "html5lib")
    ogs = {
        og.attrs["property"]: og.attrs.get("content")
        for og in soup.html.head.findAll(property=re.compile(r"^og"))
    }

    raw: dict[str, Any] = {
        "url": url,
        "image": None,
        "description": None,
        "site_name": urlparse(url).netloc,
    }

    # Some pages have no <title>: only set the default title when the
    # element exists so the "title" check below can actually trigger.
    title_tag = soup.find("title")
    if title_tag is not None:
        raw["title"] = title_tag.text

    # Non-empty og:* properties take precedence over the defaults.
    for field in OpenGraphMeta.__fields__:
        if og_value := ogs.get(f"og:{field}"):
            raw[field] = og_value

    if "title" not in raw:
        return None

    # These two fields may hold relative URLs.
    for maybe_rel in ("url", "image"):
        if u := raw.get(maybe_rel):
            raw[maybe_rel] = make_abs(u, url)

    return OpenGraphMeta.parse_obj(raw)


async def external_urls(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject | OutboxObject | InboxObject,
) -> set[str]:
    """Collect the external links found in an object's HTML content.

    A link is kept when it uses http(s), points to a host other than the
    object's own, passes `is_url_valid`, and its guessed MIME type is not
    image/video/audio. URLs referenced by the object's tags (including the
    `url`/`ap_id` of mentioned actors) are removed from the result.

    Args:
        db_session: Database session passed to `fetch_actor`.
        ro: Object whose `ap_id`, `tags` and `content` are inspected.

    Returns:
        The set of external URLs, possibly empty.
    """
    note_host = urlparse(ro.ap_id).netloc

    tags_hrefs = set()
    for tag in ro.tags:
        tag_href = tag.get("href")
        if not tag_href:
            # Nothing to exclude (and a Mention without href cannot be
            # resolved to an actor).
            continue

        tags_hrefs.add(tag_href)
        if tag.get("type") == "Mention":
            if tag_href != LOCAL_ACTOR.ap_id:
                mentioned_actor = await fetch_actor(db_session, tag_href)
                tags_hrefs.add(mentioned_actor.url)
                tags_hrefs.add(mentioned_actor.ap_id)
            else:
                tags_hrefs.add(LOCAL_ACTOR.ap_id)
                tags_hrefs.add(LOCAL_ACTOR.url)

    urls = set()
    if ro.content:
        soup = BeautifulSoup(ro.content, "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            if not h:
                # Anchors without an href would make guess_type() raise.
                continue

            try:
                ph = urlparse(h)
            except ValueError:
                # Malformed URL in remote content; ignore this link.
                continue

            if ph.scheme not in {"http", "https"} or ph.netloc == note_host:
                continue
            if not is_url_valid(h):
                continue

            # Skip direct links to media files.
            mimetype, _ = mimetypes.guess_type(h)
            if mimetype and mimetype.split("/")[0] in {"image", "video", "audio"}:
                continue

            urls.add(h)

    return urls - tags_hrefs


async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
    """Fetch a page and scrape its OpenGraph metadata.

    Args:
        url: Page to fetch (redirects are followed).

    Returns:
        The scraped metadata, or None when the response is not `text/html`
        or when scraping fails.

    Raises:
        httpx.HTTPError: Propagated from the request or from
            `raise_for_status()`.
    """
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            url,
            headers={
                "User-Agent": config.USER_AGENT,
            },
            follow_redirects=True,
        )

    resp.raise_for_status()

    if not (ct := resp.headers.get("content-type")) or not ct.startswith(
        "text/html"
    ):
        return None

    try:
        return _scrap_og_meta(url, resp.text)
    except Exception:
        # Best effort: a page we cannot parse simply yields no metadata.
        logger.info(f"Failed to scrap OG meta for {url}")
        return None


async def og_meta_from_note(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject,
) -> list[dict[str, Any]]:
    """Return OpenGraph metadata for every external URL linked from a note.

    Args:
        db_session: Database session passed to `external_urls`.
        ro: Object whose links are inspected.

    Returns:
        One dict (from `OpenGraphMeta.dict()`) per URL that yielded
        metadata. URLs whose fetch fails with an HTTP error are skipped.
    """
    og_meta = []
    urls = await external_urls(db_session, ro)
    for url in urls:
        try:
            maybe_og_meta = await _og_meta_from_url(url)
        except httpx.HTTPError:
            # Best effort: one unreachable link must not fail the whole note.
            logger.info(f"Failed to fetch OG meta for {url}")
            continue

        if maybe_og_meta:
            og_meta.append(maybe_og_meta.dict())

    return og_meta