microblog.pub/app/opengraph.py


import mimetypes
import re
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup  # type: ignore
from pydantic import BaseModel

from app import activitypub as ap
from app import config
from app.urlutils import is_url_valid


class OpenGraphMeta(BaseModel):
    url: str
    title: str
    image: str
    description: str
    site_name: str


def _scrap_og_meta(html: str) -> OpenGraphMeta | None:
    """Scrape the og:* <meta> tags out of an HTML document.

    Returns None unless every OpenGraphMeta field is present.
    """
    soup = BeautifulSoup(html, "html5lib")
    ogs = {
        og.attrs["property"]: og.attrs.get("content")
        for og in soup.html.head.findAll(property=re.compile(r"^og"))
    }
    raw = {}
    for field in OpenGraphMeta.__fields__.keys():
        og_field = f"og:{field}"
        if not ogs.get(og_field):
            # Bail out as soon as a required og: tag is missing or empty
            return None
        raw[field] = ogs[og_field]
    return OpenGraphMeta.parse_obj(raw)
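
# For illustration (not part of the original module): a page following the Open
# Graph protocol exposes the tags _scrap_og_meta looks for roughly like this:
#
#   <html><head>
#     <meta property="og:url" content="https://example.com/post" />
#     <meta property="og:title" content="A post title" />
#     <meta property="og:image" content="https://example.com/cover.png" />
#     <meta property="og:description" content="A short summary" />
#     <meta property="og:site_name" content="Example" />
#   </head></html>
#
# Passing that HTML to _scrap_og_meta would yield an OpenGraphMeta with all
# five fields populated; drop any one of the tags and the function returns None.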


def _urls_from_note(note: ap.RawObject) -> set[str]:
    """Collect the external http(s) links from a note's HTML content."""
    note_host = urlparse(ap.get_id(note["id"]) or "").netloc

    urls = set()
    if "content" in note:
        soup = BeautifulSoup(note["content"], "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            ph = urlparse(h)
            mimetype, _ = mimetypes.guess_type(h)
            if (
                ph.scheme in {"http", "https"}
                and ph.netloc != note_host
                and is_url_valid(h)
                and (
                    not mimetype
                    or mimetype.split("/")[0] in ["image", "video", "audio"]
                )
            ):
                urls.add(h)

    return urls
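
# As a sketch of the filtering above (hypothetical URLs): for a note hosted on
# https://social.example whose content links to https://blog.example/post and
# https://social.example/other, only {"https://blog.example/post"} is returned;
# links back to the note's own host, non-http(s) schemes, and URLs rejected by
# is_url_valid are all skipped.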


def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
    """Fetch a URL and scrape its Open Graph metadata.

    Returns None for responses that are not HTML.
    """
    resp = httpx.get(
        url,
        headers={
            "User-Agent": config.USER_AGENT,
        },
        follow_redirects=True,
    )
    resp.raise_for_status()

    if not (ct := resp.headers.get("content-type")) or not ct.startswith("text/html"):
        return None

    return _scrap_og_meta(resp.text)


def og_meta_from_note(note: ap.RawObject) -> list[OpenGraphMeta]:
    """Scrape the Open Graph metadata of every external link in a note."""
    og_meta = []
    urls = _urls_from_note(note)
    for url in urls:
        try:
            maybe_og_meta = _og_meta_from_url(url)
            if maybe_og_meta:
                og_meta.append(maybe_og_meta)
        except httpx.HTTPError:
            # A link that cannot be fetched is simply ignored
            pass

    return og_meta
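

# Usage sketch (not part of the original file): og_meta_from_note is the public
# entry point. The note below is a made-up example object; a real call would
# receive an ActivityPub note dict with an "id" and HTML "content".
if __name__ == "__main__":
    example_note: ap.RawObject = {
        "id": "https://social.example/notes/1",
        "content": '<p>Worth a read: <a href="https://blog.example/post">a post</a></p>',
    }
    for meta in og_meta_from_note(example_note):
        print(meta.site_name, meta.title, meta.url)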