microblog.pub/app/utils/opengraph.py

import asyncio
import mimetypes
import re
import signal
from concurrent.futures import TimeoutError
from typing import Any
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup # type: ignore
from loguru import logger
from pebble import concurrent # type: ignore
from pydantic import BaseModel

from app import activitypub as ap
from app import ap_object
from app import config
from app.actor import LOCAL_ACTOR
from app.actor import fetch_actor
from app.database import AsyncSession
from app.models import InboxObject
from app.models import OutboxObject
from app.utils.url import is_url_valid
from app.utils.url import make_abs


class OpenGraphMeta(BaseModel):
    url: str
    title: str
    image: str | None
    description: str | None
    site_name: str
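
# Example of the shape this model captures (hypothetical values for
# illustration, not taken from any real page):
#
#   OpenGraphMeta(
#       url="https://example.com/post",
#       title="An example page",
#       image="https://example.com/banner.png",
#       description="A short summary",
#       site_name="example.com",
#   )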


@concurrent.process(timeout=5)
def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    # Prevent SIGTERM from bubbling up to the worker
    signal.signal(signal.SIGTERM, signal.SIG_IGN)

    soup = BeautifulSoup(html, "html5lib")
    ogs = {
        og.attrs["property"]: og.attrs.get("content")
        for og in soup.html.head.findAll(property=re.compile(r"^og"))
    }

    # Some pages have no <title>; default to None and let og:title (if any)
    # fill it in below
    title_tag = soup.find("title")
    raw = {
        "url": url,
        "title": title_tag.text.strip() if title_tag else None,
        "image": None,
        "description": None,
        "site_name": urlparse(url).hostname,
    }
    for field in OpenGraphMeta.__fields__.keys():
        og_field = f"og:{field}"
        if ogs.get(og_field):
            raw[field] = ogs.get(og_field, None)

    if not raw["title"]:
        return None

    for maybe_rel in {"url", "image"}:
        if u := raw.get(maybe_rel):
            raw[maybe_rel] = make_abs(u, url)

    return OpenGraphMeta.parse_obj(raw)


def scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    # Block on the subprocess result; raises TimeoutError after the 5s limit
    return _scrap_og_meta(url, html).result()
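
# Usage sketch (hypothetical URL; the HTML is assumed to have been fetched
# already):
#
#   if (meta := scrap_og_meta("https://example.com/post", html)) is not None:
#       print(meta.title, meta.site_name)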


async def external_urls(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject | OutboxObject | InboxObject,
) -> set[str]:
    note_host = urlparse(ro.ap_id).hostname

    # Collect the hrefs of tags (mentions, hashtags) so they can be excluded
    # from the link-preview candidates below
    tags_hrefs = set()
    for tag in ro.tags:
        if tag_href := tag.get("href"):
            tags_hrefs.add(tag_href)
        if tag.get("type") == "Mention":
            if tag["href"] != LOCAL_ACTOR.ap_id:
                try:
                    mentioned_actor = await fetch_actor(db_session, tag["href"])
                except (ap.FetchError, ap.NotAnObjectError):
                    tags_hrefs.add(tag["href"])
                    continue

                tags_hrefs.add(mentioned_actor.url)
                tags_hrefs.add(mentioned_actor.ap_id)
            else:
                tags_hrefs.add(LOCAL_ACTOR.ap_id)
                tags_hrefs.add(LOCAL_ACTOR.url)

    urls = set()
    if ro.content:
        soup = BeautifulSoup(ro.content, "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            if not h:
                continue

            try:
                ph = urlparse(h)
                mimetype, _ = mimetypes.guess_type(h)
                # Keep only valid external http(s) links that don't point
                # straight at media files
                if (
                    ph.scheme in {"http", "https"}
                    and ph.hostname != note_host
                    and is_url_valid(h)
                    and (
                        not mimetype
                        or mimetype.split("/")[0] not in ["image", "video", "audio"]
                    )
                ):
                    urls.add(h)
            except Exception:
                logger.exception(f"Failed to check {h}")
                continue

    return urls - tags_hrefs
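
# Note: tag/mention hrefs are subtracted above so that hashtags and mentions
# in the note body never produce a preview card.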


async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            url,
            headers={
                "User-Agent": config.USER_AGENT,
            },
            follow_redirects=True,
        )

    resp.raise_for_status()

    # Skip non-HTML responses, they can't contain OG meta tags
    if not (ct := resp.headers.get("content-type")) or not ct.startswith("text/html"):
        return None

    try:
        return scrap_og_meta(url, resp.text)
    except TimeoutError:
        logger.info(f"Timed out when scraping OG meta for {url}")
        return None
    except Exception:
        logger.info(f"Failed to scrape OG meta for {url}")
        return None


async def og_meta_from_note(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject,
) -> list[dict[str, Any]]:
    og_meta = []
    urls = await external_urls(db_session, ro)
    logger.debug(f"Looking for OG metadata in {urls=}")
    for url in urls:
        logger.debug(f"Processing {url}")
        try:
            maybe_og_meta = None
            try:
                # Bound the whole fetch with an asyncio timeout, on top of
                # the 5 second scraping timeout enforced by pebble
                maybe_og_meta = await asyncio.wait_for(
                    _og_meta_from_url(url),
                    timeout=5,
                )
            except asyncio.TimeoutError:
                logger.info(f"Timed out fetching {url}")
            except Exception:
                logger.exception(f"Failed to scrape OG meta for {url}")

            if maybe_og_meta:
                og_meta.append(maybe_og_meta.dict())
        except httpx.HTTPError:
            pass

    return og_meta
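
# End-to-end sketch (hypothetical caller; db_session and ro come from the
# app's request handling, not from this module):
#
#   async def preview_cards(db_session: AsyncSession, ro: ap_object.RemoteObject):
#       for meta in await og_meta_from_note(db_session, ro):
#           print(meta["title"], meta["url"])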