microblog.pub/app/utils/opengraph.py

import asyncio
import mimetypes
import re
import signal
from concurrent.futures import TimeoutError
from typing import Any
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup # type: ignore
from loguru import logger
from pebble import concurrent # type: ignore
from pydantic import BaseModel

from app import activitypub as ap
from app import ap_object
from app import config
from app.actor import LOCAL_ACTOR
from app.actor import fetch_actor
from app.database import AsyncSession
from app.models import InboxObject
from app.models import OutboxObject
from app.utils.url import is_url_valid
from app.utils.url import make_abs


class OpenGraphMeta(BaseModel):
    url: str
    title: str
    image: str | None
    description: str | None
    site_name: str
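
# Example of the shape this model captures (hypothetical values for
# illustration, not taken from any real page):
#
#   OpenGraphMeta(
#       url="https://example.com/post",
#       title="An example page",
#       image="https://example.com/banner.png",
#       description="A short summary",
#       site_name="example.com",
#   )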


@concurrent.process(timeout=5)
def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    # Prevent SIGTERM from bubbling up to the worker
    signal.signal(signal.SIGTERM, signal.SIG_IGN)

    soup = BeautifulSoup(html, "html5lib")
    ogs = {
        og.attrs["property"]: og.attrs.get("content")
        for og in soup.html.head.findAll(property=re.compile(r"^og"))
    }

    # Some pages have no <title>; default to None and let og:title (if any)
    # fill it in below
    title_tag = soup.find("title")
    raw = {
        "url": url,
        "title": title_tag.text.strip() if title_tag else None,
        "image": None,
        "description": None,
        "site_name": urlparse(url).hostname,
    }
    for field in OpenGraphMeta.__fields__.keys():
        og_field = f"og:{field}"
        if ogs.get(og_field):
            raw[field] = ogs.get(og_field, None)

    if not raw["title"]:
        return None

    for maybe_rel in {"url", "image"}:
        if u := raw.get(maybe_rel):
            raw[maybe_rel] = make_abs(u, url)

    return OpenGraphMeta.parse_obj(raw)


def scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    # Block on the subprocess result; raises TimeoutError after the 5s limit
    return _scrap_og_meta(url, html).result()
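
# Usage sketch (hypothetical URL; the HTML is assumed to have been fetched
# already):
#
#   if (meta := scrap_og_meta("https://example.com/post", html)) is not None:
#       print(meta.title, meta.site_name)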


async def external_urls(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject | OutboxObject | InboxObject,
) -> set[str]:
    note_host = urlparse(ro.ap_id).hostname

    # Collect the hrefs of tags (mentions, hashtags) so they can be excluded
    # from the link-preview candidates below
    tags_hrefs = set()
    for tag in ro.tags:
        if tag_href := tag.get("href"):
            tags_hrefs.add(tag_href)
        if tag.get("type") == "Mention":
            if tag["href"] != LOCAL_ACTOR.ap_id:
                try:
                    mentioned_actor = await fetch_actor(db_session, tag["href"])
                except (ap.FetchError, ap.NotAnObjectError):
                    tags_hrefs.add(tag["href"])
                    continue

                tags_hrefs.add(mentioned_actor.url)
                tags_hrefs.add(mentioned_actor.ap_id)
            else:
                tags_hrefs.add(LOCAL_ACTOR.ap_id)
                tags_hrefs.add(LOCAL_ACTOR.url)

    urls = set()
    if ro.content:
        soup = BeautifulSoup(ro.content, "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            if not h:
                continue

            try:
                ph = urlparse(h)
                mimetype, _ = mimetypes.guess_type(h)
                # Keep only valid external http(s) links that don't point
                # straight at media files
                if (
                    ph.scheme in {"http", "https"}
                    and ph.hostname != note_host
                    and is_url_valid(h)
                    and (
                        not mimetype
                        or mimetype.split("/")[0] not in ["image", "video", "audio"]
                    )
                ):
                    urls.add(h)
            except Exception:
                logger.exception(f"Failed to check {h}")
                continue

    return urls - tags_hrefs
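
# Note: tag/mention hrefs are subtracted above so that hashtags and mentions
# in the note body never produce a preview card.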


async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            url,
            headers={
                "User-Agent": config.USER_AGENT,
            },
            follow_redirects=True,
        )

    resp.raise_for_status()

    # Skip non-HTML responses, they can't contain OG meta tags
    if not (ct := resp.headers.get("content-type")) or not ct.startswith("text/html"):
        return None

    try:
        return scrap_og_meta(url, resp.text)
    except TimeoutError:
        logger.info(f"Timed out when scraping OG meta for {url}")
        return None
    except Exception:
        logger.info(f"Failed to scrape OG meta for {url}")
        return None


async def og_meta_from_note(
    db_session: AsyncSession,
    ro: ap_object.RemoteObject,
) -> list[dict[str, Any]]:
    og_meta = []
    urls = await external_urls(db_session, ro)
    logger.debug(f"Looking for OG metadata in {urls=}")
    for url in urls:
        logger.debug(f"Processing {url}")
        try:
            maybe_og_meta = None
            try:
                # Bound the whole fetch with an asyncio timeout, on top of
                # the 5 second scraping timeout enforced by pebble
                maybe_og_meta = await asyncio.wait_for(
                    _og_meta_from_url(url),
                    timeout=5,
                )
            except asyncio.TimeoutError:
                logger.info(f"Timed out fetching {url}")
            except Exception:
                logger.exception(f"Failed to scrape OG meta for {url}")

            if maybe_og_meta:
                og_meta.append(maybe_og_meta.dict())
        except httpx.HTTPError:
            pass

    return og_meta
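
# End-to-end sketch (hypothetical caller; db_session and ro come from the
# app's request handling, not from this module):
#
#   async def preview_cards(db_session: AsyncSession, ro: ap_object.RemoteObject):
#       for meta in await og_meta_from_note(db_session, ro):
#           print(meta["title"], meta["url"])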