2022-07-10 19:19:55 +02:00
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
import httpx
|
|
|
|
import mf2py # type: ignore
|
|
|
|
from loguru import logger
|
|
|
|
|
|
|
|
from app import config
|
|
|
|
|
|
|
|
|
2022-07-19 20:38:32 +02:00
|
|
|
class URLNotFoundOrGone(Exception):
    """Raised when a fetched URL answers with HTTP 404 or 410."""
|
|
|
|
|
|
|
|
|
|
|
|
async def fetch_and_parse(url: str) -> tuple[dict[str, Any], str]:
    """Fetch *url* and parse the response body with mf2py.

    The request is sent with the ``User-Agent`` taken from
    ``config.USER_AGENT`` and follows redirects.

    Returns:
        A ``(parsed, body)`` tuple: the result of ``mf2py.parse`` on the
        response text, and the response text itself.

    Raises:
        URLNotFoundOrGone: the final response status is 404 or 410.
        httpx.HTTPStatusError: ``raise_for_status()`` rejected the response;
            the failure is logged before being re-raised.
    """
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            url,
            headers={
                "User-Agent": config.USER_AGENT,
            },
            follow_redirects=True,
        )

        # 404/410 get a dedicated exception so callers can tell them apart
        # from other HTTP failures.
        if resp.status_code in (404, 410):
            raise URLNotFoundOrGone

        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError:
            logger.error(
                f"Failed to parse microformats for {url}: " f"got {resp.status_code}"
            )
            raise

        return mf2py.parse(doc=resp.text), resp.text
|