From 881d0ad899928f179ecb8402f9bac850bc57a2ea Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Tue, 4 Oct 2022 20:26:01 +0200 Subject: [PATCH] Switch Markdown parser --- app/source.py | 152 ++++++++++++++++++++++++++++++++----------- poetry.lock | 14 +++- pyproject.toml | 1 + tests/test_outbox.py | 4 +- 4 files changed, 131 insertions(+), 40 deletions(-) diff --git a/app/source.py b/app/source.py index 0b699c8..0dea35c 100644 --- a/app/source.py +++ b/app/source.py @@ -1,52 +1,118 @@ import re import typing -from markdown import markdown +from mistletoe import Document # type: ignore +from mistletoe.html_renderer import HTMLRenderer # type: ignore +from mistletoe.span_token import SpanToken # type: ignore +from pygments import highlight # type: ignore +from pygments.formatters import HtmlFormatter # type: ignore +from pygments.lexers import get_lexer_by_name as get_lexer # type: ignore +from pygments.lexers import guess_lexer # type: ignore from sqlalchemy import select from app import webfinger from app.config import BASE_URL +from app.config import CODE_HIGHLIGHTING_THEME from app.database import AsyncSession from app.utils import emoji if typing.TYPE_CHECKING: from app.actor import Actor - -def _set_a_attrs(attrs, new=False): - attrs[(None, "target")] = "_blank" - attrs[(None, "class")] = "external" - attrs[(None, "rel")] = "noopener" - attrs[(None, "title")] = attrs[(None, "href")] - return attrs - - +_FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME) _HASHTAG_REGEX = re.compile(r"(#[\d\w]+)") _MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+") -def hashtagify(content: str) -> tuple[str, list[dict[str, str]]]: - tags = [] - hashtags = re.findall(_HASHTAG_REGEX, content) - hashtags = sorted(set(hashtags), reverse=True) # unique tags, longest first - for hashtag in hashtags: - tag = hashtag[1:] +class AutoLink(SpanToken): + parse_inner = False + precedence = 10 + pattern = re.compile( + "(https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*))" # noqa: E501 + ) + + def __init__(self, match_obj: re.Match) -> None: + self.target = match_obj.group() + + +class Mention(SpanToken): + parse_inner = False + precedence = 10 + pattern = re.compile(r"(@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+)") + + def __init__(self, match_obj: re.Match) -> None: + self.target = match_obj.group() + + +class Hashtag(SpanToken): + parse_inner = False + precedence = 10 + pattern = re.compile(r"(#[\d\w]+)") + + def __init__(self, match_obj: re.Match) -> None: + self.target = match_obj.group() + + +class CustomRenderer(HTMLRenderer): + def __init__( + self, + mentioned_actors: dict[str, "Actor"] = {}, + enable_mentionify: bool = True, + enable_hashtagify: bool = True, + ) -> None: + extra_tokens = [] + if enable_mentionify: + extra_tokens.append(Mention) + if enable_hashtagify: + extra_tokens.append(Hashtag) + super().__init__(AutoLink, *extra_tokens) + + self.tags: list[dict[str, str]] = [] + self.mentioned_actors = mentioned_actors + + def render_auto_link(self, token: AutoLink) -> str: + template = '{inner}' + target = self.escape_url(token.target) + return template.format(target=target, inner=target) + + def render_mention(self, token: Mention) -> str: + mention = token.target + actor = self.mentioned_actors.get(mention) + if not actor: + return mention + + self.tags.append(dict(type="Mention", href=actor.ap_id, name=mention)) + + link = f'{actor.handle}' # noqa: E501 + return link + + def render_hashtag(self, token: Hashtag) -> str: + tag = token.target[1:] link = f'' # noqa: E501 - tags.append(dict(href=f"{BASE_URL}/t/{tag}", name=hashtag, type="Hashtag")) - content = content.replace(hashtag, link) - return content, tags + self.tags.append( + dict(href=f"{BASE_URL}/t/{tag}", name=token.target, type="Hashtag") + ) + return link + + def render_block_code(self, token: typing.Any) -> str: + code = token.children[0].content + lexer = get_lexer(token.language) if token.language else guess_lexer(code) + return highlight(code, lexer, _FORMATTER) -async def _mentionify( +async def _prefetch_mentioned_actors( db_session: AsyncSession, content: str, -) -> tuple[str, list[dict[str, str]], list["Actor"]]: +) -> dict[str, "Actor"]: from app import models from app.actor import fetch_actor - tags = [] - mentioned_actors = [] + actors = {} + for mention in re.findall(_MENTION_REGEX, content): + if mention in actors: + continue + _, username, domain = mention.split("@") actor = ( await db_session.execute( @@ -63,12 +129,22 @@ async def _mentionify( continue actor = await fetch_actor(db_session, actor_url) - mentioned_actors.append(actor) - tags.append(dict(type="Mention", href=actor.ap_id, name=mention)) + actors[mention] = actor - link = f'{actor.handle}' # noqa: E501 - content = content.replace(mention, link) - return content, tags, mentioned_actors + return actors + + +def hashtagify(content: str) -> tuple[str, list[dict[str, str]]]: + # TODO: fix this, switch to mistletoe? + tags = [] + hashtags = re.findall(_HASHTAG_REGEX, content) + hashtags = sorted(set(hashtags), reverse=True) # unique tags, longest first + for hashtag in hashtags: + tag = hashtag[1:] + link = f'' # noqa: E501 + tags.append(dict(href=f"{BASE_URL}/t/{tag}", name=hashtag, type="Hashtag")) + content = content.replace(hashtag, link) + return content, tags async def markdownify( @@ -82,17 +158,19 @@ async def markdownify( """ tags = [] - mentioned_actors: list["Actor"] = [] - if enable_hashtagify: - content, hashtag_tags = hashtagify(content) - tags.extend(hashtag_tags) + mentioned_actors: dict[str, "Actor"] = {} if enable_mentionify: - content, mention_tags, mentioned_actors = await _mentionify(db_session, content) - tags.extend(mention_tags) + mentioned_actors = await _prefetch_mentioned_actors(db_session, content) + + with CustomRenderer( + mentioned_actors=mentioned_actors, + enable_mentionify=enable_mentionify, + enable_hashtagify=enable_hashtagify, + ) as renderer: + rendered_content = renderer.render(Document(content)) + tags.extend(renderer.tags) # Handle custom emoji tags.extend(emoji.tags(content)) - content = markdown(content, extensions=["mdx_linkify", "fenced_code"]) - - return content, tags, mentioned_actors + return rendered_content, tags, list(mentioned_actors.values()) diff --git a/poetry.lock b/poetry.lock index 769a7e0..fdf0378 100644 --- a/poetry.lock +++ b/poetry.lock @@ -648,6 +648,14 @@ BeautifulSoup4 = ">=4.6.0" html5lib = ">=1.0.1" requests = ">=2.18.4" +[[package]] +name = "mistletoe" +version = "0.9.0" +description = "A fast, extensible Markdown parser in pure Python." +category = "main" +optional = false +python-versions = "~=3.5" + [[package]] name = "mypy" version = "0.960" @@ -1275,7 +1283,7 @@ dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "84b3a6dcfc055fb0712c6abbf1bf94d9526eda940c4ddb0bd275664e68a4c3e3" +content-hash = "bc8585a0da6f4d4e54afafde1da287ed75ed6544981d11bba561a7678bc31b8f" [metadata.files] aiosqlite = [ @@ -1832,6 +1840,10 @@ mdx-linkify = [ mf2py = [ {file = "mf2py-1.1.2.tar.gz", hash = "sha256:84f1f8f2ff3f1deb1c30be497e7ccd805452996a662fd4a77f09e0105bede2c9"}, ] +mistletoe = [ + {file = "mistletoe-0.9.0-py3-none-any.whl", hash = "sha256:11316e2fe0be422a8248293ad0efbee9ad0c6f3683b2f45bc6b989ea17a68c74"}, + {file = "mistletoe-0.9.0.tar.gz", hash = "sha256:3cb96d78226d08f0d3bf09efcaf330d23902492006e18b2c06558e8b86bf7faf"}, +] mypy = [ {file = "mypy-0.960-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3a3e525cd76c2c4f90f1449fd034ba21fcca68050ff7c8397bb7dd25dd8b8248"}, {file = "mypy-0.960-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7a76dc4f91e92db119b1be293892df8379b08fd31795bb44e0ff84256d34c251"}, diff --git a/pyproject.toml b/pyproject.toml index 526d260..a9a050c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ boussole = "^2.0.0" uvicorn = {extras = ["standard"], version = "^0.18.3"} Brotli = "^1.0.9" greenlet = "^1.1.3" +mistletoe = "^0.9.0" [tool.poetry.dev-dependencies] black = "^22.3.0" diff --git a/tests/test_outbox.py b/tests/test_outbox.py index 5159e8f..09e1269 100644 --- a/tests/test_outbox.py +++ b/tests/test_outbox.py @@ -179,7 +179,7 @@ def test_send_create_activity__with_attachment( outbox_object = db.execute(select(models.OutboxObject)).scalar_one() assert outbox_object.ap_type == "Note" assert outbox_object.summary is None - assert outbox_object.content == "

hello

" + assert outbox_object.content == "

hello

\n" assert len(outbox_object.attachments) == 1 attachment = outbox_object.attachments[0] assert attachment.type == "Document" @@ -227,7 +227,7 @@ def test_send_create_activity__no_content_with_cw_and_attachments( outbox_object = db.execute(select(models.OutboxObject)).scalar_one() assert outbox_object.ap_type == "Note" assert outbox_object.summary is None - assert outbox_object.content == "

cw

" + assert outbox_object.content == "

cw

\n" assert len(outbox_object.attachments) == 1