From 55745b9c41e0c0aa736754a6f1fb2169d1ac835d Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Fri, 28 Oct 2022 23:25:30 +0100 Subject: [PATCH] more robust url parsing --- .../sync/internal/sync/message/HtmlParser.kt | 112 +++++++++++------- .../sync/internal/sync/message/PartBuilder.kt | 5 + .../sync/message/RichMessageParser.kt | 24 +++- .../sync/internal/sync/message/UrlParser.kt | 20 +++- .../internal/sync/RichMessageParserTest.kt | 10 +- .../internal/sync/RoomEventCreatorTest.kt | 2 +- 6 files changed, 114 insertions(+), 59 deletions(-) diff --git a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/HtmlParser.kt b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/HtmlParser.kt index d855bb2..f9e2e0f 100644 --- a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/HtmlParser.kt +++ b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/HtmlParser.kt @@ -24,68 +24,27 @@ internal class HtmlParser { tagName == "br" -> { appendTextBeforeTag(searchIndex, tagOpen, builder, input) - builder.appendText("\n") + builder.appendNewline() tagClose.next() } else -> { val exitTag = "" val exitIndex = input.indexOf(exitTag, startIndex = tagClose) - val exitTagClose = exitIndex + exitTag.length + val exitTagCloseIndex = exitIndex + exitTag.length if (exitIndex == END_SEARCH) { builder.appendText(input[searchIndex].toString()) searchIndex.next() } else { when (tagName) { "mx-reply" -> { - exitTagClose + exitTagCloseIndex } else -> { appendTextBeforeTag(searchIndex, tagOpen, builder, input) val tagContent = input.substring(tagClose + 1, exitIndex) - when (tagName) { - "a" -> { - val findHrefUrl = wholeTag.substringAfter("href=").replace("\"", "").removeSuffix(">") - if (findHrefUrl.startsWith("https://matrix.to/#/@")) { - val userId = UserId(findHrefUrl.substringAfter("https://matrix.to/#/").substringBeforeLast("\"")) - builder.appendPerson(userId, "@${tagContent.removePrefix("@")}") - if (input.getOrNull(exitTagClose) == ':') { - exitTagClose.next() - } else { - exitTagClose - } - } else { - builder.appendLink(findHrefUrl, label = tagContent) - exitTagClose - } - } - - "b" -> { - builder.appendBold(tagContent) - exitTagClose - } - - "strong" -> { - builder.appendBold(tagContent) - exitTagClose - } - - "i" -> { - builder.appendItalic(tagContent) - exitTagClose - } - - "em" -> { - builder.appendItalic(tagContent) - exitTagClose - } - - else -> { - builder.appendText(tagContent) - exitTagClose - } - } + handleTagWithContent(input, tagName, wholeTag, builder, tagContent, exitTagCloseIndex) } } } @@ -94,6 +53,65 @@ internal class HtmlParser { } ) + private fun handleTagWithContent( + input: String, + tagName: String, + wholeTag: String, + builder: PartBuilder, + tagContent: String, + exitTagCloseIndex: Int + ) = when (tagName) { + "a" -> { + val findHrefUrl = wholeTag.substringAfter("href=").replace("\"", "").removeSuffix(">") + if (findHrefUrl.startsWith("https://matrix.to/#/@")) { + val userId = UserId(findHrefUrl.substringAfter("https://matrix.to/#/").substringBeforeLast("\"")) + builder.appendPerson(userId, "@${tagContent.removePrefix("@")}") + ignoreMatrixColonMentionSuffix(input, exitTagCloseIndex) + } else { + builder.appendLink(findHrefUrl, label = tagContent) + exitTagCloseIndex + } + } + + "b" -> { + builder.appendBold(tagContent) + exitTagCloseIndex + } + + "p" -> { + builder.appendText(tagContent) + builder.appendNewline() + builder.appendNewline() + exitTagCloseIndex + } + + "strong" -> { + builder.appendBold(tagContent) + exitTagCloseIndex + } + + "i" -> { + builder.appendItalic(tagContent) + exitTagCloseIndex + } + + "em" -> { + builder.appendItalic(tagContent) + exitTagCloseIndex + } + + else -> { + builder.appendText(tagContent) + exitTagCloseIndex + } + } + + private fun ignoreMatrixColonMentionSuffix(input: String, exitTagCloseIndex: Int) = if (input.getOrNull(exitTagCloseIndex) == ':') { + exitTagCloseIndex.next() + } else { + exitTagCloseIndex + } + private fun appendTextBeforeTag(searchIndex: Int, tagOpen: Int, builder: PartBuilder, input: String) { if (searchIndex != tagOpen) { builder.appendText(input.substring(searchIndex, tagOpen)) @@ -115,4 +133,8 @@ internal class HtmlParser { } } + fun test(startingFrom: Int, intput: String): Int { + return intput.indexOf('<', startingFrom) + } + } \ No newline at end of file diff --git a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/PartBuilder.kt b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/PartBuilder.kt index db3c93b..ab34699 100644 --- a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/PartBuilder.kt +++ b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/PartBuilder.kt @@ -54,3 +54,8 @@ internal fun PartBuilder.appendTextBeforeTag(previousIndex: Int, tagOpenIndex: I this.appendText(input.substring(previousIndex, tagOpenIndex)) } } + +internal fun PartBuilder.appendNewline() { + this.appendText("\n") +} + diff --git a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/RichMessageParser.kt b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/RichMessageParser.kt index 2d68b78..ac537da 100644 --- a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/RichMessageParser.kt +++ b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/RichMessageParser.kt @@ -17,15 +17,27 @@ class RichMessageParser { val builder = PartBuilder() var nextIndex = 0 while (nextIndex != END_SEARCH) { - val htmlResult = htmlParser.parseHtmlTags(input, nextIndex, builder) - val linkStartIndex = findUrlStartIndex(htmlResult, nextIndex) - val urlResult = urlParser.parseUrl(input, linkStartIndex, builder) + val htmlStart = htmlParser.test(nextIndex, input) + val urlStart = urlParser.test(nextIndex, input) - val hasReachedEnd = hasReachedEnd(htmlResult, urlResult, input) - if (hasReachedEnd && hasUnprocessedText(htmlResult, urlResult, input)) { + val firstResult = if (htmlStart < urlStart) { + htmlParser.parseHtmlTags(input, nextIndex, builder) + } else { + urlParser.parseUrl(input, nextIndex, builder) + } + + val secondStartIndex = findUrlStartIndex(firstResult, nextIndex) + val secondResult = if (htmlStart < urlStart) { + urlParser.parseUrl(input, secondStartIndex, builder) + } else { + htmlParser.parseHtmlTags(input, secondStartIndex, builder) + } + + val hasReachedEnd = hasReachedEnd(firstResult, secondResult, input) + if (hasReachedEnd && hasUnprocessedText(firstResult, secondResult, input)) { builder.appendText(input.substring(nextIndex)) } - nextIndex = if (hasReachedEnd) END_SEARCH else max(htmlResult, urlResult) + nextIndex = if (hasReachedEnd) END_SEARCH else max(firstResult, secondResult) } return RichText(builder.build()) } diff --git a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/UrlParser.kt b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/UrlParser.kt index a37869e..9366120 100644 --- a/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/UrlParser.kt +++ b/matrix/services/sync/src/main/kotlin/app/dapk/st/matrix/sync/internal/sync/message/UrlParser.kt @@ -1,16 +1,25 @@ package app.dapk.st.matrix.sync.internal.sync.message -private const val INVALID_TRAILING_CHARS = ",.:;?" +private const val INVALID_TRAILING_CHARS = ",.:;?<>" internal class UrlParser { + private fun String.hasLookAhead(current: Int, value: String): Boolean { + return length > current + value.length && this.substring(current, current + value.length) == value + } + fun parseUrl(input: String, linkStartIndex: Int, builder: PartBuilder): Int { val urlIndex = input.indexOf("http", startIndex = linkStartIndex) - val urlResult = if (urlIndex == END_SEARCH) END_SEARCH else { + return if (urlIndex == END_SEARCH) END_SEARCH else { builder.appendTextBeforeTag(linkStartIndex, urlIndex, input) val originalUrl = input.substring(urlIndex) - val urlEndIndex = originalUrl.indexOfFirst { it == '\n' || it == ' ' } + var index = 0 + val maybeUrl = originalUrl.takeWhile { + it != '\n' && it != ' ' && !originalUrl.hasLookAhead(index++, "