More robust URL parsing

This commit is contained in:
Adam Brown 2022-10-28 23:25:30 +01:00
parent 8e36efe0c2
commit 55745b9c41
6 changed files with 114 additions and 59 deletions

View File

@ -24,68 +24,27 @@ internal class HtmlParser {
tagName == "br" -> {
appendTextBeforeTag(searchIndex, tagOpen, builder, input)
builder.appendText("\n")
builder.appendNewline()
tagClose.next()
}
else -> {
val exitTag = "</$tagName>"
val exitIndex = input.indexOf(exitTag, startIndex = tagClose)
val exitTagClose = exitIndex + exitTag.length
val exitTagCloseIndex = exitIndex + exitTag.length
if (exitIndex == END_SEARCH) {
builder.appendText(input[searchIndex].toString())
searchIndex.next()
} else {
when (tagName) {
"mx-reply" -> {
exitTagClose
exitTagCloseIndex
}
else -> {
appendTextBeforeTag(searchIndex, tagOpen, builder, input)
val tagContent = input.substring(tagClose + 1, exitIndex)
when (tagName) {
"a" -> {
val findHrefUrl = wholeTag.substringAfter("href=").replace("\"", "").removeSuffix(">")
if (findHrefUrl.startsWith("https://matrix.to/#/@")) {
val userId = UserId(findHrefUrl.substringAfter("https://matrix.to/#/").substringBeforeLast("\""))
builder.appendPerson(userId, "@${tagContent.removePrefix("@")}")
if (input.getOrNull(exitTagClose) == ':') {
exitTagClose.next()
} else {
exitTagClose
}
} else {
builder.appendLink(findHrefUrl, label = tagContent)
exitTagClose
}
}
"b" -> {
builder.appendBold(tagContent)
exitTagClose
}
"strong" -> {
builder.appendBold(tagContent)
exitTagClose
}
"i" -> {
builder.appendItalic(tagContent)
exitTagClose
}
"em" -> {
builder.appendItalic(tagContent)
exitTagClose
}
else -> {
builder.appendText(tagContent)
exitTagClose
}
}
handleTagWithContent(input, tagName, wholeTag, builder, tagContent, exitTagCloseIndex)
}
}
}
@ -94,6 +53,65 @@ internal class HtmlParser {
}
)
/**
 * Appends the part that corresponds to [tagName] to [builder] and returns the index
 * from which the caller should continue scanning [input].
 *
 * - `a`: a person mention when the href starts with `https://matrix.to/#/@`, otherwise a
 *   link labelled with [tagContent]. An anchor without any `href` attribute is not a link,
 *   so its content is appended as plain text.
 * - `b` / `strong`: bold.
 * - `i` / `em`: italic.
 * - `p`: the content as text followed by two newlines.
 * - anything else: the content as plain text.
 *
 * @param input the text being parsed; only inspected for the character following a mention.
 * @param tagName name of the tag being handled.
 * @param wholeTag text of the opening tag (presumably including its attributes — confirm at the call site).
 * @param builder receives the parsed parts.
 * @param tagContent text found between the opening and the closing tag.
 * @param exitTagCloseIndex index in [input] directly after the closing tag.
 */
private fun handleTagWithContent(
    input: String,
    tagName: String,
    wholeTag: String,
    builder: PartBuilder,
    tagContent: String,
    exitTagCloseIndex: Int
) = when (tagName) {
    "a" -> {
        val href = extractHref(wholeTag)
        when {
            href == null -> {
                builder.appendText(tagContent)
                exitTagCloseIndex
            }
            href.startsWith("https://matrix.to/#/@") -> {
                val userId = UserId(href.substringAfter("https://matrix.to/#/"))
                // The label always carries exactly one leading '@', whether or not the content had one.
                builder.appendPerson(userId, "@${tagContent.removePrefix("@")}")
                ignoreMatrixColonMentionSuffix(input, exitTagCloseIndex)
            }
            else -> {
                builder.appendLink(href, label = tagContent)
                exitTagCloseIndex
            }
        }
    }
    "b", "strong" -> {
        builder.appendBold(tagContent)
        exitTagCloseIndex
    }
    "i", "em" -> {
        builder.appendItalic(tagContent)
        exitTagCloseIndex
    }
    "p" -> {
        builder.appendText(tagContent)
        builder.appendNewline()
        builder.appendNewline()
        exitTagCloseIndex
    }
    else -> {
        builder.appendText(tagContent)
        exitTagCloseIndex
    }
}

/**
 * Returns the value of the first `href=` attribute in [wholeTag], or null when there is none.
 *
 * Quoted values (single or double quotes) end at the matching quote, unquoted values end at the
 * first whitespace or `>`, so attributes that follow the href never become part of the url.
 */
private fun extractHref(wholeTag: String): String? {
    val marker = "href="
    val markerIndex = wholeTag.indexOf(marker)
    if (markerIndex == -1) return null

    val rawValue = wholeTag.substring(markerIndex + marker.length)
    val quote = rawValue.firstOrNull()?.takeIf { it == '"' || it == '\'' }
    if (quote == null) {
        return rawValue.takeWhile { !it.isWhitespace() && it != '>' }
    }

    val unquoted = rawValue.drop(1)
    val closingQuoteIndex = unquoted.indexOf(quote)
    // An unterminated quote falls back to the rest of the tag minus its closing bracket.
    return if (closingQuoteIndex == -1) unquoted.removeSuffix(">") else unquoted.substring(0, closingQuoteIndex)
}
/**
 * Returns the index to resume parsing from after a mention: when the character at
 * [exitTagCloseIndex] in [input] is a ':' the index is advanced with `next()` so the colon
 * is skipped, otherwise [exitTagCloseIndex] is returned unchanged.
 */
private fun ignoreMatrixColonMentionSuffix(input: String, exitTagCloseIndex: Int) =
    when (input.getOrNull(exitTagCloseIndex)) {
        ':' -> exitTagCloseIndex.next()
        else -> exitTagCloseIndex
    }
private fun appendTextBeforeTag(searchIndex: Int, tagOpen: Int, builder: PartBuilder, input: String) {
if (searchIndex != tagOpen) {
builder.appendText(input.substring(searchIndex, tagOpen))
@ -115,4 +133,8 @@ internal class HtmlParser {
}
}
/**
 * Returns the index of the first '<' in [intput] at or after [startingFrom],
 * or -1 when there is none.
 */
fun test(startingFrom: Int, intput: String): Int = intput.indexOf('<', startIndex = startingFrom)
}

View File

@ -54,3 +54,8 @@ internal fun PartBuilder.appendTextBeforeTag(previousIndex: Int, tagOpenIndex: I
this.appendText(input.substring(previousIndex, tagOpenIndex))
}
}
/** Appends a single line break ("\n") to this builder as text. */
internal fun PartBuilder.appendNewline() {
    appendText("\n")
}

View File

@ -17,15 +17,27 @@ class RichMessageParser {
val builder = PartBuilder()
var nextIndex = 0
while (nextIndex != END_SEARCH) {
val htmlResult = htmlParser.parseHtmlTags(input, nextIndex, builder)
val linkStartIndex = findUrlStartIndex(htmlResult, nextIndex)
val urlResult = urlParser.parseUrl(input, linkStartIndex, builder)
val htmlStart = htmlParser.test(nextIndex, input)
val urlStart = urlParser.test(nextIndex, input)
val hasReachedEnd = hasReachedEnd(htmlResult, urlResult, input)
if (hasReachedEnd && hasUnprocessedText(htmlResult, urlResult, input)) {
val firstResult = if (htmlStart < urlStart) {
htmlParser.parseHtmlTags(input, nextIndex, builder)
} else {
urlParser.parseUrl(input, nextIndex, builder)
}
val secondStartIndex = findUrlStartIndex(firstResult, nextIndex)
val secondResult = if (htmlStart < urlStart) {
urlParser.parseUrl(input, secondStartIndex, builder)
} else {
htmlParser.parseHtmlTags(input, secondStartIndex, builder)
}
val hasReachedEnd = hasReachedEnd(firstResult, secondResult, input)
if (hasReachedEnd && hasUnprocessedText(firstResult, secondResult, input)) {
builder.appendText(input.substring(nextIndex))
}
nextIndex = if (hasReachedEnd) END_SEARCH else max(htmlResult, urlResult)
nextIndex = if (hasReachedEnd) END_SEARCH else max(firstResult, secondResult)
}
return RichText(builder.build())
}

View File

@ -1,16 +1,25 @@
package app.dapk.st.matrix.sync.internal.sync.message
private const val INVALID_TRAILING_CHARS = ",.:;?"
private const val INVALID_TRAILING_CHARS = ",.:;?<>"
internal class UrlParser {
/**
 * Returns true when [value] occurs in this string starting exactly at index [current].
 *
 * A match that ends at the very end of the string counts, and an index outside the string
 * simply yields false instead of throwing.
 */
private fun String.hasLookAhead(current: Int, value: String): Boolean {
    return startsWith(value, startIndex = current)
}
fun parseUrl(input: String, linkStartIndex: Int, builder: PartBuilder): Int {
val urlIndex = input.indexOf("http", startIndex = linkStartIndex)
val urlResult = if (urlIndex == END_SEARCH) END_SEARCH else {
return if (urlIndex == END_SEARCH) END_SEARCH else {
builder.appendTextBeforeTag(linkStartIndex, urlIndex, input)
val originalUrl = input.substring(urlIndex)
val urlEndIndex = originalUrl.indexOfFirst { it == '\n' || it == ' ' }
var index = 0
val maybeUrl = originalUrl.takeWhile {
it != '\n' && it != ' ' && !originalUrl.hasLookAhead(index++, "<br")
}
val urlEndIndex = maybeUrl.length + urlIndex
val urlContinuesUntilEnd = urlEndIndex == -1
when {
@ -31,7 +40,10 @@ internal class UrlParser {
}
}
}
return urlResult
}
/**
 * Returns the index of the first occurrence of "http" in [input] at or after [startingFrom],
 * or -1 when there is none.
 */
fun test(startingFrom: Int, input: String): Int = input.indexOf("http", startIndex = startingFrom)
}

View File

@ -19,9 +19,9 @@ class RichMessageParserTest {
)
@Test
fun `skips p tags`() = runParserTest(
input = "Hello world! <p>foo bar</p> after paragraph",
expected = RichText(setOf(Normal("Hello world! foo bar after paragraph")))
fun `parses p tags`() = runParserTest(
input = "<p>Hello world!</p><p>foo bar</p>after paragraph",
expected = RichText(setOf(Normal("Hello world!\n\nfoo bar\n\nafter paragraph")))
)
@Test
@ -84,6 +84,10 @@ class RichMessageParserTest {
input = "ending sentence with url https://google.com.",
expected = RichText(setOf(Normal("ending sentence with url "), Link("https://google.com", "https://google.com"), Normal(".")))
),
Case(
input = "https://google.com<br>html after url",
expected = RichText(setOf(Link("https://google.com", "https://google.com"), Normal("\nhtml after url")))
),
)
@Test

View File

@ -111,7 +111,7 @@ internal class RoomEventCreatorTest {
result shouldBeEqualTo aMatrixRoomMessageEvent(
eventId = editEvent.id,
utcTimestamp = editEvent.utcTimestamp,
content = RichText.of(editEvent.asTextContent().body!!),
content = RichText.of(editEvent.asTextContent().body!!.trimStart()),
author = A_SENDER,
edited = true
)