From bda5896413b335272c23c6cdd3f54b840085a42e Mon Sep 17 00:00:00 2001 From: Shinokuni Date: Sat, 22 Jul 2023 19:44:44 +0200 Subject: [PATCH] Rewrite HtmlParser in Kotlin with some tests --- .../java/com/readrops/api/utils/HtmlParser.kt | 75 +++ .../com/readrops/api/utils/HtmlParserTest.kt | 89 +++ api/src/test/resources/utils/file.html | 601 ++++++++++++++++++ .../resources/utils/file_without_head.html | 593 +++++++++++++++++ 4 files changed, 1358 insertions(+) create mode 100644 api/src/main/java/com/readrops/api/utils/HtmlParser.kt create mode 100644 api/src/test/java/com/readrops/api/utils/HtmlParserTest.kt create mode 100644 api/src/test/resources/utils/file.html create mode 100644 api/src/test/resources/utils/file_without_head.html diff --git a/api/src/main/java/com/readrops/api/utils/HtmlParser.kt b/api/src/main/java/com/readrops/api/utils/HtmlParser.kt new file mode 100644 index 00000000..5e0e3201 --- /dev/null +++ b/api/src/main/java/com/readrops/api/utils/HtmlParser.kt @@ -0,0 +1,75 @@ +package com.readrops.api.utils + +import android.nfc.FormatException +import com.readrops.api.localfeed.LocalRSSHelper +import okhttp3.OkHttpClient +import okhttp3.Request +import org.jsoup.Jsoup +import org.jsoup.nodes.Document + +data class ParsingResult( + val url: String, + val label: String?, +) + +object HtmlParser { + + suspend fun getFeedLink(url: String, client: OkHttpClient): List { + val results = mutableListOf() + + val document = getHTMLHeadFromUrl(url, client) + val elements = document.select("link") + + for (element in elements) { + val type = element.attributes()["type"] + + if (LocalRSSHelper.isRSSType(type)) { + results += ParsingResult( + url = element.absUrl("href"), + label = element.attributes()["title"] + ) + } + } + + return results + } + + private fun getHTMLHeadFromUrl(url: String, client: OkHttpClient): Document { + client.newCall(Request.Builder().url(url).build()).execute().use { response -> + if (response.header(ApiUtils.CONTENT_TYPE_HEADER)!!.contains(ApiUtils.HTML_CONTENT_TYPE) + ) { + val body = response.body!!.source() + + val stringBuilder = StringBuilder() + var collectionStarted = false + + while (!body.exhausted()) { + val currentLine = body.readUtf8LineStrict() + + when { + currentLine.contains("") -> { + stringBuilder.append(currentLine) + collectionStarted = true + } + currentLine.contains("") -> { + stringBuilder.append(currentLine) + break + } + collectionStarted -> { + stringBuilder.append(currentLine) + } + } + } + + if (!stringBuilder.contains("") || !stringBuilder.contains("")) + throw Exception("Failed to get HTML head") + + body.close() + return Jsoup.parse(stringBuilder.toString(), url) + } else { + throw FormatException("The response is not a html file") + } + } + } + +} \ No newline at end of file diff --git a/api/src/test/java/com/readrops/api/utils/HtmlParserTest.kt b/api/src/test/java/com/readrops/api/utils/HtmlParserTest.kt new file mode 100644 index 00000000..b4eadaff --- /dev/null +++ b/api/src/test/java/com/readrops/api/utils/HtmlParserTest.kt @@ -0,0 +1,89 @@ +package com.readrops.api.utils + +import android.nfc.FormatException +import com.readrops.api.TestUtils +import kotlinx.coroutines.runBlocking +import okhttp3.OkHttpClient +import okhttp3.mockwebserver.MockResponse +import okhttp3.mockwebserver.MockWebServer +import okio.Buffer +import org.junit.Rule +import org.junit.Test +import org.koin.dsl.module +import org.koin.test.KoinTest +import org.koin.test.KoinTestRule +import java.net.HttpURLConnection +import java.util.concurrent.TimeUnit +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +class HtmlParserTest : KoinTest { + + private val mockServer = MockWebServer() + + @get:Rule + val koinTestRule = KoinTestRule.create { + modules(module { + single { + OkHttpClient.Builder() + .callTimeout(1, TimeUnit.MINUTES) + .readTimeout(1, TimeUnit.HOURS) + .build() + } + }) + } + + @Test + fun before() { + mockServer.start() + } + + @Test + fun after() { + mockServer.shutdown() + } + + @Test + fun getFeedLinkTest() { + val stream = TestUtils.loadResource("utils/file.html") + + mockServer.enqueue( + MockResponse().setResponseCode(HttpURLConnection.HTTP_OK) + .addHeader(ApiUtils.CONTENT_TYPE_HEADER, ApiUtils.HTML_CONTENT_TYPE) + .setBody(Buffer().readFrom(stream)) + ) + + runBlocking { + val result = + HtmlParser.getFeedLink(mockServer.url("/rss").toString(), koinTestRule.koin.get()) + + assertTrue { result.size == 1 } + assertTrue { result.first().url.endsWith("/rss") } + assertEquals("RSS", result.first().label) + + } + } + + @Test(expected = Exception::class) + fun getFeedLinkWithoutHeadTest() { + val stream = TestUtils.loadResource("utils/file_without_head.html") + + mockServer.enqueue( + MockResponse().setResponseCode(HttpURLConnection.HTTP_OK) + .addHeader(ApiUtils.CONTENT_TYPE_HEADER, ApiUtils.HTML_CONTENT_TYPE) + .setBody(Buffer().readFrom(stream)) + ) + + runBlocking { HtmlParser.getFeedLink(mockServer.url("/rss").toString(), koinTestRule.koin.get()) } + } + + @Test(expected = FormatException::class) + fun getFeedLinkNoHtmlFileTest() { + mockServer.enqueue( + MockResponse().setResponseCode(HttpURLConnection.HTTP_OK) + .addHeader(ApiUtils.CONTENT_TYPE_HEADER, "application/rss+xml")) + + + runBlocking { HtmlParser.getFeedLink(mockServer.url("/rss").toString(), koinTestRule.koin.get()) } + } +} \ No newline at end of file diff --git a/api/src/test/resources/utils/file.html b/api/src/test/resources/utils/file.html new file mode 100644 index 00000000..d55ef435 --- /dev/null +++ b/api/src/test/resources/utils/file.html @@ -0,0 +1,601 @@ + + + + + + + + Hacker News + + +
+ + + + + + + + + + + +
+ + + + + + +
Hacker News + new | past | comments | ask | show | jobs | submit + + login + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1.A Brief History of Computers (lesswrong.com)
+ 31 points by zdw 1 hour ago | hide | 3 comments +
2.Consumer Software Is Expected to Be Next Fast-Growing Segment (1994) (csmonitor.com)
+ 9 points by 1970-01-01 1 hour ago | hide | 1 comment +
3.MSX-DOS (wikipedia.org)
+ 82 points by pavlov 6 hours ago | hide | 26 comments +
4.New Yorkers Got Broken Promises. Developers Got 20M Sq. Ft (nytimes.com)
+ 12 points by asnyder 20 minutes ago | hide | 1 comment +
5.Apple's interactive television box: Hacking the set top box System 7.1 in ROM (oldvcr.blogspot.com)
+ 160 points by todsacerdoti 10 hours ago | hide | 20 comments +
6.Putting the “You” in CPU (cpu.land)
+ 187 points by uneekname 10 hours ago | hide | 73 comments +
7.Botulinum toxin: Bioweapon and magic drug (nih.gov)
+ 12 points by redbell 2 hours ago | hide | 10 comments +
8.Octos – HTML live wallpaper engine (github.com/underpig1)
+ 85 points by underpig1 6 hours ago | hide | 23 comments +
9.More than you've ever wanted to know about errors in Rust (shuttle.rs)
+ 13 points by asymmetric 2 hours ago | hide | 3 comments +
10.Embrace Complexity; Tighten Your Feedback Loops (ferd.ca)
+ 27 points by lutzh 4 hours ago | hide | 1 comment +
11.AWS networking concepts in a diagram (miparnisariblog.wordpress.com)
+ 171 points by mparnisari 10 hours ago | hide | 66 comments +
12.Plane – Open-source Jira alternative (plane.so)
+ 240 points by prhrb 7 hours ago | hide | 93 comments +
13.Neurotechnology: Current Developments and Ethical Issues (frontiersin.org)
+ 28 points by Quinzel 3 hours ago | hide | 15 comments +
14.What we talk about when we talk about System Design (maheshba.bitbucket.io)
+ 166 points by scv119 11 hours ago | hide | 22 comments +
15.ElKaWe – Electrocaloric heat pumps (fraunhofer.de)
+ 140 points by danans 10 hours ago | hide | 73 comments +
16.Over-grazing and desertification in the Syrian steppe root causes of war (2015) (theecologist.org)
+ 64 points by joveian 6 hours ago | hide | 43 comments +
17.Redmine – open-source project management (redmine.org)
+ 34 points by synergy20 2 hours ago | hide | 24 comments +
18.Google tries internet air-gap for some staff PCs (theregister.com)
+ 67 points by beardyw 9 hours ago | hide | 73 comments +
19.I thought I wanted to be a professor, then I served on a hiring committee (2021) (science.org)
+ 104 points by ykonstant 4 hours ago | hide | 72 comments +
20.Internet search tips (gwern.net)
+ 161 points by herbertl 12 hours ago | hide | 58 comments +
21.Bayesian methods to provide probablistic solution for the Drake equation (2019) (sciencedirect.com)
+ 22 points by benbreen 4 hours ago | hide | 18 comments +
22.Biotumen: Bitumen Reinvented (biofabrik.com)
+ 40 points by patall 7 hours ago | hide | 11 comments +
23.Why even let users set their own passwords? (devever.net)
+ 103 points by hlandau 2 hours ago | hide | 121 comments +
24.Confronting failure as a core life skill (buildinghealthier.substack.com)
+ 168 points by blh75 15 hours ago | hide | 75 comments +
25.Hokusai’s Illustrated Warrior Vanguard of Japan and China (1836) (publicdomainreview.org)
+ 19 points by tintinnabula 2 hours ago | hide | discuss +
26.Bun v0.7.0 (bun.sh)
+ 163 points by sshroot 9 hours ago | hide | 107 comments +
27.Simpson Fan Grows Tomacco (2003) (simpsonsarchive.com)
+ 81 points by pipeline_peak 6 hours ago | hide | 55 comments +
28.Discovery: Metals can heal themselves (sandia.gov)
+ 77 points by bobvanluijt 13 hours ago | hide | 24 comments +
29.Pressure and vacuum marination does not work (2016) (genuineideas.com)
+ 87 points by OJFord 13 hours ago | hide | 57 comments +
30.Scientists: Fishing boats compete with whales and penguins for Antarctic krill (mongabay.com)
+ 5 points by PaulHoule 1 hour ago | hide | discuss +
+
+ + + + + +
+
+
+ Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

+
Search:
+
+
+
+ + + + diff --git a/api/src/test/resources/utils/file_without_head.html b/api/src/test/resources/utils/file_without_head.html new file mode 100644 index 00000000..3606ac07 --- /dev/null +++ b/api/src/test/resources/utils/file_without_head.html @@ -0,0 +1,593 @@ + + +
+ + + + + + + + + + + +
+ + + + + + +
Hacker News + new | past | comments | ask | show | jobs | submit + + login + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1.A Brief History of Computers (lesswrong.com)
+ 31 points by zdw 1 hour ago | hide | 3 comments +
2.Consumer Software Is Expected to Be Next Fast-Growing Segment (1994) (csmonitor.com)
+ 9 points by 1970-01-01 1 hour ago | hide | 1 comment +
3.MSX-DOS (wikipedia.org)
+ 82 points by pavlov 6 hours ago | hide | 26 comments +
4.New Yorkers Got Broken Promises. Developers Got 20M Sq. Ft (nytimes.com)
+ 12 points by asnyder 20 minutes ago | hide | 1 comment +
5.Apple's interactive television box: Hacking the set top box System 7.1 in ROM (oldvcr.blogspot.com)
+ 160 points by todsacerdoti 10 hours ago | hide | 20 comments +
6.Putting the “You” in CPU (cpu.land)
+ 187 points by uneekname 10 hours ago | hide | 73 comments +
7.Botulinum toxin: Bioweapon and magic drug (nih.gov)
+ 12 points by redbell 2 hours ago | hide | 10 comments +
8.Octos – HTML live wallpaper engine (github.com/underpig1)
+ 85 points by underpig1 6 hours ago | hide | 23 comments +
9.More than you've ever wanted to know about errors in Rust (shuttle.rs)
+ 13 points by asymmetric 2 hours ago | hide | 3 comments +
10.Embrace Complexity; Tighten Your Feedback Loops (ferd.ca)
+ 27 points by lutzh 4 hours ago | hide | 1 comment +
11.AWS networking concepts in a diagram (miparnisariblog.wordpress.com)
+ 171 points by mparnisari 10 hours ago | hide | 66 comments +
12.Plane – Open-source Jira alternative (plane.so)
+ 240 points by prhrb 7 hours ago | hide | 93 comments +
13.Neurotechnology: Current Developments and Ethical Issues (frontiersin.org)
+ 28 points by Quinzel 3 hours ago | hide | 15 comments +
14.What we talk about when we talk about System Design (maheshba.bitbucket.io)
+ 166 points by scv119 11 hours ago | hide | 22 comments +
15.ElKaWe – Electrocaloric heat pumps (fraunhofer.de)
+ 140 points by danans 10 hours ago | hide | 73 comments +
16.Over-grazing and desertification in the Syrian steppe root causes of war (2015) (theecologist.org)
+ 64 points by joveian 6 hours ago | hide | 43 comments +
17.Redmine – open-source project management (redmine.org)
+ 34 points by synergy20 2 hours ago | hide | 24 comments +
18.Google tries internet air-gap for some staff PCs (theregister.com)
+ 67 points by beardyw 9 hours ago | hide | 73 comments +
19.I thought I wanted to be a professor, then I served on a hiring committee (2021) (science.org)
+ 104 points by ykonstant 4 hours ago | hide | 72 comments +
20.Internet search tips (gwern.net)
+ 161 points by herbertl 12 hours ago | hide | 58 comments +
21.Bayesian methods to provide probablistic solution for the Drake equation (2019) (sciencedirect.com)
+ 22 points by benbreen 4 hours ago | hide | 18 comments +
22.Biotumen: Bitumen Reinvented (biofabrik.com)
+ 40 points by patall 7 hours ago | hide | 11 comments +
23.Why even let users set their own passwords? (devever.net)
+ 103 points by hlandau 2 hours ago | hide | 121 comments +
24.Confronting failure as a core life skill (buildinghealthier.substack.com)
+ 168 points by blh75 15 hours ago | hide | 75 comments +
25.Hokusai’s Illustrated Warrior Vanguard of Japan and China (1836) (publicdomainreview.org)
+ 19 points by tintinnabula 2 hours ago | hide | discuss +
26.Bun v0.7.0 (bun.sh)
+ 163 points by sshroot 9 hours ago | hide | 107 comments +
27.Simpson Fan Grows Tomacco (2003) (simpsonsarchive.com)
+ 81 points by pipeline_peak 6 hours ago | hide | 55 comments +
28.Discovery: Metals can heal themselves (sandia.gov)
+ 77 points by bobvanluijt 13 hours ago | hide | 24 comments +
29.Pressure and vacuum marination does not work (2016) (genuineideas.com)
+ 87 points by OJFord 13 hours ago | hide | 57 comments +
30.Scientists: Fishing boats compete with whales and penguins for Antarctic krill (mongabay.com)
+ 5 points by PaulHoule 1 hour ago | hide | discuss +
+
+ + + + + +
+
+
+ Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

+
Search:
+
+
+
+ + + +