Bubble/Threaded/Data/Content/HTMLString.swift

//Made by Lumaa

import Foundation
import SwiftSoup
import SwiftUI

/// Keys of the keyed container that `HTMLString` writes in `encode(to:)` and reads back
/// in `init(from:)`.
private enum CodingKeys: CodingKey {
    case htmlValue
    case asMarkdown
    case asRawText
    case statusesURLs
    case links
}

/// A piece of HTML together with Markdown, plain-text and attributed-string renderings of it.
///
/// Decoding accepts two shapes:
/// - a bare string, which is treated as raw HTML and converted with SwiftSoup, or
/// - the keyed form written by `encode(to:)`, whose already-converted values are reused as-is.
public struct HTMLString: Codable, Equatable, Hashable, @unchecked Sendable {
    /// The HTML exactly as it was decoded or passed in.
    public var htmlValue: String = ""
    /// Markdown built by walking the parsed HTML tree (see `handleNode(node:)`).
    public var asMarkdown: String = ""
    /// The HTML with every tag stripped; `<br>` and `<p>` are turned into line breaks.
    public var asRawText: String = ""
    /// Link targets whose last path component is an integer (presumably status permalinks).
    public var statusesURLs = [URL]()
    /// One entry per `<a>` element whose `href` could be turned into a `URL`.
    public private(set) var links = [Link]()

    /// `asMarkdown` parsed as inline Markdown, or the raw `htmlValue` when parsing fails.
    public var asSafeMarkdownAttributedString: AttributedString = .init()

    // https://daringfireball.net/projects/markdown/syntax
    // Pre-escape \ ` _ * ~ and [ as these are the only
    // characters the markdown parser uses when it renders
    // to attributed text. Note that ~ for strikethrough is
    // not documented in the syntax docs but is used by
    // AttributedString.
    //
    // The expressions are compiled once and shared. Keeping them out of the stored
    // properties also keeps them out of the synthesized `==` / `hash(into:)`, so a value
    // decoded from raw HTML compares equal to the same content decoded from the keyed form.
    private static let mainRegex = try? NSRegularExpression(pattern: "([\\*\\`\\~\\[\\\\])", options: .caseInsensitive)
    // don't escape underscores that are between colons, they are most likely custom emoji
    private static let underscoreRegex = try? NSRegularExpression(pattern: "(?!\\B:[^:]*)(_)(?![^:]*:\\B)", options: .caseInsensitive)

    /// Decodes either a bare HTML string or the keyed form produced by `encode(to:)`.
    ///
    /// This initializer never throws: when neither shape can be decoded, `htmlValue` is set
    /// to the empty string and no HTML conversion is attempted.
    public init(from decoder: Decoder) {
        var alreadyDecoded = false
        do {
            let container = try decoder.singleValueContainer()
            htmlValue = try container.decode(String.self)
        } catch {
            do {
                // Set before decoding so that a failing keyed decode also skips the HTML pass.
                alreadyDecoded = true
                let container = try decoder.container(keyedBy: CodingKeys.self)
                htmlValue = try container.decode(String.self, forKey: .htmlValue)
                asMarkdown = try container.decode(String.self, forKey: .asMarkdown)
                asRawText = try container.decode(String.self, forKey: .asRawText)
                statusesURLs = try container.decode([URL].self, forKey: .statusesURLs)
                links = try container.decode([Link].self, forKey: .links)
            } catch {
                htmlValue = ""
            }
        }

        if !alreadyDecoded {
            asMarkdown = ""
            do {
                let document: Document = try SwiftSoup.parse(htmlValue)
                // Fills `asMarkdown`, `links` and `statusesURLs`.
                handleNode(node: document)

                // Plain-text rendering: insert explicit line breaks, then strip all tags.
                document.outputSettings(OutputSettings().prettyPrint(pretty: false))
                try document.select("br").after("\n")
                try document.select("p").after("\n\n")
                let html = try document.html()
                var text = try SwiftSoup.clean(html, "", Whitelist.none(), OutputSettings().prettyPrint(pretty: false)) ?? ""
                // Remove the two last line break added after the last paragraph.
                if text.hasSuffix("\n\n") {
                    text.removeLast(2)
                }
                asRawText = text

                if asMarkdown.hasPrefix("\n") {
                    asMarkdown.removeFirst()
                }
            } catch {
                asRawText = htmlValue
            }
        }

        asSafeMarkdownAttributedString = Self.makeAttributedString(markdown: asMarkdown, fallback: htmlValue)
    }

    /// Wraps a plain string without any HTML processing.
    ///
    /// - Parameters:
    ///   - stringValue: Used verbatim for `htmlValue`, `asMarkdown` and `asRawText`.
    ///   - parseMarkdown: When `true`, `stringValue` is parsed as inline Markdown for
    ///     `asSafeMarkdownAttributedString`; otherwise (and when parsing fails) the attributed
    ///     string contains `stringValue` unchanged.
    public init(stringValue: String, parseMarkdown: Bool = false) {
        htmlValue = stringValue
        asMarkdown = stringValue
        asRawText = stringValue
        statusesURLs = []

        asSafeMarkdownAttributedString = parseMarkdown
            ? Self.makeAttributedString(markdown: stringValue, fallback: stringValue)
            : AttributedString(stringLiteral: stringValue)
    }

    /// Encodes the keyed form (all converted values) so decoding can skip the HTML pass.
    public func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)
        try container.encode(htmlValue, forKey: .htmlValue)
        try container.encode(asMarkdown, forKey: .asMarkdown)
        try container.encode(asRawText, forKey: .asRawText)
        try container.encode(statusesURLs, forKey: .statusesURLs)
        try container.encode(links, forKey: .links)
    }

    /// Parses `markdown` as inline-only, whitespace-preserving Markdown and returns
    /// `fallback` as an unstyled attributed string when parsing throws.
    private static func makeAttributedString(markdown: String, fallback: String) -> AttributedString {
        let options = AttributedString.MarkdownParsingOptions(allowsExtendedAttributes: true,
                                                              interpretedSyntax: .inlineOnlyPreservingWhitespace)
        return (try? AttributedString(markdown: markdown, options: options))
            ?? AttributedString(stringLiteral: fallback)
    }

    /// Recursively appends the Markdown for `node` and its children to `asMarkdown`,
    /// recording links in `links` and `statusesURLs` along the way.
    ///
    /// Errors thrown while reading a node are swallowed: conversion is best-effort and the
    /// offending node simply contributes nothing.
    private mutating func handleNode(node: SwiftSoup.Node) {
        do {
            if let className = try? node.attr("class") {
                if className == "invisible" {
                    // don't display
                    return
                }

                if className == "ellipsis" {
                    // descend into this one now and
                    // append the ellipsis
                    for nn in node.getChildNodes() {
                        handleNode(node: nn)
                    }
                    asMarkdown += "…"
                    return
                }
            }

            if node.nodeName() == "p" {
                if !asMarkdown.isEmpty { // ignore first opening <p>
                    asMarkdown += "\n\n"
                }
            } else if node.nodeName() == "br" {
                if !asMarkdown.isEmpty { // ignore first opening <br>
                    asMarkdown += "\n"
                }
            } else if node.nodeName() == "a" {
                let href = try node.attr("href")
                let plainURL = URL(string: href)
                if let plainURL, Int(plainURL.lastPathComponent) != nil {
                    statusesURLs.append(plainURL)
                }

                asMarkdown += "["
                let start = asMarkdown.endIndex
                // descend into this node now so we can wrap the
                // inner part of the link in the right markup
                for nn in node.getChildNodes() {
                    handleNode(node: nn)
                }
                let finish = asMarkdown.endIndex

                var linkRef = href

                // Try creating a URL from the string. If it fails, try URL encoding
                //   the string first.
                if let linkUrl = plainURL ?? URL(string: href, encodePath: true) {
                    linkRef = linkUrl.absoluteString
                    let displayString = asMarkdown[start ..< finish]
                    links.append(Link(linkUrl, displayString: String(displayString)))
                }

                asMarkdown += "]("
                asMarkdown += linkRef
                asMarkdown += ")"

                return
            } else if node.nodeName() == "#text" {
                var txt = node.description

                if let underscoreRegex = Self.underscoreRegex, let mainRegex = Self.mainRegex {
                    //  This is the markdown escaper.
                    //  NSRegularExpression measures ranges in UTF-16 code units, so the range
                    //  must be derived from the string itself: `txt.count` (grapheme clusters)
                    //  is too short whenever the text contains emoji or other non-BMP
                    //  characters, which left the tail of the text unescaped.
                    txt = mainRegex.stringByReplacingMatches(in: txt, options: [], range: NSRange(txt.startIndex..., in: txt), withTemplate: "\\\\$1")
                    txt = underscoreRegex.stringByReplacingMatches(in: txt, options: [], range: NSRange(txt.startIndex..., in: txt), withTemplate: "\\\\$1")
                }
                // Strip newlines and line separators - they should be being sent as <br>s
                asMarkdown += txt.replacingOccurrences(of: "\n", with: "").replacingOccurrences(of: "\u{2028}", with: "")
            } else if node.nodeName() == "ul" {
                // Unordered (bulleted) list
                // SwiftUI's Text won't display these in an AttributedString, but we can at least improve the appearance
                asMarkdown += "\n\n"
                for nn in node.getChildNodes() {
                    asMarkdown += "- "
                    handleNode(node: nn)
                    asMarkdown += "\n"
                }
                return
            } else if node.nodeName() == "ol" {
                // Ordered (numbered) list
                // Same thing, won't display in a Text, but this is just an attempt to improve the appearance
                asMarkdown += "\n\n"
                var curNumber = 1
                for nn in node.getChildNodes() {
                    asMarkdown += "\(curNumber). "
                    handleNode(node: nn)
                    asMarkdown += "\n"
                    curNumber += 1
                }
                return
            }

            for n in node.getChildNodes() {
                handleNode(node: n)
            }
        } catch {}
    }

    /// A link found in the HTML, classified by the first character of its display text.
    public struct Link: Codable, Hashable, Identifiable {
        public var id: Int { hashValue }
        /// The link target.
        public let url: URL
        /// The Markdown generated for the link's content (escaping included).
        public let displayString: String
        /// `.mention` for text starting with `@`, `.hashtag` for `#`, otherwise `.url`.
        public let type: LinkType
        /// The mention as displayed, the hashtag without `#`, or the host without a leading `www.`.
        public let title: String

        init(_ url: URL, displayString: String) {
            self.url = url
            self.displayString = displayString

            switch displayString.first {
                case "@":
                    type = .mention
                    title = displayString
                case "#":
                    type = .hashtag
                    title = String(displayString.dropFirst())
                default:
                    type = .url
                    var hostNameUrl = url.host ?? url.absoluteString
                    if hostNameUrl.hasPrefix("www.") {
                        hostNameUrl = String(hostNameUrl.dropFirst(4))
                    }
                    title = hostNameUrl
            }
        }

        public enum LinkType: String, Codable {
            case url
            case mention
            case hashtag
        }
    }
}

public extension URL {
    // It's common to use non-ASCII characters in URLs even though they're technically
    //   invalid characters. Every modern browser handles this by silently encoding
    //   the invalid characters on the user's behalf. However, trying to create a URL
    //   object with un-encoded characters will result in nil so we need to encode the
    //   invalid characters before creating the URL object. The unencoded version
    //   should still be shown in the displayed status.

    /// Creates a URL from `string`, percent-encoding its path, query and fragment first.
    ///
    /// Encoding is only attempted when `encodePath` is `true`, the string starts with
    /// `http://` or `https://`, and a `/` follows the authority; otherwise the string is
    /// handed to `URL(string:)` unchanged. The scheme and authority are never encoded.
    ///
    /// - Parameters:
    ///   - string: The URL text, possibly containing characters that are invalid in a URL.
    ///   - encodePath: Whether to percent-encode before parsing.
    /// - Returns: `nil` when the (possibly encoded) string still cannot be parsed.
    init?(string: String, encodePath: Bool) {
        var encodedUrlString = string
        if encodePath,
           string.hasPrefix("http://") || string.hasPrefix("https://"),
           let authorityStart = string.range(of: "://")?.upperBound,
           let pathStart = string[authorityStart...].firstIndex(of: "/")
        {
            // Split path / query / fragment BEFORE encoding. Scanning for "/" across the
            // whole remainder would treat a "/" inside the query as a path separator and
            // percent-encode the "?" delimiter (and likewise the "#" of a fragment).
            let remainder = string[pathStart...]
            let fragmentStart = remainder.firstIndex(of: "#")
            let beforeFragment = remainder[..<(fragmentStart ?? remainder.endIndex)]
            let queryStart = beforeFragment.firstIndex(of: "?")
            let path = beforeFragment[..<(queryStart ?? beforeFragment.endIndex)]

            // We don't want to encode the host portion of the URL
            var encoded = String(string[..<pathStart])
            // "/" is part of `.urlPathAllowed`, so the whole path can be encoded in one go.
            encoded += path.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) ?? ""
            if let queryStart {
                // "?" is part of `.urlQueryAllowed`, so the delimiter survives.
                encoded += beforeFragment[queryStart...].addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed) ?? ""
            }
            if let fragmentStart {
                let fragment = remainder[remainder.index(after: fragmentStart)...]
                encoded += "#" + (fragment.addingPercentEncoding(withAllowedCharacters: .urlFragmentAllowed) ?? "")
            }
            encodedUrlString = encoded
        }
        self.init(string: encodedUrlString)
    }
}