infer html charset from meta tag

This commit is contained in:
刘浩远 2020-08-09 21:55:45 +08:00
parent 42e819834b
commit 5c6cbf5be0
2 changed files with 12 additions and 4 deletions

View File

@ -216,7 +216,7 @@ class Article extends React.Component<ArticleProps, ArticleState> {
try {
const result = await fetch(this.props.item.link)
if (!result || !result.ok) throw new Error()
const html = await decodeFetchResponse(result)
const html = await decodeFetchResponse(result, true)
this.setState({ fullContent: html })
} catch {
this.setState({ loaded: true, error: true, errorDescription: "MERCURY_PARSER_FAILURE" })

View File

@ -29,12 +29,20 @@ const rssParser = new Parser({
})
// Matches "charset=" (case-insensitive) and captures the label that follows as
// group 1. The label stops at the first whitespace or excluded delimiter
// character; because `"` is one of those delimiters, a quoted value such as
// charset="utf-8" produces an EMPTY capture, so callers must treat "" as
// "no charset". The regex has no `g`/`y` flag, so it keeps no lastIndex state
// between calls.
const CHARSET_RE = /charset=([^()<>@,;:\"/[\]?.=\s]*)/i
/**
 * Extracts the charset label from a Content-Type style string
 * (e.g. `text/html; charset=gbk`).
 *
 * @returns the captured label, or `undefined` when there is no `charset=`
 * parameter or the capture is empty (CHARSET_RE captures "" for quoted values).
 */
function charsetFromContentType(value: string): string | undefined {
    const match = CHARSET_RE.exec(value)
    return match && match[1] ? match[1] : undefined
}

/**
 * Creates a decoder for the given encoding label.
 *
 * @returns the decoder, or `undefined` when the label is not a supported encoding.
 */
function decoderForLabel(label: string): TextDecoder | undefined {
    try {
        return new TextDecoder(label)
    } catch {
        // The TextDecoder constructor throws a RangeError for unknown labels.
        return undefined
    }
}

/**
 * Finds the first charset declared by a `<meta>` element, in document order.
 * Both `<meta charset="...">` and
 * `<meta http-equiv="Content-Type" content="...; charset=...">` are recognised.
 */
function findMetaCharset(dom: ParentNode): string | undefined {
    const candidates = dom.querySelectorAll("meta[charset], meta[http-equiv]")
    for (let i = 0; i < candidates.length; i++) {
        const meta = candidates[i]
        const direct = (meta.getAttribute("charset") || "").trim()
        if (direct) return direct
        const pragma = (meta.getAttribute("http-equiv") || "").trim().toLowerCase()
        if (pragma === "content-type") {
            const label = charsetFromContentType(meta.getAttribute("content") || "")
            if (label) return label
        }
    }
    return undefined
}

/**
 * Reads the body of a fetch response and decodes it to text.
 *
 * The encoding is chosen in this order:
 *  1. the `charset` parameter of the `Content-Type` response header, when it
 *     names a supported encoding;
 *  2. when `isHTML` is true, the charset declared by a `<meta>` element of the
 *     document;
 *  3. UTF-8.
 *
 * Unknown or empty charset labels are ignored instead of raising, so a
 * misconfigured server or page degrades to the next option rather than
 * failing the whole request.
 *
 * Relies on `CHARSET_RE` and `domParser`, which are defined elsewhere in this
 * module (`domParser` is presumably a shared DOMParser instance — confirm).
 *
 * @param response the fetch response; its body is consumed by this call.
 * @param isHTML whether the body is an HTML document whose `<meta>` tags may
 * be inspected when the header does not settle the encoding.
 * @returns the decoded body text.
 */
export async function decodeFetchResponse(response: Response, isHTML = false): Promise<string> {
    const buffer = await response.arrayBuffer()

    const ctype = response.headers.get("content-type")
    const headerCharset = ctype ? charsetFromContentType(ctype) : undefined
    const headerDecoder = headerCharset ? decoderForLabel(headerCharset) : undefined
    if (headerDecoder) return headerDecoder.decode(buffer)

    // No usable charset in the header: start from the UTF-8 default.
    let content = new TextDecoder().decode(buffer)
    if (isHTML) {
        // The <meta> tag itself is ASCII, so it survives the provisional
        // UTF-8 decode even when the rest of the text is mis-decoded.
        const dom = domParser.parseFromString(content, "text/html")
        const metaCharset = findMetaCharset(dom)
        const metaDecoder = metaCharset ? decoderForLabel(metaCharset) : undefined
        // A document whose <meta> could be read this way is ASCII-compatible,
        // so a declared UTF-16 encoding cannot be right; keep UTF-8 in that
        // case (this mirrors the HTML encoding-sniffing rules). Re-decoding is
        // also pointless when the declared encoding is already UTF-8.
        if (
            metaDecoder &&
            metaDecoder.encoding !== "utf-8" &&
            !metaDecoder.encoding.startsWith("utf-16")
        ) {
            content = metaDecoder.decode(buffer)
        }
    }
    return content
}
export async function parseRSS(url: string) {