fix garbled URL preview for non-UTF-8 HTML

pull/818/head
jiftechnify 2024-03-24 23:42:54 +09:00
rodzic 7eefbee0e3
commit 3434c31487
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 009040DA8C4F544C
1 zmienionych plików z 143 dodań i 54 usunięć

Wyświetl plik

@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext
import okhttp3.MediaType import okhttp3.MediaType
import okhttp3.MediaType.Companion.toMediaType import okhttp3.MediaType.Companion.toMediaType
import okhttp3.Request import okhttp3.Request
import okhttp3.ResponseBody
import okio.BufferedSource
import okio.ByteString.Companion.decodeHex
import okio.Options
import org.jsoup.Jsoup import org.jsoup.Jsoup
import org.jsoup.nodes.Document import org.jsoup.nodes.Document
import java.io.ByteArrayInputStream
import java.io.IOException
import java.nio.charset.Charset
private const val ELEMENT_TAG_META = "meta" private const val ELEMENT_TAG_META = "meta"
private const val ATTRIBUTE_VALUE_CHARSET = "charset"
private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
private const val ATTRIBUTE_VALUE_PROPERTY = "property" private const val ATTRIBUTE_VALUE_PROPERTY = "property"
private const val ATTRIBUTE_VALUE_NAME = "name" private const val ATTRIBUTE_VALUE_NAME = "name"
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop" private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
@ -99,10 +108,8 @@ suspend fun getDocument(
?: throw IllegalArgumentException( ?: throw IllegalArgumentException(
"Website returned unknown mimetype: ${it.headers.get("Content-Type")}", "Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
) )
if (mimeType.type == "text" && mimeType.subtype == "html") { if (mimeType.type == "text" && mimeType.subtype == "html") {
val document = Jsoup.parse(it.body.string()) parseHtml(url, it.body, mimeType)
parseHtml(url, document, mimeType)
} else if (mimeType.type == "image") { } else if (mimeType.type == "image") {
UrlInfoItem(url, image = url, mimeType = mimeType) UrlInfoItem(url, image = url, mimeType = mimeType)
} else if (mimeType.type == "video") { } else if (mimeType.type == "video") {
@ -120,65 +127,147 @@ suspend fun getDocument(
/**
 * Decodes an HTML response body with the most reliable charset available and
 * extracts the preview metadata from it.
 *
 * The charset is taken, in order of preference, from:
 * 1. the `charset` parameter of the Content-Type header ([type]),
 * 2. a Unicode byte order mark at the start of the body,
 * 3. a `<meta>` declaration inside the document (see [detectCharset]).
 *
 * The work is performed on [Dispatchers.IO] because reading [body] blocks.
 *
 * @param url address the body was fetched from; also used as the jsoup base URI.
 * @param body raw response body to decode.
 * @param type media type reported for the response.
 * @return the preview information collected from the document's meta tags.
 */
suspend fun parseHtml(
    url: String,
    body: ResponseBody,
    type: MediaType,
): UrlInfoItem =
    withContext(Dispatchers.IO) {
        val source = body.source()

        // Cheap checks first: Content-Type header, then a BOM at the start of the stream.
        val declaredCharset = type.charset() ?: source.readBomAsCharset()

        val document =
            if (declaredCharset != null) {
                // Charset is known up front, so the body can be parsed as a stream.
                Jsoup.parse(source.inputStream(), declaredCharset.name(), url)
            } else {
                // Nothing declared the charset: buffer the body so it can be inspected
                // for a <meta> declaration and then parsed again with the result.
                val bodyBytes = source.readByteArray()
                val detectedCharset = detectCharset(bodyBytes, url)
                Jsoup.parse(ByteArrayInputStream(bodyBytes), detectedCharset.name(), url)
            }

        parseUrlInfo(url, document, type)
    }
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
/**
 * Unicode byte order marks recognised at the start of a response body.
 *
 * The position of each entry is significant: [readBomAsCharset] maps the index
 * returned by `select` back to a charset. `select` reports the first option that
 * is a prefix of the stream, so the 4-byte UTF-32LE mark (`FF FE 00 00`) must be
 * listed before the 2-byte UTF-16LE mark (`FF FE`), which is a prefix of it.
 */
private val UNICODE_BOMS =
    Options.of(
        // 0: UTF-8
        "efbbbf".decodeHex(),
        // 1: UTF-16BE
        "feff".decodeHex(),
        // 2: UTF-32LE (must precede UTF-16LE, see above)
        "fffe0000".decodeHex(),
        // 3: UTF-16LE
        "fffe".decodeHex(),
        // 4: UTF-32BE
        "0000feff".decodeHex(),
    )

/**
 * Consumes a Unicode byte order mark from the start of this source, if present.
 *
 * @return the charset indicated by the BOM, or `null` when the source does not
 *   start with one of [UNICODE_BOMS] (in which case no bytes are consumed).
 * @throws IOException if reading from the source fails.
 */
@Throws(IOException::class)
private fun BufferedSource.readBomAsCharset(): Charset? =
    when (select(UNICODE_BOMS)) {
        0 -> Charsets.UTF_8
        1 -> Charsets.UTF_16BE
        2 -> Charsets.UTF_32LE
        3 -> Charsets.UTF_16LE
        4 -> Charsets.UTF_32BE
        -1 -> null
        else -> throw AssertionError()
    }
}
}
if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) { private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""")
return@withContext UrlInfoItem(url, title, description, image, type)
/**
 * Resolves [name] to a JVM charset.
 *
 * Surrounding whitespace and quote characters are stripped first, since values
 * lifted out of a `content="text/html; charset=..."` attribute may carry them.
 *
 * @return the charset, or `null` when [name] is blank, malformed, or not
 *   supported by this JVM.
 */
private fun charsetForNameOrNull(name: String): Charset? {
    val cleaned = name.trim { it.isWhitespace() || it == '"' || it == '\'' }
    if (cleaned.isEmpty()) return null
    return try {
        Charset.forName(cleaned)
    } catch (e: IllegalArgumentException) {
        // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
        null
    }
}

/**
 * Determines the charset of an HTML document from its own `<meta>` declarations.
 *
 * The bytes are first parsed as UTF-8 purely to locate the meta tags. Each tag is
 * then checked for `<meta charset="...">` and for
 * `<meta http-equiv="Content-Type" content="...; charset=...">`; the first
 * declaration that names a charset supported by this JVM wins.
 *
 * @param bodyBytes complete response body.
 * @param url address the body was fetched from, used as the jsoup base URI.
 * @return the declared charset, or UTF-8 when no usable declaration is found.
 */
private fun detectCharset(
    bodyBytes: ByteArray,
    url: String,
): Charset {
    // Tentatively decode the response body as UTF-8 to find the meta tags.
    val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)

    for (meta in tentativeDoc.getElementsByTag(ELEMENT_TAG_META)) {
        // <meta charset="...">
        charsetForNameOrNull(meta.attr(ATTRIBUTE_VALUE_CHARSET))?.let { return it }

        // <meta http-equiv="Content-Type" content="text/html; charset=...">
        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).equals("content-type", ignoreCase = true)) {
            // RE_CONTENT_TYPE_CHARSET is case-sensitive; lowercasing the input lets
            // "Charset=" / "CHARSET=" match too. Charset names are case-insensitive,
            // so the lookup result is unaffected.
            val declared =
                RE_CONTENT_TYPE_CHARSET
                    .find(meta.attr(CONTENT).lowercase())
                    ?.let { match -> charsetForNameOrNull(match.groupValues[1]) }
            if (declared != null) return declared
        }
    }

    return Charsets.UTF_8
}
/**
 * Collects the preview title, description and image from a document's `<meta>` tags.
 *
 * For every meta tag, the `property`, `name` and `itemprop` attributes are checked
 * (in that order) against the known title / description / image keys. A field is
 * only filled while it is still empty, so the first non-empty value found wins.
 * Scanning stops as soon as all three fields have a value.
 *
 * @param url address of the page; passed through to the result.
 * @param document parsed HTML document to inspect.
 * @param type media type of the response; passed through to the result.
 * @return the collected information; fields that were not found are empty strings.
 */
private fun parseUrlInfo(
    url: String,
    document: Document,
    type: MediaType,
): UrlInfoItem {
    var title = ""
    var description = ""
    var image = ""

    // Attributes that may carry the metadata key, in lookup order.
    val keyAttributes =
        listOf(
            ATTRIBUTE_VALUE_PROPERTY,
            ATTRIBUTE_VALUE_NAME,
            ATTRIBUTE_VALUE_ITEMPROP,
        )

    for (meta in document.getElementsByTag(ELEMENT_TAG_META)) {
        for (keyAttribute in keyAttributes) {
            when (meta.attr(keyAttribute)) {
                in META_X_TITLE ->
                    if (title.isEmpty()) {
                        title = meta.attr(CONTENT)
                    }
                in META_X_DESCRIPTION ->
                    if (description.isEmpty()) {
                        description = meta.attr(CONTENT)
                    }
                in META_X_IMAGE ->
                    if (image.isEmpty()) {
                        image = meta.attr(CONTENT)
                    }
            }
        }

        // Everything has been found; no need to look at the remaining tags.
        val complete = title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()
        if (complete) break
    }

    return UrlInfoItem(url, title, description, image, type)
}