fix garbled URL preview for non-UTF-8 HTML

pull/818/head
jiftechnify 2024-03-24 23:42:54 +09:00
rodzic 7eefbee0e3
commit 3434c31487
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 009040DA8C4F544C
1 zmienionych plików z 143 dodań i 54 usunięć

Wyświetl plik

@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext
import okhttp3.MediaType
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.Request
import okhttp3.ResponseBody
import okio.BufferedSource
import okio.ByteString.Companion.decodeHex
import okio.Options
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import java.io.ByteArrayInputStream
import java.io.IOException
import java.nio.charset.Charset
private const val ELEMENT_TAG_META = "meta"
private const val ATTRIBUTE_VALUE_CHARSET = "charset"
private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
private const val ATTRIBUTE_VALUE_PROPERTY = "property"
private const val ATTRIBUTE_VALUE_NAME = "name"
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
@ -99,10 +108,8 @@ suspend fun getDocument(
?: throw IllegalArgumentException(
"Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
)
if (mimeType.type == "text" && mimeType.subtype == "html") {
val document = Jsoup.parse(it.body.string())
parseHtml(url, document, mimeType)
parseHtml(url, it.body, mimeType)
} else if (mimeType.type == "image") {
UrlInfoItem(url, image = url, mimeType = mimeType)
} else if (mimeType.type == "video") {
@ -120,65 +127,147 @@ suspend fun getDocument(
suspend fun parseHtml(
url: String,
document: Document,
body: ResponseBody,
type: MediaType,
): UrlInfoItem =
withContext(Dispatchers.IO) {
val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
val source = body.source()
var title: String = ""
var description: String = ""
var image: String = ""
// sniff charset from Content-Type header or BOM
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
if (sniffedCharset != null) {
val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url)
return@withContext parseUrlInfo(url, doc, type)
}
metaTags.forEach {
when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
// if sniffing was failed, detect charset from content
val bodyBytes = source.readByteArray()
val charset = detectCharset(bodyBytes, url)
val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url)
return@withContext parseUrlInfo(url, doc, type)
}
when (it.attr(ATTRIBUTE_VALUE_NAME)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
private val UNICODE_BOMS =
Options.of(
// UTF-8
"efbbbf".decodeHex(),
// UTF-16BE
"feff".decodeHex(),
// UTF-16LE
"fffe".decodeHex(),
// UTF-32BE
"0000ffff".decodeHex(),
// UTF-32LE
"ffff0000".decodeHex(),
)
when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
@Throws(IOException::class)
private fun BufferedSource.readBomAsCharset(): Charset? {
return when (select(UNICODE_BOMS)) {
0 -> Charsets.UTF_8
1 -> Charsets.UTF_16BE
2 -> Charsets.UTF_16LE
3 -> Charsets.UTF_32BE
4 -> Charsets.UTF_32LE
-1 -> null
else -> throw AssertionError()
}
}
if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
return@withContext UrlInfoItem(url, title, description, image, type)
private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""")
private fun detectCharset(
bodyBytes: ByteArray,
url: String,
): Charset {
// tentatively decode response body as UTF-8
val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)
tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta ->
val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
if (charsetAttr.isNotEmpty()) {
runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let {
return it
}
}
return@withContext UrlInfoItem(url, title, description, image, type)
if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
RE_CONTENT_TYPE_CHARSET.find(meta.attr(CONTENT))
?.let {
runCatching { Charset.forName(it.groupValues[1]) }.getOrNull()
}?.let {
return it
}
}
}
return Charset.forName("utf-8")
}
private fun parseUrlInfo(
url: String,
document: Document,
type: MediaType,
): UrlInfoItem {
val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
var title: String = ""
var description: String = ""
var image: String = ""
metaTags.forEach {
when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
when (it.attr(ATTRIBUTE_VALUE_NAME)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
return UrlInfoItem(url, title, description, image, type)
}
}
return UrlInfoItem(url, title, description, image, type)
}