From 3434c31487e503f1fd4111d3166fc4407fc8cb07 Mon Sep 17 00:00:00 2001
From: jiftechnify
Date: Sun, 24 Mar 2024 23:42:54 +0900
Subject: [PATCH] fix garbled URL preview for non-UTF-8 HTML

parseHtml() now receives the raw ResponseBody instead of an already
decoded Document and picks the charset in this order:

1. the charset parameter of the Content-Type header,
2. a Unicode byte order mark at the start of the body,
3. a <meta charset> or <meta http-equiv="content-type"> declaration,
4. UTF-8 as the fallback.

The meta-tag extraction itself is unchanged and moves to parseUrlInfo().
---
 .../service/previews/UrlPreviewUtils.kt       | 197 +++++++++++++-----
 1 file changed, 143 insertions(+), 54 deletions(-)

diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
index ab0e1f9b4..392082b14 100644
--- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
+++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
@@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext
 import okhttp3.MediaType
 import okhttp3.MediaType.Companion.toMediaType
 import okhttp3.Request
+import okhttp3.ResponseBody
+import okio.BufferedSource
+import okio.ByteString.Companion.decodeHex
+import okio.Options
 import org.jsoup.Jsoup
 import org.jsoup.nodes.Document
+import java.io.ByteArrayInputStream
+import java.io.IOException
+import java.nio.charset.Charset
 
 private const val ELEMENT_TAG_META = "meta"
+private const val ATTRIBUTE_VALUE_CHARSET = "charset"
+private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
 private const val ATTRIBUTE_VALUE_PROPERTY = "property"
 private const val ATTRIBUTE_VALUE_NAME = "name"
 private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
@@ -99,10 +108,8 @@ suspend fun getDocument(
                 ?: throw IllegalArgumentException(
                     "Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
                 )
-
             if (mimeType.type == "text" && mimeType.subtype == "html") {
-                val document = Jsoup.parse(it.body.string())
-                parseHtml(url, document, mimeType)
+                parseHtml(url, it.body, mimeType)
             } else if (mimeType.type == "image") {
                 UrlInfoItem(url, image = url, mimeType = mimeType)
             } else if (mimeType.type == "video") {
@@ -120,65 +127,147 @@ suspend fun getDocument(
 
 suspend fun parseHtml(
     url: String,
-    document: Document,
+    body: ResponseBody,
     type: MediaType,
 ): UrlInfoItem =
     withContext(Dispatchers.IO) {
-        val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
+        val source = body.source()
 
-        var title: String = ""
-        var description: String = ""
-        var image: String = ""
+        // sniff charset from Content-Type header or BOM
+        val sniffedCharset = type.charset() ?: source.readBomAsCharset()
+        if (sniffedCharset != null) {
+            val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url)
+            return@withContext parseUrlInfo(url, doc, type)
+        }
 
-        metaTags.forEach {
-            when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+        // if sniffing failed, detect the charset from the content
+        val bodyBytes = source.readByteArray()
+        val charset = detectCharset(bodyBytes, url)
+        val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url)
+        return@withContext parseUrlInfo(url, doc, type)
+    }
 
-            when (it.attr(ATTRIBUTE_VALUE_NAME)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+private val UNICODE_BOMS =
+    Options.of(
+        // UTF-8
+        "efbbbf".decodeHex(),
+        // UTF-16BE
+        "feff".decodeHex(),
+        // UTF-32LE (listed before UTF-16LE, whose BOM is a prefix of this one)
+        "fffe0000".decodeHex(),
+        // UTF-16LE
+        "fffe".decodeHex(),
+        // UTF-32BE
+        "0000feff".decodeHex(),
+    )
 
-            when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+@Throws(IOException::class)
+private fun BufferedSource.readBomAsCharset(): Charset? {
+    return when (select(UNICODE_BOMS)) {
+        0 -> Charsets.UTF_8
+        1 -> Charsets.UTF_16BE
+        2 -> Charsets.UTF_32LE
+        3 -> Charsets.UTF_16LE
+        4 -> Charsets.UTF_32BE
+        -1 -> null
+        else -> throw AssertionError()
+    }
+}
 
-            if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
-                return@withContext UrlInfoItem(url, title, description, image, type)
+private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*["']?([^\s;"']+)""", RegexOption.IGNORE_CASE)
+
+private fun detectCharset(
+    bodyBytes: ByteArray,
+    url: String,
+): Charset {
+    // tentatively decode response body as UTF-8
+    val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)
+
+    tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta ->
+        val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
+        if (charsetAttr.isNotEmpty()) {
+            runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let {
+                return it
             }
         }
-        return@withContext UrlInfoItem(url, title, description, image, type)
+        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
+            RE_CONTENT_TYPE_CHARSET.find(meta.attr(CONTENT))
+                ?.let {
+                    runCatching { Charset.forName(it.groupValues[1]) }.getOrNull()
+                }?.let {
+                    return it
+                }
+        }
     }
+    return Charset.forName("utf-8")
+}
+
+private fun parseUrlInfo(
+    url: String,
+    document: Document,
+    type: MediaType,
+): UrlInfoItem {
+    val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
+
+    var title: String = ""
+    var description: String = ""
+    var image: String = ""
+
+    metaTags.forEach {
+        when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+
+        when (it.attr(ATTRIBUTE_VALUE_NAME)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+
+        when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+
+        if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
+            return UrlInfoItem(url, title, description, image, type)
+        }
+    }
+    return UrlInfoItem(url, title, description, image, type)
+}