fix garbled URL preview for non-UTF-8 HTML

2024-03-24 23:42:54 +09:00 · 2024-03-24 23:42:54 +09:00 · 3434c31487
commit 3434c31487
--- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
+++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
@@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext
 import okhttp3.MediaType
 import okhttp3.MediaType.Companion.toMediaType
 import okhttp3.Request
+import okhttp3.ResponseBody
+import okio.BufferedSource
+import okio.ByteString.Companion.decodeHex
+import okio.Options
 import org.jsoup.Jsoup
 import org.jsoup.nodes.Document
+import java.io.ByteArrayInputStream
+import java.io.IOException
+import java.nio.charset.Charset

 private const val ELEMENT_TAG_META = "meta"
+private const val ATTRIBUTE_VALUE_CHARSET = "charset"
+private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
 private const val ATTRIBUTE_VALUE_PROPERTY = "property"
 private const val ATTRIBUTE_VALUE_NAME = "name"
 private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
@@ -99,10 +108,8 @@ suspend fun getDocument(
                        ?: throw IllegalArgumentException(
                            "Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
                        )
-
                if (mimeType.type == "text" && mimeType.subtype == "html") {
-                    val document = Jsoup.parse(it.body.string())
-                    parseHtml(url, document, mimeType)
+                    parseHtml(url, it.body, mimeType)
                } else if (mimeType.type == "image") {
                    UrlInfoItem(url, image = url, mimeType = mimeType)
                } else if (mimeType.type == "video") {
@@ -120,65 +127,147 @@ suspend fun getDocument(

 suspend fun parseHtml(
    url: String,
-    document: Document,
+    body: ResponseBody,
    type: MediaType,
 ): UrlInfoItem =
    withContext(Dispatchers.IO) {
-        val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
+        val source = body.source()

-        var title: String = ""
-        var description: String = ""
-        var image: String = ""
+        // Sniff the charset: one declared in the Content-Type header wins; otherwise look for a Unicode BOM.
+        val sniffedCharset = type.charset() ?: source.readBomAsCharset()
+        if (sniffedCharset != null) {
+            val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url)
+            return@withContext parseUrlInfo(url, doc, type)
+        }

-        metaTags.forEach {
-            when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+        // If sniffing failed, read the whole body into memory and detect the charset from its content.
+        val bodyBytes = source.readByteArray() // kept as bytes: detectCharset() and Jsoup.parse() each read them
+        val charset = detectCharset(bodyBytes, url)
+        val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url)
+        return@withContext parseUrlInfo(url, doc, type)
+    }

-            when (it.attr(ATTRIBUTE_VALUE_NAME)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+private val UNICODE_BOMS =
+    Options.of(
+        // UTF-8 — the position of each entry is the index readBomAsCharset() maps to a charset
+        "efbbbf".decodeHex(),
+        // UTF-16BE
+        "feff".decodeHex(),
+        // UTF-32LE (FF FE 00 00) — must precede UTF-16LE, whose BOM (FF FE) is a prefix of this one
+        "fffe0000".decodeHex(),
+        // UTF-16LE
+        "fffe".decodeHex(),
+        // UTF-32BE (00 00 FE FF)
+        "0000feff".decodeHex(),
+    )

-            when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
-                in META_X_TITLE ->
-                    if (title.isEmpty()) {
-                        title = it.attr(CONTENT)
-                    }
-                in META_X_DESCRIPTION ->
-                    if (description.isEmpty()) {
-                        description = it.attr(CONTENT)
-                    }
-                in META_X_IMAGE ->
-                    if (image.isEmpty()) {
-                        image = it.attr(CONTENT)
-                    }
-            }
+@Throws(IOException::class)
+private fun BufferedSource.readBomAsCharset(): Charset? {
+    return when (select(UNICODE_BOMS)) { // select() consumes the matched BOM and returns its position in UNICODE_BOMS
+        0 -> Charsets.UTF_8
+        1 -> Charsets.UTF_16BE
+        2 -> Charsets.UTF_32LE
+        3 -> Charsets.UTF_16LE
+        4 -> Charsets.UTF_32BE
+        -1 -> null // no BOM at the start of the source; nothing was consumed
+        else -> throw AssertionError()
+    }
+}

-            if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
-                return@withContext UrlInfoItem(url, title, description, image, type)
+private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*["']?([^\s;"']+)""", RegexOption.IGNORE_CASE)
+
+private fun detectCharset(
+    bodyBytes: ByteArray,
+    url: String,
+): Charset {
+    // Decode tentatively as UTF-8 and return the first supported charset declared by a <meta> tag.
+    val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)
+
+    tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta ->
+        val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET).trim() // <meta charset="...">
+        if (charsetAttr.isNotEmpty()) {
+            runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let {
+                return it
            }
        }
-        return@withContext UrlInfoItem(url, title, description, image, type)
+        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
+            RE_CONTENT_TYPE_CHARSET.find(meta.attr(CONTENT)) // <meta http-equiv="Content-Type" content="...; charset=...">
+                ?.let {
+                    runCatching { Charset.forName(it.groupValues[1]) }.getOrNull() // unknown or illegal names are skipped
+                }?.let {
+                    return it
+                }
+        }
    }
+    return Charsets.UTF_8 // no usable declaration found: fall back to UTF-8
+}
+
+private fun parseUrlInfo(
+    url: String,
+    document: Document,
+    type: MediaType,
+): UrlInfoItem {
+    val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
+    // Preview fields read from the <meta> tags; each keeps the first non-empty content value found for it.
+    var title: String = ""
+    var description: String = ""
+    var image: String = ""
+    // Every tag is matched through its property, name and itemprop attributes, in that order.
+    metaTags.forEach {
+        when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+
+        when (it.attr(ATTRIBUTE_VALUE_NAME)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+
+        when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
+            in META_X_TITLE ->
+                if (title.isEmpty()) {
+                    title = it.attr(CONTENT)
+                }
+
+            in META_X_DESCRIPTION ->
+                if (description.isEmpty()) {
+                    description = it.attr(CONTENT)
+                }
+
+            in META_X_IMAGE ->
+                if (image.isEmpty()) {
+                    image = it.attr(CONTENT)
+                }
+        }
+        // Stop scanning as soon as all three fields are filled.
+        if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
+            return UrlInfoItem(url, title, description, image, type)
+        }
+    }
+    return UrlInfoItem(url, title, description, image, type)
+}