diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index 392082b14..e01054201 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -22,6 +22,7 @@ package com.vitorpamplona.amethyst.service.previews import com.vitorpamplona.amethyst.service.HttpClientManager import com.vitorpamplona.amethyst.service.checkNotInMainThread +import kotlinx.collections.immutable.toImmutableMap import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import okhttp3.MediaType @@ -34,15 +35,14 @@ import okio.Options import org.jsoup.Jsoup import org.jsoup.nodes.Document import java.io.ByteArrayInputStream -import java.io.IOException import java.nio.charset.Charset private const val ELEMENT_TAG_META = "meta" -private const val ATTRIBUTE_VALUE_CHARSET = "charset" -private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" private const val ATTRIBUTE_VALUE_PROPERTY = "property" private const val ATTRIBUTE_VALUE_NAME = "name" private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop" +private const val ATTRIBUTE_VALUE_CHARSET = "charset" +private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" // for Charsets.UTF_8 @@ -174,16 +174,13 @@ private fun BufferedSource.readBomAsCharset(): Charset? 
{
     }
 }
 
 private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""")
 
-private fun detectCharset(
-    bodyBytes: ByteArray,
-    url: String,
-): Charset {
-    // tentatively decode response body as UTF-8
-    val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)
-
-    tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta ->
+private fun detectCharset(bodyBytes: ByteArray): Charset {
+    // scan at most the first 1024 bytes for meta tags; bodies shorter than that are scanned whole
+    val firstPart = String(bodyBytes, 0, minOf(bodyBytes.size, 1024), Charset.forName("utf-8"))
+    val metaTags = MetaTagsParser.parse(firstPart)
+    metaTags.forEach { meta ->
         val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
         if (charsetAttr.isNotEmpty()) {
             runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let {
@@ -199,6 +196,7 @@ private fun detectCharset(
             }
         }
     }
+    // defaults to UTF-8
     return Charset.forName("utf-8")
 }
 
@@ -271,3 +269,159 @@ private fun parseUrlInfo(
     }
     return UrlInfoItem(url, title, description, image, type)
 }
+
+/** Attributes of one `<meta>` element; lookups are by lower-cased attribute name. */
+private class MetaTag(private val attrs: Map<String, String>) {
+    /** Returns the value stored for [name] (lower-cased before lookup), or "" when there is none. */
+    fun attr(name: String): String = attrs[name.lowercase()].orEmpty()
+}
+
+// map of HTML element attribute name to its value, with some guarantees:
+// - attribute names are compared in a case-insensitive manner
+// - attribute names never duplicate
+private class Attrs {
+    private val attrs = mutableMapOf<String, String>()
+
+    /** Adds one pair; throws [IllegalArgumentException] when its lower-cased name is already present. */
+    fun add(attr: Pair<String, String>) {
+        val name = attr.first.lowercase()
+        require(name !in attrs) { "duplicated attribute name: $name" }
+        attrs[name] = attr.second
+    }
+
+    /** Returns the attributes added so far as an immutable map. */
+    fun freeze(): Map<String, String> = attrs.toImmutableMap()
+}
+
+// parser for parsing a partial HTML document into meta tags
+private object MetaTagsParser {
+    // NOTE(review): this pattern is a reconstruction. The literal in the reviewed patch text was empty, which
+    // cannot provide the capture group read by parse(); confirm it against the upstream commit.
+    // Group 1 is the attribute text between "<meta" and the closing ">" (a trailing "/" is left out).
+    private val RE_META =
+        Regex("""<meta\s+(.*?)\s*/?>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
+
+    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
+    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
+
+    private const val INVALID_ATTRS = "meta has invalid attributes part"
+
+    /**
+     * Lazily yields one [MetaTag] per `<meta ...>` match in [input].
+     *
+     * A tag whose attribute text is rejected by [parseAttrs] is skipped. The failure has to be caught per
+     * element: the sequence is only evaluated while the caller iterates it, after this function returned.
+     */
+    fun parse(input: String): Sequence<MetaTag> =
+        RE_META.findAll(input).mapNotNull { match ->
+            runCatching { MetaTag(parseAttrs(match.groupValues[1])) }.getOrNull()
+        }
+
+    private enum class State {
+        NAME,
+        BEFORE_EQ,
+        AFTER_EQ,
+        VALUE,
+        SPACE,
+    }
+
+    /**
+     * Scans the attribute text of one tag with a small state machine.
+     *
+     * Values may be double-quoted, single-quoted or unquoted. A quoted value that is never closed and a
+     * trailing name without a value are dropped without error.
+     *
+     * @throws IllegalArgumentException if a name contains a forbidden character, a name is followed by
+     * whitespace and then something other than '=', an unquoted value contains a forbidden character, or
+     * an attribute name occurs twice
+     */
+    private fun parseAttrs(input: String): Map<String, String> {
+        val attrs = Attrs()
+
+        var state = State.NAME
+        var nameBegin = 0
+        var nameEnd = 0
+        var valueBegin = 0
+        var valueQuote: Char? = null
+
+        for ((i, c) in input.withIndex()) {
+            when (state) {
+                State.NAME -> {
+                    when {
+                        c == '=' -> {
+                            nameEnd = i
+                            state = State.AFTER_EQ
+                        }
+
+                        c.isWhitespace() -> {
+                            nameEnd = i
+                            state = State.BEFORE_EQ
+                        }
+
+                        c in NON_ATTR_NAME_CHARS || c.isISOControl() || !c.isDefined() -> {
+                            throw IllegalArgumentException(INVALID_ATTRS)
+                        }
+                    }
+                }
+
+                State.BEFORE_EQ -> {
+                    when {
+                        c == '=' -> state = State.AFTER_EQ
+                        c.isWhitespace() -> {}
+                        else -> throw IllegalArgumentException(INVALID_ATTRS)
+                    }
+                }
+
+                State.AFTER_EQ -> {
+                    when {
+                        c.isWhitespace() -> {}
+
+                        c == '\'' || c == '"' -> {
+                            valueBegin = i + 1
+                            valueQuote = c
+                            state = State.VALUE
+                        }
+
+                        else -> {
+                            valueBegin = i
+                            valueQuote = null
+                            state = State.VALUE
+                        }
+                    }
+                }
+
+                State.VALUE -> {
+                    val quote = valueQuote
+                    // exclusive end index of the value once it is complete, null while it is still open
+                    val valueEnd =
+                        when {
+                            quote != null -> if (c == quote) i else null
+                            c.isWhitespace() -> i
+                            // the final character of the input is part of an unquoted value
+                            i == input.lastIndex -> i + 1
+                            c in NON_UNQUOTED_ATTR_VALUE_CHARS -> throw IllegalArgumentException(INVALID_ATTRS)
+                            else -> null
+                        }
+                    if (valueEnd != null) {
+                        attrs.add(Pair(input.substring(nameBegin, nameEnd), input.substring(valueBegin, valueEnd)))
+                        state = State.SPACE
+                    }
+                }
+
+                State.SPACE -> {
+                    if (!c.isWhitespace()) {
+                        nameBegin = i
+                        state = State.NAME
+                    }
+                }
+            }
+        }
+
+        // An unquoted value that starts on the very last character was consumed by AFTER_EQ and never
+        // reaches the VALUE branch, so it is completed here instead of being lost.
+        if (state == State.VALUE && valueQuote == null) {
+            attrs.add(Pair(input.substring(nameBegin, nameEnd), input.substring(valueBegin)))
+        }
+        return attrs.freeze()
+    }
+}