diff --git a/app/build.gradle b/app/build.gradle index cad079522..06b822a7e 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -205,9 +205,6 @@ dependencies { // Websockets API implementation libs.okhttp - // HTML Parsing for Link Preview - implementation libs.jsoup - // Encrypted Key Storage implementation libs.androidx.security.crypto.ktx diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt new file mode 100644 index 000000000..5245e92c0 --- /dev/null +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt @@ -0,0 +1,311 @@ +/** + * Copyright (c) 2024 Vitor Pamplona + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/**
 * Attributes of a single `meta` element produced by [MetaTagsParser].
 *
 * The parser stores attribute names lower-cased; [attr] lower-cases the requested
 * name before the lookup, so lookups are case-insensitive.
 */
internal data class MetaTag(private val attrs: Map<String, String>) {
    /** Returns the value of attribute [name], or an empty string when the tag has no such attribute. */
    fun attr(name: String): String = attrs[name.lowercase()] ?: ""
}

/**
 * Parses a partial (possibly truncated) HTML document and extracts its `meta` tags.
 *
 * This is a small hand-written scanner, not a full HTML parser: it walks the tags of
 * the input in order, yields every `meta` tag whose attributes can be parsed, and stops
 * at the first `/head` closing tag. Tags with malformed or duplicated attributes are
 * skipped instead of failing the whole parse, and no input makes the scanner throw.
 */
internal object MetaTagsParser {
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')

    /**
     * Lazily scans [input] and yields one [MetaTag] per well-formed `meta` element
     * found before the end of `head` (or before the end of the input).
     */
    fun parse(input: String): Sequence<MetaTag> =
        sequence {
            val scanner = TagScanner(input)
            while (!scanner.exhausted()) {
                // null means "nothing usable at this position"; the scanner has still advanced
                val tag = scanner.nextTag() ?: continue
                if (tag.name == "/head") {
                    break
                }
                if (tag.name == "meta") {
                    val attrs = parseAttrs(tag.attrPart) ?: continue
                    yield(MetaTag(attrs))
                }
            }
        }

    /** Lower-cased tag name (prefixed with `/` for closing tags) and the raw text of its attributes. */
    private data class RawTag(val name: String, val attrPart: String)

    /** Cursor over [input] that returns one tag at a time. */
    private class TagScanner(private val input: String) {
        private var p = 0

        fun exhausted(): Boolean = p >= input.length

        // callers must check exhausted() first
        private fun peek(): Char = input[p]

        private fun consume(): Char = input[p++]

        /** Consumes the next char only if it equals [c]; safe to call at the end of the input. */
        private fun consumeChar(c: Char): Boolean {
            if (exhausted() || peek() != c) {
                return false
            }
            p++
            return true
        }

        private fun skipSpaces() {
            while (!exhausted() && peek().isWhitespace()) {
                p++
            }
        }

        private fun skipUntil(c: Char) {
            while (!exhausted() && peek() != c) {
                p++
            }
        }

        private fun readWhile(pred: (Char) -> Boolean): String {
            val begin = p
            while (!exhausted() && pred(peek())) {
                p++
            }
            return input.substring(begin, p)
        }

        /**
         * Advances to the next tag and returns it, or returns null when there is no
         * further tag, when the position holds a comment, or when the tag name is not
         * purely alphanumeric. The cursor always moves forward, so a caller looping on
         * [exhausted] terminates.
         */
        fun nextTag(): RawTag? {
            skipUntil('<')
            if (exhausted()) {
                // only text was left (e.g. the input was cut in the middle of the body)
                return null
            }
            consume()

            // Skip comments as a whole: an apostrophe or quote inside a comment must not
            // be mistaken for the start of a quoted attribute value.
            if (input.startsWith("!--", p)) {
                val end = input.indexOf("-->", p + 1)
                p = if (end < 0) input.length else end + 3
                return null
            }

            // read tag name
            val closing = consumeChar('/')
            val n = readWhile { !it.isWhitespace() && it != '>' }
            skipSpaces()

            // read until end of tag
            val attrsPart = StringBuilder()
            var quote: Char? = null
            while (!exhausted()) {
                val c = consume()
                when {
                    // `/>` out of quote -> end of tag
                    quote == null && c == '/' && !exhausted() && peek() == '>' -> {
                        consume()
                        break
                    }
                    // `>` out of quote -> end of tag
                    quote == null && c == '>' -> {
                        break
                    }
                    // entering quote
                    quote == null && (c == '\'' || c == '"') -> {
                        quote = c
                    }
                    // leaving quote
                    quote != null && c == quote -> {
                        quote = null
                    }
                }
                attrsPart.append(c)
            }

            if (!RE_TAG_NAME.matches(n)) {
                return null
            }
            val name = if (closing) "/$n" else n
            return RawTag(name.lowercase(), attrsPart.toString())
        }

        companion object {
            // compiled once instead of once per scanned tag
            private val RE_TAG_NAME = Regex("""[0-9a-zA-Z]+""")
        }
    }

    // map of HTML element attribute name to its value, with additional logics:
    // - attribute names are matched in a case-insensitive manner
    // - attribute names never duplicate
    // - commonly used character references in attribute values are resolved
    private class Attrs {
        private val attrs = mutableMapOf<String, String>()

        /**
         * Stores [rawValue] (with character references resolved) under the lower-cased [name].
         *
         * @return false, leaving the map untouched, when the name is already present.
         */
        fun add(
            name: String,
            rawValue: String,
        ): Boolean {
            val key = name.lowercase()
            if (key in attrs) {
                return false
            }
            attrs[key] = rawValue.replace(RE_CHAR_REF) { replaceCharRefs(it) }
            return true
        }

        /** Returns a read-only snapshot of the attributes collected so far. */
        fun freeze(): Map<String, String> = attrs.toMap()

        companion object {
            val RE_CHAR_REF = Regex("""&(\w+)(;?)""")
            val BASE_CHAR_REFS =
                mapOf(
                    "amp" to "&",
                    "AMP" to "&",
                    "quot" to "\"",
                    "QUOT" to "\"",
                    "lt" to "<",
                    "LT" to "<",
                    "gt" to ">",
                    "GT" to ">",
                )
            val CHAR_REFS =
                mapOf(
                    "apos" to "'",
                    "equals" to "=",
                    "grave" to "`",
                    "DiacriticalGrave" to "`",
                )

            /** Replacement for one [RE_CHAR_REF] match; unknown references are kept verbatim. */
            fun replaceCharRefs(match: MatchResult): String {
                val bcr = BASE_CHAR_REFS[match.groupValues[1]]
                if (bcr != null) {
                    return bcr
                }
                // non-base char refs must be terminated by ';'
                if (match.groupValues[2].isNotEmpty()) {
                    val cr = CHAR_REFS[match.groupValues[1]]
                    if (cr != null) {
                        return cr
                    }
                }
                return match.value
            }
        }
    }

    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }

    /**
     * Parses the attribute part of a tag (`name=value name="value" ...`).
     *
     * @return the attributes keyed by lower-cased name, or null when the text is
     * malformed: an illegal character in a name or in an unquoted value, a name that
     * is not followed by `=`, or a duplicated name.
     */
    private fun parseAttrs(input: String): Map<String, String>? {
        val attrs = Attrs()

        var state = State.NAME
        var nameBegin = 0
        var nameEnd = 0
        var valueBegin = 0
        var valueQuote: Char? = null

        for ((i, c) in input.withIndex()) {
            when (state) {
                State.NAME -> {
                    when {
                        c == '=' -> {
                            nameEnd = i
                            state = State.AFTER_EQ
                        }

                        c.isWhitespace() -> {
                            nameEnd = i
                            state = State.BEFORE_EQ
                        }

                        c in NON_ATTR_NAME_CHARS || c.isISOControl() || !c.isDefined() -> {
                            return null
                        }
                    }
                }

                State.BEFORE_EQ -> {
                    when {
                        c == '=' -> {
                            state = State.AFTER_EQ
                        }

                        c.isWhitespace() -> {}
                        else -> return null
                    }
                }

                State.AFTER_EQ -> {
                    when {
                        c.isWhitespace() -> {}
                        c == '\'' || c == '"' -> {
                            valueBegin = i + 1
                            valueQuote = c
                            state = State.VALUE
                        }

                        else -> {
                            valueBegin = i
                            valueQuote = null
                            state = State.VALUE
                        }
                    }
                }

                State.VALUE -> {
                    // exclusive end index of the value, or -1 while the value continues
                    val valueEnd =
                        when {
                            valueQuote != null -> if (c == valueQuote) i else -1
                            c.isWhitespace() -> i
                            // an unquoted value may run to the very end of the attribute text
                            i == input.lastIndex -> i + 1
                            c in NON_UNQUOTED_ATTR_VALUE_CHARS -> return null
                            else -> -1
                        }
                    if (valueEnd >= 0) {
                        val name = input.substring(nameBegin, nameEnd)
                        if (!attrs.add(name, input.substring(valueBegin, valueEnd))) {
                            return null
                        }
                        state = State.SPACE
                    }
                }

                State.SPACE -> {
                    if (!c.isWhitespace()) {
                        nameBegin = i
                        state = State.NAME
                    }
                }
            }
        }

        // An unquoted value that starts on the last character never reaches the VALUE
        // branch above, so it is emitted here instead of being dropped.
        if (state == State.VALUE && valueQuote == null) {
            val name = input.substring(nameBegin, nameEnd)
            if (!attrs.add(name, input.substring(valueBegin))) {
                return null
            }
        }
        return attrs.freeze()
    }
}
/**
 * Consumes a leading byte-order mark from this source, if one of the entries of
 * [UNICODE_BOMS] is a prefix of it, and returns the matching charset.
 *
 * The `when` indices follow the order of the entries in [UNICODE_BOMS].
 *
 * NOTE(review): the UTF-32 entries of that table (`0000ffff`, `ffff0000`) do not match
 * the Unicode byte-order marks (`0000feff`, `fffe0000`), so branches 3 and 4 look
 * unreachable for real UTF-32 input; a UTF-32LE mark would also be matched by the
 * shorter UTF-16LE entry first. Confirm and fix the table together with this mapping.
 *
 * @return the charset announced by the BOM, or null when the source does not start with one.
 */
private fun BufferedSource.readBomAsCharset(): Charset? =
    when (select(UNICODE_BOMS)) {
        0 -> Charsets.UTF_8
        1 -> Charsets.UTF_16BE
        2 -> Charsets.UTF_16LE
        3 -> Charsets.UTF_32BE
        4 -> Charsets.UTF_32LE
        -1 -> null
        else -> throw AssertionError()
    }

// parameter names are case-insensitive, so `Charset=` and `CHARSET=` are accepted as well
private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""", RegexOption.IGNORE_CASE)

/** Resolves [name] to a charset; blank, malformed or unsupported names give null. */
private fun charsetOrNull(name: String): Charset? {
    val trimmed = name.trim()
    if (trimmed.isEmpty()) {
        return null
    }
    return try {
        Charset.forName(trimmed)
    } catch (e: IllegalArgumentException) {
        // covers both IllegalCharsetNameException and UnsupportedCharsetException
        null
    }
}

/**
 * Detects the charset of an HTML body from its `meta` tags.
 *
 * Only the first 1024 bytes (or the whole body when it is shorter) are decoded as UTF-8
 * and scanned. The first tag that carries a resolvable `charset` attribute, or an
 * `http-equiv="content-type"` whose content has a resolvable `charset=` parameter, wins.
 *
 * @return the detected charset, or UTF-8 when nothing usable is found.
 */
private fun detectCharset(bodyBytes: ByteArray): Charset {
    // clamp to the body size: a fixed length of 1024 throws on shorter bodies
    val sniffLength = minOf(bodyBytes.size, 1024)
    val firstPart = String(bodyBytes, 0, sniffLength, Charsets.UTF_8)

    for (meta in MetaTagsParser.parse(firstPart)) {
        charsetOrNull(meta.attr(ATTRIBUTE_VALUE_CHARSET))?.let { return it }

        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
            RE_CONTENT_TYPE_CHARSET
                .find(meta.attr(CONTENT))
                ?.let { charsetOrNull(it.groupValues[1]) }
                ?.let { return it }
        }
    }
    // defaults to UTF-8
    return Charsets.UTF_8
}

/**
 * Collects title, description and image of a page from its `meta` tags.
 *
 * For every tag the `property`, `name` and `itemprop` attributes are checked, in that
 * order, against the known title/description/image keys; the first non-empty content
 * found for each field is kept. Iteration stops as soon as all three fields are filled,
 * so a lazy [metaTags] sequence is not consumed further than needed.
 *
 * @return the preview item for [url]; fields that were not found stay empty.
 */
private fun extractUrlInfo(
    url: String,
    metaTags: Sequence<MetaTag>,
    type: MediaType,
): UrlInfoItem {
    var title = ""
    var description = ""
    var image = ""

    val keyAttributes =
        listOf(
            ATTRIBUTE_VALUE_PROPERTY,
            ATTRIBUTE_VALUE_NAME,
            ATTRIBUTE_VALUE_ITEMPROP,
        )

    for (tag in metaTags) {
        for (keyAttribute in keyAttributes) {
            when (tag.attr(keyAttribute)) {
                in META_X_TITLE -> {
                    if (title.isEmpty()) {
                        title = tag.attr(CONTENT)
                    }
                }

                in META_X_DESCRIPTION -> {
                    if (description.isEmpty()) {
                        description = tag.attr(CONTENT)
                    }
                }

                in META_X_IMAGE -> {
                    if (image.isEmpty()) {
                        image = tag.attr(CONTENT)
                    }
                }
            }
        }

        if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
            break
        }
    }
    return UrlInfoItem(url, title, description, image, type)
}
included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package com.vitorpamplona.amethyst.service.previews + +import org.junit.Assert.assertEquals +import org.junit.Test + +class MetaTagsParserTest { + @Test + fun testParse() { + val input = + """ + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + """.trimMargin() + + val exp = + listOf( + listOf("charset" to "utf-8"), + listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"), + listOf("property" to "og:title", "content" to "title"), + listOf("property" to "og:description", "content" to "description"), + listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"), + listOf("name" to "newline", "content" to "newline"), + listOf("name" to "space before gt"), + listOf("name" to "space before ="), + listOf("name" to "space after ="), + listOf("name" to "CAPITAL"), + listOf("name" to "character reference", "content" to ""), + listOf("name" to "attr value with end of head doesn't harm", "content" to "bang!"), + ) + + val metaTags = MetaTagsParser.parse(input).toList() + println(metaTags) + assertEquals(exp.size, metaTags.size) + metaTags.zip(exp).forEach { (meta, expAttrs) -> + expAttrs.forEach { (name, expValue) -> + assertEquals(expValue, meta.attr(name)) + } + } + } +} diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 43a6e6388..9125241ef 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -20,7 +20,6 
@@ fragmentKtx = "1.6.2" gms = "4.4.1" jacksonModuleKotlin = "2.17.0" jna = "5.14.0" -jsoup = "1.17.2" junit = "4.13.2" kotlin = "1.9.22" kotlinxCollectionsImmutable = "0.3.7" @@ -93,7 +92,6 @@ google-mlkit-language-id = { group = "com.google.mlkit", name = "language-id", v google-mlkit-translate = { group = "com.google.mlkit", name = "translate", version.ref = "translate" } jackson-module-kotlin = { group = "com.fasterxml.jackson.module", name = "jackson-module-kotlin", version.ref = "jacksonModuleKotlin" } jna = { group = "net.java.dev.jna", name = "jna", version.ref = "jna" } -jsoup = { group = "org.jsoup", name = "jsoup", version.ref = "jsoup" } junit = { group = "junit", name = "junit", version.ref = "junit" } kotlinx-collections-immutable = { group = "org.jetbrains.kotlinx", name = "kotlinx-collections-immutable", version.ref = "kotlinxCollectionsImmutable" } lazysodium-android = { group = "com.goterl", name = "lazysodium-android", version.ref = "lazysodiumAndroid" }