diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt new file mode 100644 index 000000000..5245e92c0 --- /dev/null +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt @@ -0,0 +1,311 @@ +/** + * Copyright (c) 2024 Vitor Pamplona + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package com.vitorpamplona.amethyst.service.previews + +import kotlinx.collections.immutable.toImmutableMap +import java.lang.StringBuilder + +internal data class MetaTag(private val attrs: Map) { + fun attr(name: String): String = attrs[name.lowercase()] ?: "" +} + +// parse a partial HTML document and extract meta tags +internal object MetaTagsParser { + private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/') + private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') + + fun parse(input: String): Sequence = + sequence { + val s = TagScanner(input) + while (!s.exhausted()) { + val t = s.nextTag() ?: continue + if (t.name == "/head") { + break + } + if (t.name == "meta") { + val attrs = parseAttrs(t.attrPart) ?: continue + yield(MetaTag(attrs)) + } + } + } + + private data class RawTag(val name: String, val attrPart: String) + + private class TagScanner(private val input: String) { + var p = 0 + + fun exhausted(): Boolean = p >= input.length + + private fun peek(): Char = input[p] + + private fun consume(): Char { + return input[p++] + } + + private fun consumeChar(c: Char): Boolean { + if (this.peek() == c) { + this.consume() + return true + } + return false + } + + private fun skipSpaces() { + while (!this.exhausted() && this.peek().isWhitespace()) { + this.consume() + } + } + + private fun skipUntil(c: Char) { + while (!this.exhausted() && this.peek() != c) { + this.consume() + } + } + + private fun readWhile(pred: (Char) -> Boolean): String { + val sb = StringBuilder() + while (!this.exhausted() && pred(this.peek())) { + sb.append(this.consume()) + } + return sb.toString() + } + + fun nextTag(): RawTag? { + skipUntil('<') + consume() + + // read tag name + val name = StringBuilder() + if (consumeChar('/')) { + name.append('/') + } + val n = readWhile { !it.isWhitespace() && it != '>' } + skipSpaces() + + // read until end of tag + val attrsPart = StringBuilder() + var quote: Char? = null + while (!exhausted()) { + val c = consume() + when { + // `/>` out of quote -> end of tag + quote == null && c == '/' && peek() == '>' -> { + consume() + break + } + // `>` out of quote -> end of tag + quote == null && c == '>' -> { + break + } + // entering quote + quote == null && (c == '\'' || c == '"') -> { + quote = c + } + // leaving quote + quote != null && c == quote -> { + quote = null + } + } + attrsPart.append(c) + } + + if (!n.matches(Regex("""[0-9a-zA-Z]+"""))) { + return null + } + return RawTag(name.append(n).toString().lowercase(), attrsPart.toString()) + } + } + + // map of HTML element attribute name to its value, with additional logics: + // - attribute names are matched in a case-insensitive manner + // - attribute names never duplicate + // - commonly used character references in attribute values are resolved + private class Attrs { + companion object { + val RE_CHAR_REF = Regex("""&(\w+)(;?)""") + val BASE_CHAR_REFS = + mapOf( + "amp" to "&", + "AMP" to "&", + "quot" to "\"", + "QUOT" to "\"", + "lt" to "<", + "LT" to "<", + "gt" to ">", + "GT" to ">", + ) + val CHAR_REFS = + mapOf( + "apos" to "'", + "equals" to "=", + "grave" to "`", + "DiacriticalGrave" to "`", + ) + + fun replaceCharRefs(match: MatchResult): String { + val bcr = BASE_CHAR_REFS[match.groupValues[1]] + if (bcr != null) { + return bcr + } + // non-base char refs must be terminated by ';' + if (match.groupValues[2].isNotEmpty()) { + val cr = CHAR_REFS[match.groupValues[1]] + if (cr != null) { + return cr + } + } + return match.value + } + } + + private val attrs = mutableMapOf() + + fun add(attr: Pair) { + val name = attr.first.lowercase() + if (attrs.containsKey(name)) { + throw IllegalArgumentException("duplicated attribute name: $name") + } + val value = attr.second.replace(RE_CHAR_REF, Companion::replaceCharRefs) + attrs += Pair(name, value) + } + + fun freeze(): Map = attrs.toImmutableMap() + } + + private enum class State { + NAME, + BEFORE_EQ, + AFTER_EQ, + VALUE, + SPACE, + } + + private fun parseAttrs(input: String): Map? { + val attrs = Attrs() + + var state = State.NAME + var nameBegin = 0 + var nameEnd = 0 + var valueBegin = 0 + var valueQuote: Char? = null + + input.forEachIndexed { i, c -> + when (state) { + State.NAME -> { + when { + c == '=' -> { + nameEnd = i + state = State.AFTER_EQ + } + + c.isWhitespace() -> { + nameEnd = i + state = State.BEFORE_EQ + } + + NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> { + return null + } + } + } + + State.BEFORE_EQ -> { + when { + c == '=' -> { + state = State.AFTER_EQ + } + + c.isWhitespace() -> {} + else -> return null + } + } + + State.AFTER_EQ -> { + when { + c.isWhitespace() -> {} + c == '\'' || c == '"' -> { + valueBegin = i + 1 + valueQuote = c + state = State.VALUE + } + + else -> { + valueBegin = i + valueQuote = null + state = State.VALUE + } + } + } + + State.VALUE -> { + var attr: Pair? = null + when { + valueQuote != null -> { + if (c == valueQuote) { + attr = + Pair( + input.slice(nameBegin.. { + when { + c.isWhitespace() -> { + attr = + Pair( + input.slice(nameBegin.. { + attr = + Pair( + input.slice(nameBegin.. { + return null + } + } + } + } + if (attr != null) { + runCatching { attrs.add(attr) }.getOrNull() ?: return null + state = State.SPACE + } + } + + State.SPACE -> { + if (!c.isWhitespace()) { + nameBegin = i + state = State.NAME + } + } + } + } + return attrs.freeze() + } +} diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index 6fee16f92..71553b77b 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -22,13 +22,11 @@ package com.vitorpamplona.amethyst.service.previews import com.vitorpamplona.amethyst.service.HttpClientManager import com.vitorpamplona.amethyst.service.checkNotInMainThread -import kotlinx.collections.immutable.toImmutableMap import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import okhttp3.MediaType import okhttp3.MediaType.Companion.toMediaType import okhttp3.Request -import okhttp3.ResponseBody import okio.BufferedSource import okio.ByteString.Companion.decodeHex import okio.Options @@ -81,7 +79,7 @@ suspend fun getDocument( "Website returned unknown mimetype: ${it.headers["Content-Type"]}", ) if (mimeType.type == "text" && mimeType.subtype == "html") { - parseHtml(url, it.body, mimeType) + parseHtml(url, it.body.source(), mimeType) } else if (mimeType.type == "image") { UrlInfoItem(url, image = url, mimeType = mimeType) } else if (mimeType.type == "video") { @@ -99,24 +97,22 @@ suspend fun getDocument( suspend fun parseHtml( url: String, - body: ResponseBody, + source: BufferedSource, type: MediaType, ): UrlInfoItem = withContext(Dispatchers.IO) { - val source = body.source() - // sniff charset from Content-Type header or BOM val sniffedCharset = type.charset() ?: source.readBomAsCharset() if (sniffedCharset != null) { - val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents()) - return@withContext parseUrlInfo(url, metaTags, type) + val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset)) + return@withContext extractUrlInfo(url, metaTags, type) } // if sniffing was failed, detect charset from content val bodyBytes = source.readByteArray() val charset = detectCharset(bodyBytes) - val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents()) - return@withContext parseUrlInfo(url, metaTags, type) + val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset)) + return@withContext extractUrlInfo(url, metaTags, type) } // taken from okhttp @@ -151,7 +147,7 @@ private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""") private fun detectCharset(bodyBytes: ByteArray): Charset { // try to detect charset from meta tags parsed from first 1024 bytes of body val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8")) - val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence()) + val metaTags = MetaTagsParser.parse(firstPart) metaTags.forEach { meta -> val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET) if (charsetAttr.isNotEmpty()) { @@ -172,7 +168,7 @@ private fun detectCharset(bodyBytes: ByteArray): Charset { return Charset.forName("utf-8") } -private fun parseUrlInfo( +private fun extractUrlInfo( url: String, metaTags: Sequence, type: MediaType, @@ -239,200 +235,3 @@ private fun parseUrlInfo( } return UrlInfoItem(url, title, description, image, type) } - -// HTML parsing stuff -private val RE_HEAD = Regex("""(.*?)""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) - -private fun String.headTagContents(): String = RE_HEAD.find(this)?.groupValues?.get(1) ?: "" - -private class MetaTag(private val attrs: Map) { - fun attr(name: String): String = attrs[name.lowercase()] ?: "" -} - -// map of HTML element attribute name to its value, with additional logics: -// - attribute names are matched in a case-insensitive manner -// - attribute names never duplicate -// - commonly used character references in attribute values are resolved -private class Attrs { - companion object { - val RE_CHAR_REF = Regex("""&(\w+)(;?)""") - val BASE_CHAR_REFS = - mapOf( - "amp" to "&", - "AMP" to "&", - "quot" to "\"", - "QUOT" to "\"", - "lt" to "<", - "LT" to "<", - "gt" to ">", - "GT" to ">", - ) - val CHAR_REFS = - mapOf( - "apos" to "'", - "equals" to "=", - "grave" to "`", - "DiacriticalGrave" to "`", - ) - - fun replaceCharRefs(match: MatchResult): String { - val bcr = BASE_CHAR_REFS[match.groupValues[1]] - if (bcr != null) { - return bcr - } - // non-base char refs must be terminated by ';' - if (match.groupValues[2].isNotEmpty()) { - val cr = CHAR_REFS[match.groupValues[1]] - if (cr != null) { - return cr - } - } - return match.value - } - } - - private val attrs = mutableMapOf() - - fun add(attr: Pair) { - val name = attr.first.lowercase() - if (attrs.containsKey(name)) { - throw IllegalArgumentException("duplicated attribute name: $name") - } - val value = attr.second.replace(RE_CHAR_REF, Attrs::replaceCharRefs) - attrs += Pair(name, value) - } - - fun freeze(): Map = attrs.toImmutableMap() -} - -// parser for parsing a partial HTML document into meta tags -private object MetaTagsParser { - private val RE_META = Regex("""""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) - - private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/') - private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') - - fun parse(input: String): Sequence = - RE_META.findAll(input).mapNotNull { - runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull() - } - - private enum class State { - NAME, - BEFORE_EQ, - AFTER_EQ, - VALUE, - SPACE, - } - - private fun parseAttrs(input: String): Map { - val attrs = Attrs() - - var state = State.NAME - var nameBegin = 0 - var nameEnd = 0 - var valueBegin = 0 - var valueQuote: Char? = null - - input.forEachIndexed { i, c -> - when (state) { - State.NAME -> { - when { - c == '=' -> { - nameEnd = i - state = State.AFTER_EQ - } - - c.isWhitespace() -> { - nameEnd = i - state = State.BEFORE_EQ - } - - NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> { - throw IllegalArgumentException("meta has invalid attributes part") - } - } - } - - State.BEFORE_EQ -> { - when { - c == '=' -> { - state = State.AFTER_EQ - } - - c.isWhitespace() -> {} - else -> throw IllegalArgumentException("meta has invalid attributes part") - } - } - - State.AFTER_EQ -> { - when { - c.isWhitespace() -> {} - c == '\'' || c == '"' -> { - valueBegin = i + 1 - valueQuote = c - state = State.VALUE - } - - else -> { - valueBegin = i - valueQuote = null - state = State.VALUE - } - } - } - - State.VALUE -> { - var attr: Pair? = null - when { - valueQuote != null -> { - if (c == valueQuote) { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin until i), - ) - } - } - - valueQuote == null -> { - when { - c.isWhitespace() -> { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin until i), - ) - } - - i == input.length - 1 -> { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin..i), - ) - } - - NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> { - throw IllegalArgumentException("meta has invalid attributes part") - } - } - } - } - if (attr != null) { - attrs.add(attr) - state = State.SPACE - } - } - - State.SPACE -> { - if (!c.isWhitespace()) { - nameBegin = i - state = State.NAME - } - } - } - } - return attrs.freeze() - } -} diff --git a/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt b/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt new file mode 100644 index 000000000..ef5cc8311 --- /dev/null +++ b/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt @@ -0,0 +1,81 @@ +/** + * Copyright (c) 2024 Vitor Pamplona + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package com.vitorpamplona.amethyst.service.previews + +import org.junit.Assert.assertEquals +import org.junit.Test + +class MetaTagsParserTest { + @Test + fun testParse() { + val input = + """ + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + """.trimMargin() + + val exp = + listOf( + listOf("charset" to "utf-8"), + listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"), + listOf("property" to "og:title", "content" to "title"), + listOf("property" to "og:description", "content" to "description"), + listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"), + listOf("name" to "newline", "content" to "newline"), + listOf("name" to "space before gt"), + listOf("name" to "space before ="), + listOf("name" to "space after ="), + listOf("name" to "CAPITAL"), + listOf("name" to "character reference", "content" to ""), + listOf("name" to "attr value with end of head doesn't harm", "content" to "bang!"), + ) + + val metaTags = MetaTagsParser.parse(input).toList() + println(metaTags) + assertEquals(exp.size, metaTags.size) + metaTags.zip(exp).forEach { (meta, expAttrs) -> + expAttrs.forEach { (name, expValue) -> + assertEquals(expValue, meta.attr(name)) + } + } + } +}