support tags in quoted attribute value

2024-03-27 00:47:35 +09:00 · 2024-03-27 00:47:35 +09:00 · a71ce69cab
commit a71ce69cab
--- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt
+++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt
@ -0,0 +1,311 @@
 /**
 * Copyright (c) 2024 Vitor Pamplona
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
 * Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 package com.vitorpamplona.amethyst.service.previews
 import kotlinx.collections.immutable.toImmutableMap
 import java.lang.StringBuilder
 /**
  * Attributes of a single `<meta>` element.
  *
  * @property attrs attribute values keyed by attribute name; [attr] looks names up in lowercase.
  */
 internal data class MetaTag(private val attrs: Map<String, String>) {
    /** Returns the value stored under the lowercased [name], or an empty string when there is none. */
    fun attr(name: String): String {
        val key = name.lowercase()
        return attrs[key].orEmpty()
    }
 }
 // parse a partial HTML document and extract meta tags
 /**
  * Extracts `<meta>` tags from a complete or partial HTML document.
  *
  * Tags are scanned in document order until a `</head>` tag is seen or the input runs out.
  * Running out of input at any point (no further `<`, a dangling `<` or `/`, an unterminated
  * tag or comment) ends the scan instead of throwing, so truncated documents can be parsed.
  */
 internal object MetaTagsParser {
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
    // compiled once instead of on every scanned tag; anything else (doctype, `meta/`, ...) is not a tag name
    private val RE_TAG_NAME = Regex("""[0-9a-zA-Z]+""")
    /**
     * Lazily yields one [MetaTag] per `<meta>` tag whose attribute part could be parsed.
     *
     * `<meta>` tags with an invalid attribute part or a duplicated attribute name are skipped.
     * The sequence ends at the first `</head>` tag or at the end of [input].
     */
    fun parse(input: String): Sequence<MetaTag> =
        sequence {
            val scanner = TagScanner(input)
            while (!scanner.exhausted()) {
                val tag = scanner.nextTag() ?: continue
                if (tag.name == "/head") {
                    break
                }
                if (tag.name == "meta") {
                    val attrs = parseAttrs(tag.attrPart) ?: continue
                    yield(MetaTag(attrs))
                }
            }
        }
    // a tag as found in the input: lowercased name (prefixed with '/' for closing tags) and the raw attribute text
    private data class RawTag(val name: String, val attrPart: String)
    // cursor over the input that splits it into raw tags; never reads past the end of the input
    private class TagScanner(private val input: String) {
        private var p = 0
        fun exhausted(): Boolean = p >= input.length
        // current character, or null once the input is exhausted
        private fun peekOrNull(): Char? = input.getOrNull(p)
        // callers must check exhausted() first
        private fun peek(): Char = input[p]
        private fun consume(): Char = input[p++]
        // consumes the current character only if it equals c; false at end of input
        private fun consumeChar(c: Char): Boolean {
            if (peekOrNull() == c) {
                p++
                return true
            }
            return false
        }
        private fun skipSpaces() {
            while (!exhausted() && peek().isWhitespace()) {
                p++
            }
        }
        private fun skipUntil(c: Char) {
            while (!exhausted() && peek() != c) {
                p++
            }
        }
        private fun readWhile(pred: (Char) -> Boolean): String {
            val sb = StringBuilder()
            while (!exhausted() && pred(peek())) {
                sb.append(consume())
            }
            return sb.toString()
        }
        /**
         * Advances past the next tag and returns it, or returns null when the consumed text
         * was not a tag with an alphanumeric name (comment, doctype, stray `<`, end of input).
         */
        fun nextTag(): RawTag? {
            skipUntil('<')
            if (exhausted()) {
                // only text was left; there is no '<' to consume
                return null
            }
            if (input.startsWith("<!--", p)) {
                // comments are skipped as a whole so that quotes inside them cannot open a "quoted value";
                // searching from p + 2 also accepts the abruptly closed forms `<!-->` and `<!--->`
                val end = input.indexOf("-->", p + 2)
                p = if (end < 0) input.length else end + 3
                return null
            }
            consume()
            val closing = consumeChar('/')
            val n = readWhile { !it.isWhitespace() && it != '>' }
            skipSpaces()
            // read until the end of the tag; '>' and '/>' inside a quoted value do not end it
            val attrsPart = StringBuilder()
            var quote: Char? = null
            while (!exhausted()) {
                val c = consume()
                when {
                    // `/>` out of quote -> end of tag
                    quote == null && c == '/' && peekOrNull() == '>' -> {
                        consume()
                        break
                    }
                    // `>` out of quote -> end of tag
                    quote == null && c == '>' -> {
                        break
                    }
                    // entering quote
                    quote == null && (c == '\'' || c == '"') -> {
                        quote = c
                    }
                    // leaving quote
                    quote != null && c == quote -> {
                        quote = null
                    }
                }
                attrsPart.append(c)
            }
            if (!n.matches(RE_TAG_NAME)) {
                return null
            }
            val name = if (closing) "/$n" else n
            return RawTag(name.lowercase(), attrsPart.toString())
        }
    }
    /**
     * Map of HTML element attribute names to their values, with additional logic:
     * - attribute names are stored in lowercase
     * - attribute names never duplicate
     * - commonly used character references in attribute values are resolved
     */
    private class Attrs {
        companion object {
            // "&name" optionally followed by ';' (group 1: name, group 2: terminator)
            val RE_CHAR_REF = Regex("""&(\w+)(;?)""")
            // resolved with or without the terminating ';'
            val BASE_CHAR_REFS =
                mapOf(
                    "amp" to "&",
                    "AMP" to "&",
                    "quot" to "\"",
                    "QUOT" to "\"",
                    "lt" to "<",
                    "LT" to "<",
                    "gt" to ">",
                    "GT" to ">",
                )
            // resolved only when terminated by ';'
            val CHAR_REFS =
                mapOf(
                    "apos" to "'",
                    "equals" to "=",
                    "grave" to "`",
                    "DiacriticalGrave" to "`",
                )
            // replacement for one RE_CHAR_REF match; unknown references are left untouched
            fun replaceCharRefs(match: MatchResult): String {
                val bcr = BASE_CHAR_REFS[match.groupValues[1]]
                if (bcr != null) {
                    return bcr
                }
                // non-base char refs must be terminated by ';'
                if (match.groupValues[2].isNotEmpty()) {
                    val cr = CHAR_REFS[match.groupValues[1]]
                    if (cr != null) {
                        return cr
                    }
                }
                return match.value
            }
        }
        private val attrs = mutableMapOf<String, String>()
        // stores the attribute and returns true, or returns false (storing nothing) when the name is already present
        fun add(attr: Pair<String, String>): Boolean {
            val name = attr.first.lowercase()
            if (name in attrs) {
                return false
            }
            attrs[name] = attr.second.replace(RE_CHAR_REF, Companion::replaceCharRefs)
            return true
        }
        fun freeze(): Map<String, String> = attrs.toImmutableMap()
    }
    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }
    /**
     * Parses the attribute part of a tag (everything between the tag name and the closing `>`).
     *
     * Returns null when the text is not a valid attribute list or an attribute name occurs twice.
     * A trailing name without a value and a quoted value that is never closed are not recorded.
     */
    private fun parseAttrs(input: String): Map<String, String>? {
        val attrs = Attrs()
        var state = State.NAME
        var nameBegin = 0
        var nameEnd = 0
        var valueBegin = 0
        var valueQuote: Char? = null
        // records the current attribute with its value ending (exclusively) at valueEnd; false on a duplicated name
        fun addAttr(valueEnd: Int): Boolean =
            attrs.add(Pair(input.substring(nameBegin, nameEnd), input.substring(valueBegin, valueEnd)))
        input.forEachIndexed { i, c ->
            when (state) {
                State.NAME -> {
                    when {
                        c == '=' -> {
                            nameEnd = i
                            state = State.AFTER_EQ
                        }
                        c.isWhitespace() -> {
                            nameEnd = i
                            state = State.BEFORE_EQ
                        }
                        NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> {
                            return null
                        }
                    }
                }
                State.BEFORE_EQ -> {
                    when {
                        c == '=' -> {
                            state = State.AFTER_EQ
                        }
                        c.isWhitespace() -> {}
                        else -> return null
                    }
                }
                State.AFTER_EQ -> {
                    when {
                        c.isWhitespace() -> {}
                        c == '\'' || c == '"' -> {
                            valueBegin = i + 1
                            valueQuote = c
                            state = State.VALUE
                        }
                        else -> {
                            valueBegin = i
                            valueQuote = null
                            state = State.VALUE
                        }
                    }
                }
                State.VALUE -> {
                    // exclusive end index of the value if it ends at this character, else null
                    val valueEnd: Int? =
                        when {
                            valueQuote != null -> if (c == valueQuote) i else null
                            c.isWhitespace() -> i
                            // an unquoted value running up to the end of input includes the last character
                            i == input.length - 1 -> i + 1
                            NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> return null
                            else -> null
                        }
                    if (valueEnd != null) {
                        if (!addAttr(valueEnd)) {
                            return null
                        }
                        state = State.SPACE
                    }
                }
                State.SPACE -> {
                    if (!c.isWhitespace()) {
                        nameBegin = i
                        state = State.NAME
                    }
                }
            }
        }
        // an unquoted value that starts on the very last character (e.g. `name=a`) never reaches the
        // VALUE branch above, because that branch only sees the characters after the first one
        if (state == State.VALUE && valueQuote == null) {
            if (!addAttr(input.length)) {
                return null
            }
        }
        return attrs.freeze()
    }
 }
--- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
+++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt
@ -22,13 +22,11 @@ package com.vitorpamplona.amethyst.service.previews
 import com.vitorpamplona.amethyst.service.HttpClientManager
 import com.vitorpamplona.amethyst.service.checkNotInMainThread
 import kotlinx.collections.immutable.toImmutableMap
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.withContext
 import okhttp3.MediaType
 import okhttp3.MediaType.Companion.toMediaType
 import okhttp3.Request
 import okhttp3.ResponseBody
 import okio.BufferedSource
 import okio.ByteString.Companion.decodeHex
 import okio.Options
@ -81,7 +79,7 @@ suspend fun getDocument(
                            "Website returned unknown mimetype: ${it.headers["Content-Type"]}",
                        )
                if (mimeType.type == "text" && mimeType.subtype == "html") {
-                    parseHtml(url, it.body, mimeType)
+                    parseHtml(url, it.body.source(), mimeType)
                } else if (mimeType.type == "image") {
                    UrlInfoItem(url, image = url, mimeType = mimeType)
                } else if (mimeType.type == "video") {
@ -99,24 +97,22 @@ suspend fun getDocument(
 suspend fun parseHtml(
    url: String,
-    body: ResponseBody,
+    source: BufferedSource,
    type: MediaType,
 ): UrlInfoItem =
    withContext(Dispatchers.IO) {
        val source = body.source()
        // sniff charset from Content-Type header or BOM
        val sniffedCharset = type.charset() ?: source.readBomAsCharset()
        if (sniffedCharset != null) {
-            val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents())
+            val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset))
-            return@withContext parseUrlInfo(url, metaTags, type)
+            return@withContext extractUrlInfo(url, metaTags, type)
        }
        // if sniffing failed, detect the charset from the content
        val bodyBytes = source.readByteArray()
        val charset = detectCharset(bodyBytes)
-        val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents())
+        val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset))
-        return@withContext parseUrlInfo(url, metaTags, type)
+        return@withContext extractUrlInfo(url, metaTags, type)
    }
 // taken from okhttp
@ -151,7 +147,7 @@ private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""")
 private fun detectCharset(bodyBytes: ByteArray): Charset {
    // try to detect charset from meta tags parsed from first 1024 bytes of body
    val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8"))
-    val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence())
+    val metaTags = MetaTagsParser.parse(firstPart)
    metaTags.forEach { meta ->
        val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
        if (charsetAttr.isNotEmpty()) {
@ -172,7 +168,7 @@ private fun detectCharset(bodyBytes: ByteArray): Charset {
    return Charset.forName("utf-8")
 }
-private fun parseUrlInfo(
+private fun extractUrlInfo(
    url: String,
    metaTags: Sequence<MetaTag>,
    type: MediaType,
@ -239,200 +235,3 @@ private fun parseUrlInfo(
    }
    return UrlInfoItem(url, title, description, image, type)
 }
 // HTML parsing stuff
 // first <head>...</head> element; case-insensitive, and '.' also matches line breaks
 private val RE_HEAD = Regex("""<head\s*>(.*?)</head\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
 /** Returns the text between the first `<head>` and the next `</head>`, or an empty string when there is no such pair. */
 private fun String.headTagContents(): String {
    val headMatch = RE_HEAD.find(this) ?: return ""
    return headMatch.groupValues[1]
 }
 // attributes of one <meta> element; lookups lowercase the requested name
 private class MetaTag(private val attrs: Map<String, String>) {
    // value stored under the lowercased name, or "" when there is none
    fun attr(name: String): String = attrs.getOrElse(name.lowercase()) { "" }
 }
 // map of HTML element attribute names to their values, with additional logic:
 // - attribute names are matched in a case-insensitive manner
 // - attribute names never duplicate
 // - commonly used character references in attribute values are resolved
 private class Attrs {
    companion object {
        // "&name" optionally followed by ';' (group 1: name, group 2: terminator)
        val RE_CHAR_REF = Regex("""&(\w+)(;?)""")
        // references resolved with or without the terminating ';'
        val BASE_CHAR_REFS =
            mapOf(
                "amp" to "&",
                "AMP" to "&",
                "quot" to "\"",
                "QUOT" to "\"",
                "lt" to "<",
                "LT" to "<",
                "gt" to ">",
                "GT" to ">",
            )
        // references resolved only when terminated by ';'
        val CHAR_REFS =
            mapOf(
                "apos" to "'",
                "equals" to "=",
                "grave" to "`",
                "DiacriticalGrave" to "`",
            )
        // replacement for one RE_CHAR_REF match; anything that is not a known reference is returned unchanged
        fun replaceCharRefs(match: MatchResult): String {
            val (refName, terminator) = match.destructured
            val base = BASE_CHAR_REFS[refName]
            return when {
                base != null -> base
                terminator.isEmpty() -> match.value
                else -> CHAR_REFS[refName] ?: match.value
            }
        }
    }
    private val attrs = mutableMapOf<String, String>()
    // stores the attribute under its lowercased name with character references resolved;
    // throws IllegalArgumentException when that name is already present
    fun add(attr: Pair<String, String>) {
        val (rawName, rawValue) = attr
        val name = rawName.lowercase()
        require(!attrs.containsKey(name)) { "duplicated attribute name: $name" }
        attrs[name] = rawValue.replace(RE_CHAR_REF, Attrs::replaceCharRefs)
    }
    fun freeze(): Map<String, String> = attrs.toImmutableMap()
 }
 // parser for parsing a partial HTML document into meta tags
 /**
  * Regex-based extractor of `<meta>` tags from a partial HTML document.
  *
  * Tags whose attribute part cannot be parsed (invalid characters, duplicated attribute names)
  * are dropped from the result.
  */
 private object MetaTagsParser {
    // the attribute part is matched lazily, so the tag ends at the first '>' even when that '>' sits inside a quoted value
    private val RE_META = Regex("""<meta\s+(.+?)\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
    // characters that invalidate the tag when seen while reading an attribute name
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    // characters that invalidate the tag when seen inside an unquoted attribute value
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
    // yields one MetaTag per RE_META match; a match whose attributes fail to parse is skipped (the exception is swallowed)
    fun parse(input: String): Sequence<MetaTag> =
        RE_META.findAll(input).mapNotNull {
            runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull()
        }
    // states of the attribute scanner in parseAttrs
    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }
    // Parses the attribute part of a meta tag into a name -> value map.
    // Throws IllegalArgumentException on invalid characters, on a name followed by anything
    // other than whitespace or '=', and (via Attrs.add) on a duplicated attribute name.
    // NOTE(review): an unquoted value that starts on the last character of input (e.g. `name=a`)
    // is never recorded, because the VALUE branch only runs on the characters after the first one.
    private fun parseAttrs(input: String): Map<String, String> {
        val attrs = Attrs()
        var state = State.NAME
        var nameBegin = 0
        var nameEnd = 0
        var valueBegin = 0
        var valueQuote: Char? = null
        input.forEachIndexed { i, c ->
            when (state) {
                // reading an attribute name; it ends at '=' or at whitespace
                State.NAME -> {
                    when {
                        c == '=' -> {
                            nameEnd = i
                            state = State.AFTER_EQ
                        }
                        c.isWhitespace() -> {
                            nameEnd = i
                            state = State.BEFORE_EQ
                        }
                        NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> {
                            throw IllegalArgumentException("meta has invalid attributes part")
                        }
                    }
                }
                // whitespace after the name; only '=' may follow
                State.BEFORE_EQ -> {
                    when {
                        c == '=' -> {
                            state = State.AFTER_EQ
                        }
                        c.isWhitespace() -> {}
                        else -> throw IllegalArgumentException("meta has invalid attributes part")
                    }
                }
                // after '='; the first non-whitespace character opens a quoted or an unquoted value
                State.AFTER_EQ -> {
                    when {
                        c.isWhitespace() -> {}
                        c == '\'' || c == '"' -> {
                            valueBegin = i + 1
                            valueQuote = c
                            state = State.VALUE
                        }
                        else -> {
                            valueBegin = i
                            valueQuote = null
                            state = State.VALUE
                        }
                    }
                }
                // inside a value; attr is set once the end of the value has been found
                State.VALUE -> {
                    var attr: Pair<String, String>? = null
                    when {
                        // quoted value: ends at the matching quote, which is excluded
                        valueQuote != null -> {
                            if (c == valueQuote) {
                                attr =
                                    Pair(
                                        input.slice(nameBegin until nameEnd),
                                        input.slice(valueBegin until i),
                                    )
                            }
                        }
                        // unquoted value: ends at whitespace (excluded) or at the last character of input (included)
                        valueQuote == null -> {
                            when {
                                c.isWhitespace() -> {
                                    attr =
                                        Pair(
                                            input.slice(nameBegin until nameEnd),
                                            input.slice(valueBegin until i),
                                        )
                                }
                                i == input.length - 1 -> {
                                    attr =
                                        Pair(
                                            input.slice(nameBegin until nameEnd),
                                            input.slice(valueBegin..i),
                                        )
                                }
                                NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> {
                                    throw IllegalArgumentException("meta has invalid attributes part")
                                }
                            }
                        }
                    }
                    if (attr != null) {
                        attrs.add(attr)
                        state = State.SPACE
                    }
                }
                // between attributes; the first non-whitespace character starts the next name
                State.SPACE -> {
                    if (!c.isWhitespace()) {
                        nameBegin = i
                        state = State.NAME
                    }
                }
            }
        }
        return attrs.freeze()
    }
 }
--- a/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt
+++ b/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt
@ -0,0 +1,81 @@
 /**
 * Copyright (c) 2024 Vitor Pamplona
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
 * Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 package com.vitorpamplona.amethyst.service.previews
 import org.junit.Assert.assertEquals
 import org.junit.Test
 /** Checks [MetaTagsParser.parse] against a small HTML document covering common and edge-case `<meta>` tags. */
 class MetaTagsParserTest {
    @Test
    fun testParse() {
        val html =
            """<html>
            |  <head>
            |    <meta charset="utf-8">
            |    <meta http-equiv="content-type" content="text/html; charset=utf-8">
            |    <meta property="og:title" content=title>
            |    <meta property="og:description" content='description'>
            |    <meta property="og:image" content="https://example.com/img/foo.png">
            |    <!-- edge cases -->
            |    <meta
            |       name="newline"
            |       content="newline"
            |    >
            |    <meta name="space before gt"    >
            |    <meta name     ="space before =">
            |    <meta name=    "space after =">
            |    <META NAME="CAPITAL">
            |    <meta name="character reference" content="&lt;meta&gt;">
            |    <meta name="attr value with end of head doesn't harm" content="<head>bang!</head>">
            |    <meta name="ignore tags with duplicated attr" name="dup">
            |  </head>
            |  <body>
            |    <meta name="ignore meta tags in body">
            |  </body>
            |</html>
            """.trimMargin()
        // one entry per tag the parser is expected to yield, in document order;
        // the duplicated-attribute tag and the tag inside <body> must not appear
        val expectedTags =
            listOf(
                listOf("charset" to "utf-8"),
                listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"),
                listOf("property" to "og:title", "content" to "title"),
                listOf("property" to "og:description", "content" to "description"),
                listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"),
                listOf("name" to "newline", "content" to "newline"),
                listOf("name" to "space before gt"),
                listOf("name" to "space before ="),
                listOf("name" to "space after ="),
                listOf("name" to "CAPITAL"),
                listOf("name" to "character reference", "content" to "<meta>"),
                listOf("name" to "attr value with end of head doesn't harm", "content" to "<head>bang!</head>"),
            )
        val parsed = MetaTagsParser.parse(html).toList()
        println(parsed)
        assertEquals(expectedTags.size, parsed.size)
        // sizes are equal at this point, so every expected entry has a parsed counterpart
        for ((index, expectedAttrs) in expectedTags.withIndex()) {
            val meta = parsed[index]
            for ((name, expectedValue) in expectedAttrs) {
                assertEquals(expectedValue, meta.attr(name))
            }
        }
    }
 }