support tags in quoted attribute value

pull/818/head
jiftechnify 2024-03-27 00:47:35 +09:00
rodzic bffb9f3778
commit a71ce69cab
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 009040DA8C4F544C
3 zmienionych plików z 400 dodań i 209 usunięć

Wyświetl plik

@ -0,0 +1,311 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.service.previews
import kotlinx.collections.immutable.toImmutableMap
import java.lang.StringBuilder
/**
 * Attribute set of a single `<meta>` element.
 *
 * The requested name is lowercased before the lookup, so the backing map is
 * expected to hold lowercase keys.
 */
internal data class MetaTag(private val attrs: Map<String, String>) {
    /** Returns the value stored under the lowercased [name], or an empty string when there is none. */
    fun attr(name: String): String = attrs.getOrElse(name.lowercase()) { "" }
}
/**
 * Extracts `<meta>` tags from a (possibly partial or truncated) HTML document.
 *
 * The input is scanned tag by tag. Scanning stops at the first `</head>` tag or at the
 * end of the input, whichever comes first. A `meta` tag whose attribute part cannot be
 * parsed (invalid characters, duplicated attribute names) is skipped.
 */
internal object MetaTagsParser {
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')

    // Hoisted so the pattern is compiled once instead of once per scanned tag.
    private val RE_TAG_NAME = Regex("""[0-9a-zA-Z]+""")

    /**
     * Lazily yields one [MetaTag] per well-formed `meta` tag found before `</head>`.
     *
     * @param input HTML text; it may end anywhere, including in the middle of a tag.
     * @return a sequence that scans [input] on demand.
     */
    fun parse(input: String): Sequence<MetaTag> =
        sequence {
            val s = TagScanner(input)
            while (!s.exhausted()) {
                // null means "nothing usable here" (comment, malformed or truncated tag, end of input)
                val t = s.nextTag() ?: continue
                if (t.name == "/head") {
                    break
                }
                if (t.name == "meta") {
                    val attrs = parseAttrs(t.attrPart) ?: continue
                    yield(MetaTag(attrs))
                }
            }
        }

    /** A tag as found in the input: lowercased name (with a leading `/` for end tags) and the raw attribute text. */
    private data class RawTag(val name: String, val attrPart: String)

    /** Cursor over [input] that splits it into raw tags. All reads are bounds-checked. */
    private class TagScanner(private val input: String) {
        private var p = 0

        fun exhausted(): Boolean = p >= input.length

        /** Current character, or null once the input is exhausted. */
        private fun peekOrNull(): Char? = input.getOrNull(p)

        /** Returns the current character and advances; callers must check [exhausted] first. */
        private fun consume(): Char = input[p++]

        /** Advances past [c] if it is the current character. */
        private fun consumeChar(c: Char): Boolean {
            if (peekOrNull() != c) return false
            p++
            return true
        }

        private fun skipSpaces() {
            while (peekOrNull()?.isWhitespace() == true) {
                p++
            }
        }

        /** Moves the cursor onto the next [c], or to the end of the input when there is none. */
        private fun skipUntil(c: Char) {
            val found = input.indexOf(c, p)
            p = if (found < 0) input.length else found
        }

        private fun readWhile(pred: (Char) -> Boolean): String {
            val begin = p
            while (!exhausted() && pred(input[p])) {
                p++
            }
            return input.substring(begin, p)
        }

        /**
         * Skips an HTML comment; the cursor must be on the `!` of `<!--`.
         * The search starts right after the `!` so that the degenerate `<!-->` is also closed.
         * An unterminated comment consumes the rest of the input.
         */
        private fun skipComment() {
            val end = input.indexOf("-->", p + 1)
            p = if (end < 0) input.length else end + 3
        }

        /**
         * Reads the next tag from the input.
         *
         * @return the tag, or null when the input ran out, the construct was a comment,
         * the tag name is not purely alphanumeric, or the tag was cut off before its closing `>`.
         */
        fun nextTag(): RawTag? {
            skipUntil('<')
            if (exhausted()) {
                // only text was left; nothing more to scan
                return null
            }
            consume()

            // Comments are skipped as a unit so that quotes inside them cannot
            // put the attribute scanner below into "inside quote" mode.
            if (input.startsWith("!--", p)) {
                skipComment()
                return null
            }

            // read tag name
            val prefix = if (consumeChar('/')) "/" else ""
            val n = readWhile { !it.isWhitespace() && it != '>' }
            skipSpaces()

            // read until end of tag
            val attrsPart = StringBuilder()
            var quote: Char? = null
            var closed = false
            while (!exhausted()) {
                val c = consume()
                when {
                    // `/>` out of quote -> end of tag
                    quote == null && c == '/' && peekOrNull() == '>' -> {
                        consume()
                        closed = true
                        break
                    }
                    // `>` out of quote -> end of tag
                    quote == null && c == '>' -> {
                        closed = true
                        break
                    }
                    // entering quote
                    quote == null && (c == '\'' || c == '"') -> {
                        quote = c
                    }
                    // leaving quote
                    quote != null && c == quote -> {
                        quote = null
                    }
                }
                attrsPart.append(c)
            }

            // A tag truncated by the end of the input would carry partial attribute values.
            if (!closed || !n.matches(RE_TAG_NAME)) {
                return null
            }
            return RawTag((prefix + n).lowercase(), attrsPart.toString())
        }
    }

    // map of HTML element attribute name to its value, with additional logics:
    // - attribute names are matched in a case-insensitive manner
    // - attribute names never duplicate
    // - commonly used character references in attribute values are resolved
    private class Attrs {
        private val attrs = mutableMapOf<String, String>()

        /**
         * Stores [attr] (name to value) under its lowercased name after resolving character references.
         *
         * @return false, leaving the map untouched, when the name is already present.
         */
        fun add(attr: Pair<String, String>): Boolean {
            val name = attr.first.lowercase()
            if (name in attrs) {
                return false
            }
            attrs[name] = attr.second.replace(RE_CHAR_REF, Companion::replaceCharRefs)
            return true
        }

        fun freeze(): Map<String, String> = attrs.toImmutableMap()

        companion object {
            val RE_CHAR_REF = Regex("""&(\w+)(;?)""")

            // references that are resolved with or without the terminating ';'
            val BASE_CHAR_REFS =
                mapOf(
                    "amp" to "&",
                    "AMP" to "&",
                    "quot" to "\"",
                    "QUOT" to "\"",
                    "lt" to "<",
                    "LT" to "<",
                    "gt" to ">",
                    "GT" to ">",
                )

            // references that are resolved only when terminated by ';'
            val CHAR_REFS =
                mapOf(
                    "apos" to "'",
                    "equals" to "=",
                    "grave" to "`",
                    "DiacriticalGrave" to "`",
                )

            /** Replacement for one [RE_CHAR_REF] match; unknown references are returned unchanged. */
            fun replaceCharRefs(match: MatchResult): String {
                val bcr = BASE_CHAR_REFS[match.groupValues[1]]
                if (bcr != null) {
                    return bcr
                }
                // non-base char refs must be terminated by ';'
                if (match.groupValues[2].isNotEmpty()) {
                    val cr = CHAR_REFS[match.groupValues[1]]
                    if (cr != null) {
                        return cr
                    }
                }
                return match.value
            }
        }
    }

    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }

    /**
     * Parses the attribute part of a tag (`name=value name2="value 2" ...`).
     *
     * @return the attributes keyed by lowercased name, or null when the text contains a
     * character that is not allowed at that position or an attribute name occurs twice.
     */
    private fun parseAttrs(input: String): Map<String, String>? {
        val attrs = Attrs()
        var state = State.NAME
        var nameBegin = 0
        var nameEnd = 0
        var valueBegin = 0
        var valueQuote: Char? = null

        for ((i, c) in input.withIndex()) {
            when (state) {
                State.NAME -> {
                    when {
                        c == '=' -> {
                            nameEnd = i
                            state = State.AFTER_EQ
                        }
                        c.isWhitespace() -> {
                            nameEnd = i
                            state = State.BEFORE_EQ
                        }
                        c in NON_ATTR_NAME_CHARS || c.isISOControl() || !c.isDefined() -> {
                            return null
                        }
                    }
                }
                State.BEFORE_EQ -> {
                    when {
                        c == '=' -> state = State.AFTER_EQ
                        c.isWhitespace() -> {}
                        else -> return null
                    }
                }
                State.AFTER_EQ -> {
                    when {
                        c.isWhitespace() -> {}
                        c == '\'' || c == '"' -> {
                            valueBegin = i + 1
                            valueQuote = c
                            state = State.VALUE
                        }
                        else -> {
                            valueBegin = i
                            valueQuote = null
                            state = State.VALUE
                        }
                    }
                }
                State.VALUE -> {
                    val quote = valueQuote
                    // exclusive end index of the value, or -1 while the value is still running
                    val valueEnd =
                        when {
                            quote != null -> if (c == quote) i else -1
                            c.isWhitespace() -> i
                            i == input.length - 1 -> i + 1
                            c in NON_UNQUOTED_ATTR_VALUE_CHARS -> return null
                            else -> -1
                        }
                    if (valueEnd >= 0) {
                        val attr = input.substring(nameBegin, nameEnd) to input.substring(valueBegin, valueEnd)
                        if (!attrs.add(attr)) {
                            return null
                        }
                        state = State.SPACE
                    }
                }
                State.SPACE -> {
                    if (!c.isWhitespace()) {
                        nameBegin = i
                        state = State.NAME
                    }
                }
            }
        }

        // An unquoted value that starts on the very last character (e.g. `content=x`) never
        // reaches the VALUE branch above, so it has to be flushed here.
        if (state == State.VALUE && valueQuote == null) {
            val attr = input.substring(nameBegin, nameEnd) to input.substring(valueBegin)
            if (!attrs.add(attr)) {
                return null
            }
        }
        return attrs.freeze()
    }
}

Wyświetl plik

@ -22,13 +22,11 @@ package com.vitorpamplona.amethyst.service.previews
import com.vitorpamplona.amethyst.service.HttpClientManager import com.vitorpamplona.amethyst.service.HttpClientManager
import com.vitorpamplona.amethyst.service.checkNotInMainThread import com.vitorpamplona.amethyst.service.checkNotInMainThread
import kotlinx.collections.immutable.toImmutableMap
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.withContext import kotlinx.coroutines.withContext
import okhttp3.MediaType import okhttp3.MediaType
import okhttp3.MediaType.Companion.toMediaType import okhttp3.MediaType.Companion.toMediaType
import okhttp3.Request import okhttp3.Request
import okhttp3.ResponseBody
import okio.BufferedSource import okio.BufferedSource
import okio.ByteString.Companion.decodeHex import okio.ByteString.Companion.decodeHex
import okio.Options import okio.Options
@ -81,7 +79,7 @@ suspend fun getDocument(
"Website returned unknown mimetype: ${it.headers["Content-Type"]}", "Website returned unknown mimetype: ${it.headers["Content-Type"]}",
) )
if (mimeType.type == "text" && mimeType.subtype == "html") { if (mimeType.type == "text" && mimeType.subtype == "html") {
parseHtml(url, it.body, mimeType) parseHtml(url, it.body.source(), mimeType)
} else if (mimeType.type == "image") { } else if (mimeType.type == "image") {
UrlInfoItem(url, image = url, mimeType = mimeType) UrlInfoItem(url, image = url, mimeType = mimeType)
} else if (mimeType.type == "video") { } else if (mimeType.type == "video") {
@ -99,24 +97,22 @@ suspend fun getDocument(
suspend fun parseHtml( suspend fun parseHtml(
url: String, url: String,
body: ResponseBody, source: BufferedSource,
type: MediaType, type: MediaType,
): UrlInfoItem = ): UrlInfoItem =
withContext(Dispatchers.IO) { withContext(Dispatchers.IO) {
val source = body.source()
// sniff charset from Content-Type header or BOM // sniff charset from Content-Type header or BOM
val sniffedCharset = type.charset() ?: source.readBomAsCharset() val sniffedCharset = type.charset() ?: source.readBomAsCharset()
if (sniffedCharset != null) { if (sniffedCharset != null) {
val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents()) val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset))
return@withContext parseUrlInfo(url, metaTags, type) return@withContext extractUrlInfo(url, metaTags, type)
} }
// if sniffing was failed, detect charset from content // if sniffing was failed, detect charset from content
val bodyBytes = source.readByteArray() val bodyBytes = source.readByteArray()
val charset = detectCharset(bodyBytes) val charset = detectCharset(bodyBytes)
val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents()) val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset))
return@withContext parseUrlInfo(url, metaTags, type) return@withContext extractUrlInfo(url, metaTags, type)
} }
// taken from okhttp // taken from okhttp
@ -151,7 +147,7 @@ private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""")
private fun detectCharset(bodyBytes: ByteArray): Charset { private fun detectCharset(bodyBytes: ByteArray): Charset {
// try to detect charset from meta tags parsed from first 1024 bytes of body // try to detect charset from meta tags parsed from first 1024 bytes of body
val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8")) val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8"))
val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence()) val metaTags = MetaTagsParser.parse(firstPart)
metaTags.forEach { meta -> metaTags.forEach { meta ->
val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET) val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
if (charsetAttr.isNotEmpty()) { if (charsetAttr.isNotEmpty()) {
@ -172,7 +168,7 @@ private fun detectCharset(bodyBytes: ByteArray): Charset {
return Charset.forName("utf-8") return Charset.forName("utf-8")
} }
private fun parseUrlInfo( private fun extractUrlInfo(
url: String, url: String,
metaTags: Sequence<MetaTag>, metaTags: Sequence<MetaTag>,
type: MediaType, type: MediaType,
@ -239,200 +235,3 @@ private fun parseUrlInfo(
} }
return UrlInfoItem(url, title, description, image, type) return UrlInfoItem(url, title, description, image, type)
} }
// HTML parsing stuff
// Matches an attribute-less head element, case-insensitively and across line breaks;
// group 1 is the text between the opening and the closing tag.
private val RE_HEAD = Regex("""<head\s*>(.*?)</head\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))

// Returns the text inside the first head element of this string, or "" when there is none.
private fun String.headTagContents(): String {
    val headMatch = RE_HEAD.find(this) ?: return ""
    return headMatch.groupValues[1]
}
// Attribute set of a single meta element. The requested name is lowercased before the
// lookup, so the backing map is expected to hold lowercase keys.
private class MetaTag(private val attrs: Map<String, String>) {
    // Returns the value stored under the lowercased name, or "" when there is none.
    fun attr(name: String): String = attrs.getOrElse(name.lowercase()) { "" }
}
// Mutable collector for the attributes of one HTML element:
// - names are lowercased on insertion, so lookups by lowercase name are case-insensitive
// - inserting a name twice is rejected with IllegalArgumentException
// - a small set of character references in values is resolved on insertion
private class Attrs {
    private val attrs = mutableMapOf<String, String>()

    // Stores the (name, value) pair; throws IllegalArgumentException when the name is already present.
    fun add(attr: Pair<String, String>) {
        val (rawName, rawValue) = attr
        val name = rawName.lowercase()
        require(name !in attrs) { "duplicated attribute name: $name" }
        attrs[name] = rawValue.replace(RE_CHAR_REF) { replaceCharRefs(it) }
    }

    // Snapshot of the collected attributes.
    fun freeze(): Map<String, String> = attrs.toImmutableMap()

    companion object {
        // group 1: reference name, group 2: optional terminating ';'
        val RE_CHAR_REF = Regex("""&(\w+)(;?)""")

        // resolved with or without the terminating ';'
        val BASE_CHAR_REFS =
            mapOf(
                "amp" to "&",
                "AMP" to "&",
                "quot" to "\"",
                "QUOT" to "\"",
                "lt" to "<",
                "LT" to "<",
                "gt" to ">",
                "GT" to ">",
            )

        // resolved only when terminated by ';'
        val CHAR_REFS =
            mapOf(
                "apos" to "'",
                "equals" to "=",
                "grave" to "`",
                "DiacriticalGrave" to "`",
            )

        // Replacement text for one RE_CHAR_REF match; unknown references are returned unchanged.
        fun replaceCharRefs(match: MatchResult): String {
            val (refName, terminator) = match.destructured
            BASE_CHAR_REFS[refName]?.let { return it }
            val terminated = if (terminator.isNotEmpty()) CHAR_REFS[refName] else null
            return terminated ?: match.value
        }
    }
}
// Regex-driven extractor: finds every `<meta ...>` occurrence in a partial HTML document and
// turns its attribute text into a MetaTag. Occurrences whose attributes fail to parse are dropped.
private object MetaTagsParser {
    private val RE_META = Regex("""<meta\s+(.+?)\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')

    // Lazily maps each regex match to a MetaTag; any failure while parsing a match yields no element.
    fun parse(input: String): Sequence<MetaTag> =
        RE_META
            .findAll(input)
            .map { found -> found.groupValues[1] }
            .mapNotNull { attrPart -> runCatching { MetaTag(parseAttrs(attrPart)) }.getOrNull() }

    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }

    // Single place that raises the "invalid attributes" error used by the state machine below.
    private fun invalidAttrs(): Nothing = throw IllegalArgumentException("meta has invalid attributes part")

    // Character-by-character state machine over the attribute text of one meta tag.
    // Throws IllegalArgumentException on a disallowed character or a duplicated attribute name.
    private fun parseAttrs(input: String): Map<String, String> {
        val collected = Attrs()
        var state = State.NAME
        var nameStart = 0
        var nameStop = 0
        var valueStart = 0
        var openQuote: Char? = null

        for ((idx, ch) in input.withIndex()) {
            when (state) {
                State.NAME -> {
                    if (ch == '=') {
                        nameStop = idx
                        state = State.AFTER_EQ
                    } else if (ch.isWhitespace()) {
                        nameStop = idx
                        state = State.BEFORE_EQ
                    } else if (ch in NON_ATTR_NAME_CHARS || ch.isISOControl() || !ch.isDefined()) {
                        invalidAttrs()
                    }
                }
                State.BEFORE_EQ -> {
                    if (ch == '=') {
                        state = State.AFTER_EQ
                    } else if (!ch.isWhitespace()) {
                        invalidAttrs()
                    }
                }
                State.AFTER_EQ -> {
                    if (!ch.isWhitespace()) {
                        val quoted = ch == '\'' || ch == '"'
                        // a quoted value starts after the quote, an unquoted one on this character
                        valueStart = if (quoted) idx + 1 else idx
                        openQuote = if (quoted) ch else null
                        state = State.VALUE
                    }
                }
                State.VALUE -> {
                    val quote = openQuote
                    // index range of the finished value, or null while the value is still running
                    val valueRange: IntRange? =
                        when {
                            quote != null -> if (ch == quote) valueStart until idx else null
                            ch.isWhitespace() -> valueStart until idx
                            idx == input.length - 1 -> valueStart..idx
                            ch in NON_UNQUOTED_ATTR_VALUE_CHARS -> invalidAttrs()
                            else -> null
                        }
                    if (valueRange != null) {
                        collected.add(input.slice(nameStart until nameStop) to input.slice(valueRange))
                        state = State.SPACE
                    }
                }
                State.SPACE -> {
                    if (!ch.isWhitespace()) {
                        nameStart = idx
                        state = State.NAME
                    }
                }
            }
        }
        return collected.freeze()
    }
}

Wyświetl plik

@ -0,0 +1,81 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.service.previews
import org.junit.Assert.assertEquals
import org.junit.Test
/**
 * Checks [MetaTagsParser.parse] against a small document that covers the common
 * meta tag shapes plus a set of edge cases (see the inline comment in the input).
 */
class MetaTagsParserTest {
    @Test
    fun testParse() {
        val input =
            """<html>
            | <head>
            | <meta charset="utf-8">
            | <meta http-equiv="content-type" content="text/html; charset=utf-8">
            | <meta property="og:title" content=title>
            | <meta property="og:description" content='description'>
            | <meta property="og:image" content="https://example.com/img/foo.png">
            | <!-- edge cases -->
            | <meta
            | name="newline"
            | content="newline"
            | >
            | <meta name="space before gt" >
            | <meta name ="space before =">
            | <meta name= "space after =">
            | <META NAME="CAPITAL">
            | <meta name="character reference" content="&lt;meta&gt;">
            | <meta name="attr value with end of head doesn't harm" content="<head>bang!</head>">
            | <meta name="ignore tags with duplicated attr" name="dup">
            | </head>
            | <body>
            | <meta name="ignore meta tags in body">
            | </body>
            |</html>
            """.trimMargin()

        // One entry per meta tag that must be produced, in document order. The tag with a
        // duplicated attribute and the tag inside <body> must not show up.
        val exp =
            listOf(
                listOf("charset" to "utf-8"),
                listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"),
                listOf("property" to "og:title", "content" to "title"),
                listOf("property" to "og:description", "content" to "description"),
                listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"),
                listOf("name" to "newline", "content" to "newline"),
                listOf("name" to "space before gt"),
                listOf("name" to "space before ="),
                listOf("name" to "space after ="),
                listOf("name" to "CAPITAL"),
                listOf("name" to "character reference", "content" to "<meta>"),
                listOf("name" to "attr value with end of head doesn't harm", "content" to "<head>bang!</head>"),
            )

        val metaTags = MetaTagsParser.parse(input).toList()

        // The size check comes first so the zip below cannot silently drop unmatched entries.
        assertEquals(exp.size, metaTags.size)
        metaTags.zip(exp).forEachIndexed { index, (meta, expAttrs) ->
            expAttrs.forEach { (name, expValue) ->
                // the message pinpoints the failing tag and attribute
                assertEquals("meta tag #$index, attribute \"$name\"", expValue, meta.attr(name))
            }
        }
    }
}