Merge pull request #818 from jiftechnify/url-preview

Fix garbled URL preview for non UTF-8 HTML, with optimization
pull/819/head
Vitor Pamplona 2024-03-26 13:40:15 -04:00 zatwierdzone przez GitHub
commit d61d684a27
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
5 zmienionych plików z 529 dodań i 89 usunięć

Wyświetl plik

@ -205,9 +205,6 @@ dependencies {
// Websockets API
implementation libs.okhttp
// HTML Parsing for Link Preview
implementation libs.jsoup
// Encrypted Key Storage
implementation libs.androidx.security.crypto.ktx

Wyświetl plik

@ -0,0 +1,311 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.service.previews
import kotlinx.collections.immutable.toImmutableMap
import java.lang.StringBuilder
/**
 * Attributes of a single `<meta>` element.
 *
 * Lookups lower-case the requested name before consulting the map, so the map is expected to be
 * keyed by lower-case attribute names.
 */
internal data class MetaTag(private val attrs: Map<String, String>) {
    /** Returns the value stored for [name], or an empty string when the attribute is absent. */
    fun attr(name: String): String {
        val key = name.lowercase()
        return attrs[key].orEmpty()
    }
}
// parse a partial HTML document and extract meta tags
internal object MetaTagsParser {
private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
/**
 * Lazily yields the `<meta>` tags found in [input], stopping at the first `</head>` tag.
 * A meta tag whose attribute text cannot be parsed is skipped.
 */
fun parse(input: String): Sequence<MetaTag> =
    sequence {
        val scanner = TagScanner(input)
        while (!scanner.exhausted()) {
            val tag = scanner.nextTag() ?: continue
            when (tag.name) {
                // nothing after the end of <head> is of interest
                "/head" -> break
                "meta" -> parseAttrs(tag.attrPart)?.let { yield(MetaTag(it)) }
            }
        }
    }
/**
 * A tag as found in the input: the lower-cased tag name (prefixed with `/` for an end tag) and
 * the raw, unparsed text between the tag name and the end of the tag.
 */
private data class RawTag(val name: String, val attrPart: String)

/**
 * Cursor over [input] that extracts tags one at a time without building a document tree.
 *
 * Every read is bounds-checked, so truncated input (for example a document cut off in the middle
 * of a tag, or text following the last tag) ends the scan instead of throwing.
 *
 * Quotes are tracked for everything that follows a tag name, so a `>` inside a quoted attribute
 * value does not end the tag. The same tracking also applies to non-element markup such as
 * comments, because the scanner does not distinguish them before reading to the closing `>`.
 */
private class TagScanner(private val input: String) {
    var p = 0

    /** True once the cursor has moved past the last character of the input. */
    fun exhausted(): Boolean = p >= input.length

    // Callers must check exhausted() before calling peek() or consume().
    private fun peek(): Char = input[p]

    private fun consume(): Char = input[p++]

    /** Consumes the next character only if it equals [c]; never reads past the end. */
    private fun consumeChar(c: Char): Boolean {
        if (!exhausted() && peek() == c) {
            p++
            return true
        }
        return false
    }

    private fun skipSpaces() {
        while (!exhausted() && peek().isWhitespace()) {
            p++
        }
    }

    private fun skipUntil(c: Char) {
        while (!exhausted() && peek() != c) {
            p++
        }
    }

    private fun readWhile(pred: (Char) -> Boolean): String {
        val start = p
        while (!exhausted() && pred(peek())) {
            p++
        }
        return input.substring(start, p)
    }

    // Same acceptance as the pattern [0-9a-zA-Z]+ applied to the whole name.
    private fun isValidTagName(name: String): Boolean =
        name.isNotEmpty() && name.all { it in '0'..'9' || it in 'a'..'z' || it in 'A'..'Z' }

    /**
     * Advances to the next `<` and reads the tag that starts there.
     *
     * @return the tag, or `null` when no `<` is left in the input or when the text after `<` is
     *   not a plain alphanumeric tag name (comments, doctype, ...). In both cases the cursor has
     *   moved forward, so callers can keep looping until [exhausted].
     */
    fun nextTag(): RawTag? {
        skipUntil('<')
        if (exhausted()) {
            // only text remained after the previous tag
            return null
        }
        consume()

        // read tag name
        val isEndTag = consumeChar('/')
        val tagName = readWhile { !it.isWhitespace() && it != '>' }
        skipSpaces()

        // read until end of tag
        val attrPart = StringBuilder()
        var quote: Char? = null
        while (!exhausted()) {
            val c = consume()
            when {
                // `/>` out of quote -> end of tag (a trailing `/` at end of input is kept as text)
                quote == null && c == '/' && !exhausted() && peek() == '>' -> {
                    consume()
                    break
                }
                // `>` out of quote -> end of tag
                quote == null && c == '>' -> {
                    break
                }
                // entering quote
                quote == null && (c == '\'' || c == '"') -> {
                    quote = c
                }
                // leaving quote
                quote != null && c == quote -> {
                    quote = null
                }
            }
            attrPart.append(c)
        }

        if (!isValidTagName(tagName)) {
            return null
        }
        val fullName = if (isEndTag) "/$tagName" else tagName
        return RawTag(fullName.lowercase(), attrPart.toString())
    }
}
/**
 * Accumulates the attributes of one HTML element, with some extra rules:
 * - attribute names are stored lower-cased, so matching is case-insensitive
 * - an attribute name can be added only once
 * - commonly used character references in attribute values are resolved when the value is added
 */
private class Attrs {
    private val attrs = mutableMapOf<String, String>()

    /**
     * Adds one `name to value` pair.
     *
     * @throws IllegalArgumentException if the lower-cased name has already been added
     */
    fun add(attr: Pair<String, String>) {
        val (rawName, rawValue) = attr
        val name = rawName.lowercase()
        require(!attrs.containsKey(name)) { "duplicated attribute name: $name" }
        attrs[name] = rawValue.replace(RE_CHAR_REF) { replaceCharRefs(it) }
    }

    /** Returns an immutable snapshot of the attributes collected so far. */
    fun freeze(): Map<String, String> = attrs.toImmutableMap()

    companion object {
        // `&name` with an optional terminating `;`
        val RE_CHAR_REF = Regex("""&(\w+)(;?)""")

        // references that are resolved with or without the terminating `;`
        val BASE_CHAR_REFS =
            mapOf(
                "amp" to "&",
                "AMP" to "&",
                "quot" to "\"",
                "QUOT" to "\"",
                "lt" to "<",
                "LT" to "<",
                "gt" to ">",
                "GT" to ">",
            )

        // references that are resolved only when terminated by `;`
        val CHAR_REFS =
            mapOf(
                "apos" to "'",
                "equals" to "=",
                "grave" to "`",
                "DiacriticalGrave" to "`",
            )

        /** Returns the replacement for one [RE_CHAR_REF] match, or the matched text if unknown. */
        fun replaceCharRefs(match: MatchResult): String {
            val (_, refName, terminator) = match.groupValues
            BASE_CHAR_REFS[refName]?.let { return it }
            if (terminator.isNotEmpty()) {
                CHAR_REFS[refName]?.let { return it }
            }
            return match.value
        }
    }
}
// States of the attribute-text state machine in parseAttrs.
private enum class State {
// reading an attribute name
NAME,
// the name was ended by whitespace; only more whitespace or '=' may follow
BEFORE_EQ,
// '=' has been read; waiting for the first character of the value
AFTER_EQ,
// reading an attribute value (quoted or unquoted)
VALUE,
// a value has just been completed; waiting for the start of the next name
SPACE,
}
/**
 * Parses the raw attribute text of a tag (the text between the tag name and the end of the tag)
 * into a map from lower-cased attribute name to value.
 *
 * Values may be double-quoted, single-quoted or unquoted. An unquoted value ends at whitespace
 * or at the end of [input]; this includes a one-character value that is the very last character
 * of the input (e.g. `content=x`), which used to be dropped.
 *
 * @return the attributes, or `null` when the text is rejected: a forbidden character in a name
 *   or inside an unquoted value, something other than `=` following a name and whitespace, or an
 *   attribute name that occurs more than once.
 */
private fun parseAttrs(input: String): Map<String, String>? {
    val attrs = Attrs()
    var state = State.NAME
    var nameBegin = 0
    var nameEnd = 0
    var valueBegin = 0
    var valueQuote: Char? = null

    // Stores input[nameBegin, nameEnd) -> input[valueBegin, valueEnd); false when Attrs rejects it.
    fun commit(valueEnd: Int): Boolean =
        runCatching {
            attrs.add(Pair(input.substring(nameBegin, nameEnd), input.substring(valueBegin, valueEnd)))
        }.isSuccess

    for (i in input.indices) {
        val c = input[i]
        when (state) {
            State.NAME -> {
                when {
                    c == '=' -> {
                        nameEnd = i
                        state = State.AFTER_EQ
                    }
                    c.isWhitespace() -> {
                        nameEnd = i
                        state = State.BEFORE_EQ
                    }
                    NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> {
                        return null
                    }
                }
            }
            State.BEFORE_EQ -> {
                when {
                    c == '=' -> {
                        state = State.AFTER_EQ
                    }
                    c.isWhitespace() -> {}
                    else -> return null
                }
            }
            State.AFTER_EQ -> {
                when {
                    c.isWhitespace() -> {}
                    c == '\'' || c == '"' -> {
                        // the value starts after the opening quote
                        valueBegin = i + 1
                        valueQuote = c
                        state = State.VALUE
                    }
                    else -> {
                        valueBegin = i
                        valueQuote = null
                        state = State.VALUE
                    }
                }
            }
            State.VALUE -> {
                // exclusive end index of the value if it ends at this character, else null
                val valueEnd: Int? =
                    when {
                        valueQuote != null -> if (c == valueQuote) i else null
                        c.isWhitespace() -> i
                        // an unquoted value running to the end of input includes this character
                        i == input.length - 1 -> i + 1
                        NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> return null
                        else -> null
                    }
                if (valueEnd != null) {
                    if (!commit(valueEnd)) {
                        return null
                    }
                    state = State.SPACE
                }
            }
            State.SPACE -> {
                if (!c.isWhitespace()) {
                    nameBegin = i
                    state = State.NAME
                }
            }
        }
    }

    // Still inside an unquoted value after the loop: the value started on the last character of
    // the input, so the VALUE branch never saw it. Record that one-character value.
    if (state == State.VALUE && valueQuote == null) {
        if (!commit(input.length)) {
            return null
        }
    }
    return attrs.freeze()
}
}

Wyświetl plik

@ -27,60 +27,39 @@ import kotlinx.coroutines.withContext
import okhttp3.MediaType
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.Request
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import okio.BufferedSource
import okio.ByteString.Companion.decodeHex
import okio.Options
import java.nio.charset.Charset
// Element and attribute names looked up on parsed tags.
private const val ELEMENT_TAG_META = "meta"
private const val ATTRIBUTE_VALUE_PROPERTY = "property"
private const val ATTRIBUTE_VALUE_NAME = "name"
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
private const val ATTRIBUTE_VALUE_CHARSET = "charset"
private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
// values of a <meta> property / name / itemprop attribute whose content is taken as the title
private val META_X_TITLE =
arrayOf(
"og:title",
"\"og:title\"",
"'og:title'",
"name",
"\"name\"",
"'name'",
"twitter:title",
"\"twitter:title\"",
"'twitter:title'",
"title",
"\"title\"",
"'title'",
)
// values of a <meta> property / name / itemprop attribute whose content is taken as the description
private val META_X_DESCRIPTION =
arrayOf(
"og:description",
"\"og:description\"",
"'og:description'",
"description",
"\"description\"",
"'description'",
"twitter:description",
"\"twitter:description\"",
"'twitter:description'",
"description",
"\"description\"",
"'description'",
)
// values of a <meta> property / name / itemprop attribute whose content is taken as the image
private val META_X_IMAGE =
arrayOf(
"og:image",
"\"og:image\"",
"'og:image'",
"image",
"\"image\"",
"'image'",
"twitter:image",
"\"twitter:image\"",
"'twitter:image'",
"image",
)
// attribute of a <meta> tag that holds the value read for the title, description and image
private const val CONTENT = "content"
@ -95,14 +74,12 @@ suspend fun getDocument(
checkNotInMainThread()
if (it.isSuccessful) {
val mimeType =
it.headers.get("Content-Type")?.toMediaType()
it.headers["Content-Type"]?.toMediaType()
?: throw IllegalArgumentException(
"Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
"Website returned unknown mimetype: ${it.headers["Content-Type"]}",
)
if (mimeType.type == "text" && mimeType.subtype == "html") {
val document = Jsoup.parse(it.body.string())
parseHtml(url, document, mimeType)
parseHtml(url, it.body.source(), mimeType)
} else if (mimeType.type == "image") {
UrlInfoItem(url, image = url, mimeType = mimeType)
} else if (mimeType.type == "video") {
@ -120,65 +97,141 @@ suspend fun getDocument(
/**
 * Decodes an HTML response body and extracts link-preview information from its meta tags.
 *
 * The charset used for decoding is resolved in this order: the charset given by [type], then a
 * byte order mark at the start of [source], then whatever `detectCharset` finds in the body.
 *
 * @param url address the body was fetched from; passed through to the result
 * @param source the response body
 * @param type media type of the response
 */
suspend fun parseHtml(
    url: String,
    source: BufferedSource,
    type: MediaType,
): UrlInfoItem =
    withContext(Dispatchers.IO) {
        // sniff charset from Content-Type header or BOM
        val sniffedCharset = type.charset() ?: source.readBomAsCharset()
        val bodyBytes = source.readByteArray()
        // if sniffing failed, detect charset from content
        val charset = sniffedCharset ?: detectCharset(bodyBytes)
        val html = bodyBytes.toString(charset)
        extractUrlInfo(url, MetaTagsParser.parse(html), type)
    }
metaTags.forEach {
when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
// Byte order marks, adapted from okhttp. The UTF-32 entries encode U+FEFF in four bytes
// (00 00 FE FF big-endian, FF FE 00 00 little-endian).
// The position of each entry matters: readBomAsCharset maps the selected index to a charset.
private val UNICODE_BOMS =
    Options.of(
        // UTF-8
        "efbbbf".decodeHex(),
        // UTF-16BE
        "feff".decodeHex(),
        // UTF-16LE
        "fffe".decodeHex(),
        // UTF-32BE
        "0000feff".decodeHex(),
        // UTF-32LE
        // NOTE(review): the UTF-16LE entry above is a prefix of this one, so it presumably wins
        // for UTF-32LE input; confirm against okio's select() semantics. Fixing that requires
        // reordering the entries together with the index mapping in readBomAsCharset.
        "fffe0000".decodeHex(),
    )
when (it.attr(ATTRIBUTE_VALUE_NAME)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
/**
 * Maps the result of selecting from UNICODE_BOMS on this source to the corresponding charset.
 *
 * @return the charset for the matched entry, or `null` when select() reports no match (-1)
 */
private fun BufferedSource.readBomAsCharset(): Charset? =
    when (select(UNICODE_BOMS)) {
        -1 -> null
        0 -> Charsets.UTF_8
        1 -> Charsets.UTF_16BE
        2 -> Charsets.UTF_16LE
        3 -> Charsets.UTF_32BE
        4 -> Charsets.UTF_32LE
        // select() can only return -1 or an index into UNICODE_BOMS
        else -> throw AssertionError()
    }
when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
in META_X_TITLE ->
if (title.isEmpty()) {
title = it.attr(CONTENT)
}
in META_X_DESCRIPTION ->
if (description.isEmpty()) {
description = it.attr(CONTENT)
}
in META_X_IMAGE ->
if (image.isEmpty()) {
image = it.attr(CONTENT)
}
}
private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""")
if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
return@withContext UrlInfoItem(url, title, description, image, type)
/**
 * Looks for a charset declaration in the meta tags found in the first 1024 bytes of the body
 * (or in the whole body when it is shorter than that).
 *
 * Both `<meta charset=...>` and `<meta http-equiv="content-type" content="...; charset=...">`
 * are honoured; the first declaration that names a charset known to the JVM wins.
 *
 * @return the declared charset, or UTF-8 when none is declared or none is usable
 */
private fun detectCharset(bodyBytes: ByteArray): Charset {
    // Clamp the prefix length: decoding 1024 bytes of a shorter body would throw.
    val prefixLength = minOf(bodyBytes.size, 1024)
    val firstPart = String(bodyBytes, 0, prefixLength, Charsets.UTF_8)

    for (meta in MetaTagsParser.parse(firstPart)) {
        val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
        if (charsetAttr.isNotEmpty()) {
            charsetForNameOrNull(charsetAttr)?.let { return it }
        }
        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
            RE_CONTENT_TYPE_CHARSET.find(meta.attr(CONTENT))
                ?.let { charsetForNameOrNull(it.groupValues[1]) }
                ?.let { return it }
        }
    }

    // defaults to UTF-8
    return Charsets.UTF_8
}

/** Resolves a charset name, ignoring surrounding whitespace; `null` if the name is illegal or unsupported. */
private fun charsetForNameOrNull(name: String): Charset? = runCatching { Charset.forName(name.trim()) }.getOrNull()
/**
 * Builds the preview item for [url] from its meta tags.
 *
 * For every tag, the `property`, `name` and `itemprop` attributes are checked in that order
 * against the known title / description / image keys; the first value found for each field is
 * kept. Iteration over [metaTags] stops as soon as all three fields are filled.
 */
private fun extractUrlInfo(
    url: String,
    metaTags: Sequence<MetaTag>,
    type: MediaType,
): UrlInfoItem {
    var title = ""
    var description = ""
    var image = ""

    val keyAttributes = arrayOf(ATTRIBUTE_VALUE_PROPERTY, ATTRIBUTE_VALUE_NAME, ATTRIBUTE_VALUE_ITEMPROP)

    for (meta in metaTags) {
        for (keyAttribute in keyAttributes) {
            when (meta.attr(keyAttribute)) {
                in META_X_TITLE -> {
                    if (title.isEmpty()) {
                        title = meta.attr(CONTENT)
                    }
                }
                in META_X_DESCRIPTION -> {
                    if (description.isEmpty()) {
                        description = meta.attr(CONTENT)
                    }
                }
                in META_X_IMAGE -> {
                    if (image.isEmpty()) {
                        image = meta.attr(CONTENT)
                    }
                }
            }
        }

        val complete = title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()
        if (complete) {
            // everything needed has been found; do not parse further tags
            break
        }
    }

    return UrlInfoItem(url, title, description, image, type)
}

Wyświetl plik

@ -0,0 +1,81 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.service.previews
import org.junit.Assert.assertEquals
import org.junit.Test
/** Checks which meta tags [MetaTagsParser.parse] yields for a document with assorted edge cases. */
class MetaTagsParserTest {
    @Test
    fun testParse() {
        val html =
            """<html>
            | <head>
            | <meta charset="utf-8">
            | <meta http-equiv="content-type" content="text/html; charset=utf-8">
            | <meta property="og:title" content=title>
            | <meta property="og:description" content='description'>
            | <meta property="og:image" content="https://example.com/img/foo.png">
            | <!-- edge cases -->
            | <meta
            | name="newline"
            | content="newline"
            | >
            | <meta name="space before gt" >
            | <meta name ="space before =">
            | <meta name= "space after =">
            | <META NAME="CAPITAL">
            | <meta name="character reference" content="&lt;meta&gt;">
            | <meta name="attr value with end of head doesn't harm" content="<head>bang!</head>">
            | <meta name="ignore tags with duplicated attr" name="dup">
            | </head>
            | <body>
            | <meta name="ignore meta tags in body">
            | </body>
            |</html>
            """.trimMargin()

        // one entry per meta tag expected from the parser, in document order
        val expectedTags =
            listOf(
                listOf("charset" to "utf-8"),
                listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"),
                listOf("property" to "og:title", "content" to "title"),
                listOf("property" to "og:description", "content" to "description"),
                listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"),
                listOf("name" to "newline", "content" to "newline"),
                listOf("name" to "space before gt"),
                listOf("name" to "space before ="),
                listOf("name" to "space after ="),
                listOf("name" to "CAPITAL"),
                listOf("name" to "character reference", "content" to "<meta>"),
                listOf("name" to "attr value with end of head doesn't harm", "content" to "<head>bang!</head>"),
            )

        val parsedTags = MetaTagsParser.parse(html).toList()
        println(parsedTags)

        assertEquals(expectedTags.size, parsedTags.size)
        for ((parsed, expectedAttrs) in parsedTags.zip(expectedTags)) {
            for ((attrName, expectedValue) in expectedAttrs) {
                assertEquals(expectedValue, parsed.attr(attrName))
            }
        }
    }
}

Wyświetl plik

@ -20,7 +20,6 @@ fragmentKtx = "1.6.2"
gms = "4.4.1"
jacksonModuleKotlin = "2.17.0"
jna = "5.14.0"
jsoup = "1.17.2"
junit = "4.13.2"
kotlin = "1.9.22"
kotlinxCollectionsImmutable = "0.3.7"
@ -93,7 +92,6 @@ google-mlkit-language-id = { group = "com.google.mlkit", name = "language-id", v
google-mlkit-translate = { group = "com.google.mlkit", name = "translate", version.ref = "translate" }
jackson-module-kotlin = { group = "com.fasterxml.jackson.module", name = "jackson-module-kotlin", version.ref = "jacksonModuleKotlin" }
jna = { group = "net.java.dev.jna", name = "jna", version.ref = "jna" }
jsoup = { group = "org.jsoup", name = "jsoup", version.ref = "jsoup" }
junit = { group = "junit", name = "junit", version.ref = "junit" }
kotlinx-collections-immutable = { group = "org.jetbrains.kotlinx", name = "kotlinx-collections-immutable", version.ref = "kotlinxCollectionsImmutable" }
lazysodium-android = { group = "com.goterl", name = "lazysodium-android", version.ref = "lazysodiumAndroid" }