kopia lustrzana https://github.com/vitorpamplona/amethyst
fix garbled URL preview for non-UTF-8 HTML
rodzic
7eefbee0e3
commit
3434c31487
|
@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext
|
|||
import okhttp3.MediaType
|
||||
import okhttp3.MediaType.Companion.toMediaType
|
||||
import okhttp3.Request
|
||||
import okhttp3.ResponseBody
|
||||
import okio.BufferedSource
|
||||
import okio.ByteString.Companion.decodeHex
|
||||
import okio.Options
|
||||
import org.jsoup.Jsoup
|
||||
import org.jsoup.nodes.Document
|
||||
import java.io.ByteArrayInputStream
|
||||
import java.io.IOException
|
||||
import java.nio.charset.Charset
|
||||
|
||||
// Element and attribute names used when scanning <meta> tags.
// Despite the ATTRIBUTE_VALUE_ prefix, these constants are attribute *names*:
// they are passed to Element.attr(...) as the key to look up.
private const val ELEMENT_TAG_META = "meta"
|
||||
// <meta charset="..."> — read when detecting the document charset.
private const val ATTRIBUTE_VALUE_CHARSET = "charset"
|
||||
// <meta http-equiv="..."> — compared (lowercased) against "content-type" during charset detection.
private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv"
|
||||
// The three attributes below are each checked as the key that identifies a title/description/image meta tag.
private const val ATTRIBUTE_VALUE_PROPERTY = "property"
|
||||
private const val ATTRIBUTE_VALUE_NAME = "name"
|
||||
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
|
||||
|
@ -99,10 +108,8 @@ suspend fun getDocument(
|
|||
?: throw IllegalArgumentException(
|
||||
"Website returned unknown mimetype: ${it.headers.get("Content-Type")}",
|
||||
)
|
||||
|
||||
if (mimeType.type == "text" && mimeType.subtype == "html") {
|
||||
val document = Jsoup.parse(it.body.string())
|
||||
parseHtml(url, document, mimeType)
|
||||
parseHtml(url, it.body, mimeType)
|
||||
} else if (mimeType.type == "image") {
|
||||
UrlInfoItem(url, image = url, mimeType = mimeType)
|
||||
} else if (mimeType.type == "video") {
|
||||
|
@ -120,65 +127,147 @@ suspend fun getDocument(
|
|||
|
||||
suspend fun parseHtml(
|
||||
url: String,
|
||||
document: Document,
|
||||
body: ResponseBody,
|
||||
type: MediaType,
|
||||
): UrlInfoItem =
|
||||
withContext(Dispatchers.IO) {
|
||||
val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
|
||||
val source = body.source()
|
||||
|
||||
var title: String = ""
|
||||
var description: String = ""
|
||||
var image: String = ""
|
||||
// sniff charset from Content-Type header or BOM
|
||||
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
|
||||
if (sniffedCharset != null) {
|
||||
val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url)
|
||||
return@withContext parseUrlInfo(url, doc, type)
|
||||
}
|
||||
|
||||
metaTags.forEach {
|
||||
when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) {
|
||||
in META_X_TITLE ->
|
||||
if (title.isEmpty()) {
|
||||
title = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_DESCRIPTION ->
|
||||
if (description.isEmpty()) {
|
||||
description = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_IMAGE ->
|
||||
if (image.isEmpty()) {
|
||||
image = it.attr(CONTENT)
|
||||
}
|
||||
}
|
||||
// if sniffing failed, detect charset from content
|
||||
val bodyBytes = source.readByteArray()
|
||||
val charset = detectCharset(bodyBytes, url)
|
||||
val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url)
|
||||
return@withContext parseUrlInfo(url, doc, type)
|
||||
}
|
||||
|
||||
when (it.attr(ATTRIBUTE_VALUE_NAME)) {
|
||||
in META_X_TITLE ->
|
||||
if (title.isEmpty()) {
|
||||
title = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_DESCRIPTION ->
|
||||
if (description.isEmpty()) {
|
||||
description = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_IMAGE ->
|
||||
if (image.isEmpty()) {
|
||||
image = it.attr(CONTENT)
|
||||
}
|
||||
}
|
||||
/**
 * Byte-order marks recognised at the very start of a response body.
 *
 * The position of each entry is significant: `readBomAsCharset` translates the index returned by
 * `select(UNICODE_BOMS)` into a charset (0 = UTF-8, 1 = UTF-16BE, 2 = UTF-16LE, 3 = UTF-32BE,
 * 4 = UTF-32LE). Do not reorder entries without updating that mapping.
 */
private val UNICODE_BOMS =
    Options.of(
        // UTF-8
        "efbbbf".decodeHex(),
        // UTF-16BE
        "feff".decodeHex(),
        // UTF-16LE
        "fffe".decodeHex(),
        // UTF-32BE: U+FEFF as four big-endian bytes (was "0000ffff", which is not a BOM).
        "0000feff".decodeHex(),
        // UTF-32LE: U+FEFF as four little-endian bytes (was "ffff0000", which is not a BOM).
        // NOTE(review): Okio's select() returns the first option that is a prefix of the input, so
        // the UTF-16LE entry above ("fffe") presumably still wins for a UTF-32LE stream. Detecting
        // UTF-32LE needs this entry moved ahead of UTF-16LE together with the index mapping in
        // readBomAsCharset — confirm and change both in one step.
        "fffe0000".decodeHex(),
    )
|
||||
|
||||
when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) {
|
||||
in META_X_TITLE ->
|
||||
if (title.isEmpty()) {
|
||||
title = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_DESCRIPTION ->
|
||||
if (description.isEmpty()) {
|
||||
description = it.attr(CONTENT)
|
||||
}
|
||||
in META_X_IMAGE ->
|
||||
if (image.isEmpty()) {
|
||||
image = it.attr(CONTENT)
|
||||
}
|
||||
}
|
||||
/**
 * Matches the beginning of this source against [UNICODE_BOMS] and translates the hit into a charset.
 *
 * The numeric branches mirror the order of the entries in [UNICODE_BOMS].
 *
 * @return the charset belonging to the matched byte-order mark, or `null` when `select` reports
 *   no match (-1).
 * @throws IOException declared for callers; presumably propagated from reading the underlying source.
 */
@Throws(IOException::class)
private fun BufferedSource.readBomAsCharset(): Charset? =
    when (select(UNICODE_BOMS)) {
        -1 -> null
        0 -> Charsets.UTF_8
        1 -> Charsets.UTF_16BE
        2 -> Charsets.UTF_16LE
        3 -> Charsets.UTF_32BE
        4 -> Charsets.UTF_32LE
        // select() can only return -1 or an index into UNICODE_BOMS.
        else -> throw AssertionError()
    }
|
||||
|
||||
if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) {
|
||||
return@withContext UrlInfoItem(url, title, description, image, type)
|
||||
// Extracts the charset parameter from a Content-Type style value such as
// "text/html; charset=Shift_JIS". Matched case-insensitively because pages also write
// "Charset=" / "CHARSET=".
private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""", RegexOption.IGNORE_CASE)

/**
 * Determines the charset an HTML document declares about itself in its `<meta>` tags.
 *
 * The bytes are first parsed tentatively as UTF-8, which is enough to read the ASCII-only
 * charset declarations. Each `<meta>` element is then checked, in document order, for
 * 1. a `charset` attribute, and
 * 2. an `http-equiv="content-type"` declaration whose `content` carries a `charset=` parameter.
 *
 * The first declaration that names a charset supported by this JVM wins.
 *
 * @param bodyBytes raw, undecoded response body.
 * @param url base URI handed to Jsoup for the tentative parse.
 * @return the declared charset, or UTF-8 when nothing usable is declared.
 */
private fun detectCharset(
    bodyBytes: ByteArray,
    url: String,
): Charset {
    // tentatively decode response body as UTF-8
    val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url)

    for (meta in tentativeDoc.getElementsByTag(ELEMENT_TAG_META)) {
        // <meta charset="...">
        charsetForNameOrNull(meta.attr(ATTRIBUTE_VALUE_CHARSET))?.let { return it }

        // <meta http-equiv="Content-Type" content="text/html; charset=...">
        if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") {
            RE_CONTENT_TYPE_CHARSET
                .find(meta.attr(CONTENT))
                ?.let { match -> charsetForNameOrNull(match.groupValues[1]) }
                ?.let { return it }
        }
    }
    return Charsets.UTF_8
}

/**
 * Resolves a charset name taken from markup, tolerating surrounding whitespace and quotes.
 *
 * @return the charset, or `null` when the name is blank, malformed, or not supported by this JVM.
 */
private fun charsetForNameOrNull(rawName: String): Charset? {
    val name = rawName.trim().trim('"', '\'').trim()
    if (name.isEmpty()) return null
    return try {
        Charset.forName(name)
    } catch (e: IllegalArgumentException) {
        // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
        null
    }
}
|
||||
|
||||
/**
 * Collects preview metadata (title, description, image) from the `<meta>` tags of [document].
 *
 * For every `<meta>` element the `property`, `name` and `itemprop` attributes are inspected in
 * that order. When an attribute's value is contained in `META_X_TITLE`, `META_X_DESCRIPTION` or
 * `META_X_IMAGE` (declared outside this function), the element's `CONTENT` attribute fills the
 * corresponding field, but only while that field is still empty. Scanning stops as soon as all
 * three fields are non-empty.
 *
 * @param url address the document was loaded from; passed through to the result.
 * @param document parsed HTML document to scan.
 * @param type media type of the response; passed through to the result.
 * @return a [UrlInfoItem] built from whatever was found; fields not found stay empty strings.
 */
private fun parseUrlInfo(
    url: String,
    document: Document,
    type: MediaType,
): UrlInfoItem {
    // Attributes that may carry the metadata key, in the order they are checked.
    val keyAttributes =
        listOf(
            ATTRIBUTE_VALUE_PROPERTY,
            ATTRIBUTE_VALUE_NAME,
            ATTRIBUTE_VALUE_ITEMPROP,
        )

    var foundTitle = ""
    var foundDescription = ""
    var foundImage = ""

    for (meta in document.getElementsByTag(ELEMENT_TAG_META)) {
        for (keyAttribute in keyAttributes) {
            when (meta.attr(keyAttribute)) {
                in META_X_TITLE -> {
                    if (foundTitle.isEmpty()) {
                        foundTitle = meta.attr(CONTENT)
                    }
                }

                in META_X_DESCRIPTION -> {
                    if (foundDescription.isEmpty()) {
                        foundDescription = meta.attr(CONTENT)
                    }
                }

                in META_X_IMAGE -> {
                    if (foundImage.isEmpty()) {
                        foundImage = meta.attr(CONTENT)
                    }
                }
            }
        }

        // Nothing left to look for once every field has a value.
        val complete = foundTitle.isNotEmpty() && foundDescription.isNotEmpty() && foundImage.isNotEmpty()
        if (complete) {
            break
        }
    }

    return UrlInfoItem(url, foundTitle, foundDescription, foundImage, type)
}
|
||||
|
|
Ładowanie…
Reference in New Issue