# auto-archiver/src/auto_archiver/formatters/html_formatter.py
#
# 101 lines
# 3.2 KiB
# Python
# Raw | Normal view | History

from __future__ import annotations
from dataclasses import dataclass
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
2023-01-22 00:48:09 +00:00
from urllib.parse import quote
2023-02-17 15:45:58 +00:00
from loguru import logger
import minify_html, json
import base64
2023-02-08 11:22:38 +00:00
from ..version import __version__
from ..core import Metadata, Media, ArchivingContext
2023-01-21 19:01:02 +00:00
from . import Formatter
2023-05-09 10:17:44 +00:00
from ..enrichers import HashEnricher
from ..utils.misc import random_str
2023-01-21 19:01:02 +00:00
@dataclass
class HtmlFormatter(Formatter):
    """Formatter that renders an archived item into a single HTML file.

    The page is rendered with the Jinja2 template ``html_template.html``
    loaded from the ``templates/`` directory that sits next to this module.
    """

    name = "html_formatter"

    def __init__(self, config: dict) -> None:
        """Set up the Jinja2 environment, its filters and the page template.

        Args:
            config: configuration dictionary forwarded to the parent class.
        """
        # without this STEP.__init__ is not called
        super().__init__(config)
        templates_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
        # NOTE(review): autoescape is left at the Jinja2 default here, so the
        # template itself is responsible for escaping titles/metadata that
        # come from archived pages - confirm against html_template.html.
        self.environment = Environment(loader=FileSystemLoader(templates_dir))
        # JinjaHelper class static methods are added as filters
        self.environment.filters.update({
            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
        })
        self.template = self.environment.get_template("html_template.html")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options declared by this formatter."""
        return {
            "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
        }

    def format(self, item: Metadata) -> Media | None:
        """Render ``item`` to an HTML file and wrap that file in a ``Media``.

        Args:
            item: the archived item whose url, title, media and metadata are
                handed to the template.

        Returns:
            A ``Media`` pointing at the HTML file written into the directory
            given by ``ArchivingContext.get_tmp_dir()``, or ``None`` when
            ``item.is_empty()`` is true and nothing is rendered.
        """
        url = item.get_url()
        if item.is_empty():
            logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
            return None

        content = self.template.render(
            url=url,
            title=item.get_title(),
            media=item.media,
            metadata=item.metadata,
            version=__version__,
        )
        content = minify_html.minify(content, minify_js=False, minify_css=True)

        # a random suffix keeps files from different items apart in the tmp dir
        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        final_media = Media(filename=html_path, _mimetype="text/html")

        # NOTE(review): chunksize is passed as a float (1.6e7); confirm that
        # HashEnricher coerces it to int before using it.
        he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
        # truthiness check: skips an empty hash and also tolerates None
        if hd := he.calculate_hash(final_media.filename):
            final_media.set("hash", f"{he.algorithm}:{hd}")
        return final_media
2023-01-17 16:29:27 +00:00
2023-05-09 10:17:44 +00:00
# JINJA helper filters
2023-01-22 00:48:09 +00:00
class JinjaHelpers:
    """Collection of small static helpers used as Jinja2 template filters.

    Only ``staticmethod`` members are meant to live here: every static method
    found in this class' ``__dict__`` is registered as a filter under its own
    name by ``HtmlFormatter.__init__``.
    """

    @staticmethod
    def is_list(v) -> bool:
        """Return True when ``v`` is a ``list`` instance."""
        return isinstance(v, list)

    @staticmethod
    def is_video(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` is ``video/*``.

        Falsy input (``None``, ``""``) yields False instead of raising.
        """
        # match the top-level type, not a substring anywhere in the MIME type
        return bool(s) and (mimetypes.guess_type(s)[0] or "").startswith("video/")

    @staticmethod
    def is_image(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` is ``image/*``.

        Falsy input (``None``, ``""``) yields False instead of raising.
        """
        # a substring test would also accept e.g. application/x-apple-diskimage
        return bool(s) and (mimetypes.guess_type(s)[0] or "").startswith("image/")

    @staticmethod
    def is_audio(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` is ``audio/*``.

        Falsy input (``None``, ``""``) yields False instead of raising.
        """
        return bool(s) and (mimetypes.guess_type(s)[0] or "").startswith("audio/")

    @staticmethod
    def is_media(v) -> bool:
        """Return True when ``v`` is a ``Media`` instance."""
        return isinstance(v, Media)

    @staticmethod
    def get_extension(filename: str) -> str:
        """Return the last extension of ``filename`` including the dot, or ``""``."""
        return os.path.splitext(filename)[1]

    @staticmethod
    def quote(s: str) -> str:
        """Percent-encode ``s`` with ``urllib.parse.quote`` (``/`` is kept)."""
        # inside the function body `quote` resolves to the module-level import,
        # not to this method
        return quote(s)

    @staticmethod
    def json_dump_b64(d: dict) -> str:
        """Serialize ``d`` to indented JSON and return it base64-encoded.

        Values that ``json`` cannot serialize natively are converted with
        ``str``.
        """
        j = json.dumps(d, indent=4, default=str)
        return base64.b64encode(j.encode()).decode()