Merge branch 'main' into youtubedlp-rewrite

pull/175/head
Patrick Robertson 2025-01-15 17:47:23 +01:00
commit 74cf1f5f23
10 zmienionych plików z 57 dodań i 900 usunięć

Wyświetl plik

@ -277,9 +277,9 @@ pytest -ra -v # or poetry run pytest -ra -v
#### Docker development #### Docker development
working with docker locally: working with docker locally:
* `docker build . -t auto-archiver` to build a local image * `docker compose up` to build the first time and run a local image with the settings in `secrets/orchestration.yaml`
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver --config secrets/orchestration.yaml` * To modify/pass additional command line args, use `docker compose run auto-archiver --config secrets/orchestration.yaml [OTHER ARGUMENTS]`
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive` * To rebuild after code changes, just pass the `--build` flag, e.g. `docker compose up --build`
manual release to docker hub manual release to docker hub

Wyświetl plik

@ -0,0 +1,16 @@
# Compose definition for building and running auto-archiver from the local checkout.
version: '3.8'
services:
  auto-archiver:
    # point to the local dockerfile
    build:
      context: .
      dockerfile: Dockerfile
    container_name: auto-archiver
    # Bind-mount the host's ./secrets and ./local_archive directories under /app
    # so the configuration file referenced by `command` is visible in the container.
    volumes:
      - ./secrets:/app/secrets
      - ./local_archive:/app/local_archive
    environment:
      # NOTE(review): presumably read by the application to detect that it is
      # running inside a container -- confirm against the code that consumes them.
      - WACZ_ENABLE_DOCKER=true
      - RUNNING_IN_DOCKER=true
    # Default arguments for the service; the path is relative to the container's
    # working directory (assumed to be /app, matching the mounts above -- TODO confirm).
    command: --config secrets/orchestration.yaml

902
poetry.lock wygenerowano

Plik diff jest za duży Load Diff

Wyświetl plik

@ -23,9 +23,7 @@ classifiers = [
dependencies = [ dependencies = [
"gspread (>=0.0.0)", "gspread (>=0.0.0)",
"argparse (>=0.0.0)",
"beautifulsoup4 (>=0.0.0)", "beautifulsoup4 (>=0.0.0)",
"tiktok-downloader (>=0.0.0)",
"bs4 (>=0.0.0)", "bs4 (>=0.0.0)",
"loguru (>=0.0.0)", "loguru (>=0.0.0)",
"ffmpeg-python (>=0.0.0)", "ffmpeg-python (>=0.0.0)",
@ -55,12 +53,10 @@ dependencies = [
"warcio (>=0.0.0)", "warcio (>=0.0.0)",
"jsonlines (>=0.0.0)", "jsonlines (>=0.0.0)",
"pysubs2 (>=0.0.0)", "pysubs2 (>=0.0.0)",
"minify-html (>=0.0.0)",
"retrying (>=0.0.0)", "retrying (>=0.0.0)",
"tsp-client (>=0.0.0)", "tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)", "certvalidator (>=0.0.0)",
"toml (>=0.10.2,<0.11.0)", "filetype (>=1.2.0,<2.0.0)",
"filetype (>=1.2.0,<2.0.0)"
] ]
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

Wyświetl plik

@ -1,5 +1,4 @@
import json, os, traceback import json, os, traceback
import tiktok_downloader
from loguru import logger from loguru import logger

Wyświetl plik

@ -21,7 +21,7 @@ class Metadata:
media: List[Media] = field(default_factory=list) media: List[Media] = field(default_factory=list)
def __post_init__(self): def __post_init__(self):
self.set("_processed_at", datetime.datetime.utcnow()) self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
""" """

Wyświetl plik

@ -65,7 +65,7 @@ class GsheetsDb(Database):
media: Media = item.get_final_media() media: Media = item.get_final_media()
if hasattr(media, "urls"): if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")) batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())

Wyświetl plik

@ -55,5 +55,5 @@ class MetaEnricher(Enricher):
def enrich_archive_duration(self, to_enrich): def enrich_archive_duration(self, to_enrich):
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ") logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
archive_duration = datetime.datetime.utcnow() - to_enrich.get("_processed_at") archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
to_enrich.set("archive_duration_seconds", archive_duration.seconds) to_enrich.set("archive_duration_seconds", archive_duration.seconds)

Wyświetl plik

@ -4,7 +4,7 @@ import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote from urllib.parse import quote
from loguru import logger from loguru import logger
import minify_html, json import json
import base64 import base64
from ..version import __version__ from ..version import __version__
@ -47,7 +47,6 @@ class HtmlFormatter(Formatter):
metadata=item.metadata, metadata=item.metadata,
version=__version__ version=__version__
) )
content = minify_html.minify(content, minify_js=False, minify_css=True)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html") html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf: with open(html_path, mode="w", encoding="utf-8") as outf:

Wyświetl plik

@ -0,0 +1,17 @@
import pytest
from .test_archiver_base import TestArchiverBase
from auto_archiver.archivers.tiktok_archiver import TiktokArchiver
class TestBlueskyArchiver(TestArchiverBase):
    """Download test for ``TiktokArchiver``.

    NOTE(review): the class name says "Bluesky" although ``archiver_class`` is
    ``TiktokArchiver``; this looks like a copy-paste leftover. The name is kept
    here so existing test IDs do not change -- consider renaming.
    """

    archiver_class = TiktokArchiver
    config = {}

    @pytest.mark.xfail(reason="Tiktok API is not working")
    @pytest.mark.download
    def test_download_video(self, make_item):
        """Download a single public TikTok video and expect a successful result.

        ``make_item`` is a fixture used to build the item passed to the archiver;
        ``self.archiver`` is presumably provided by ``TestArchiverBase`` from
        ``archiver_class`` and ``config`` -- verify against the base class.
        """
        # cat video
        video_url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en"
        result = self.archiver.download(make_item(video_url))
        assert result.success