kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge branch 'main' into youtubedlp-rewrite
commit
74cf1f5f23
|
@ -277,9 +277,9 @@ pytest -ra -v # or poetry run pytest -ra -v
|
||||||
|
|
||||||
#### Docker development
|
#### Docker development
|
||||||
working with docker locally:
|
working with docker locally:
|
||||||
* `docker build . -t auto-archiver` to build a local image
|
* `docker compose up` to build the first time and run a local image with the settings in `secrets/orchestration.yaml`
|
||||||
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver --config secrets/orchestration.yaml`
|
* To modify/pass additional command line args, use `docker compose run auto-archiver --config secrets/orchestration.yaml [OTHER ARGUMENTS]`
|
||||||
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
* To rebuild after code changes, just pass the `--build` flag, e.g. `docker compose up --build`
|
||||||
|
|
||||||
|
|
||||||
manual release to docker hub
|
manual release to docker hub
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
auto-archiver:
|
||||||
|
# point to the local dockerfile
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: auto-archiver
|
||||||
|
volumes:
|
||||||
|
- ./secrets:/app/secrets
|
||||||
|
- ./local_archive:/app/local_archive
|
||||||
|
environment:
|
||||||
|
- WACZ_ENABLE_DOCKER=true
|
||||||
|
- RUNNING_IN_DOCKER=true
|
||||||
|
command: --config secrets/orchestration.yaml
|
Plik diff jest za duży
Load Diff
|
@ -23,9 +23,7 @@ classifiers = [
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"gspread (>=0.0.0)",
|
"gspread (>=0.0.0)",
|
||||||
"argparse (>=0.0.0)",
|
|
||||||
"beautifulsoup4 (>=0.0.0)",
|
"beautifulsoup4 (>=0.0.0)",
|
||||||
"tiktok-downloader (>=0.0.0)",
|
|
||||||
"bs4 (>=0.0.0)",
|
"bs4 (>=0.0.0)",
|
||||||
"loguru (>=0.0.0)",
|
"loguru (>=0.0.0)",
|
||||||
"ffmpeg-python (>=0.0.0)",
|
"ffmpeg-python (>=0.0.0)",
|
||||||
|
@ -55,12 +53,10 @@ dependencies = [
|
||||||
"warcio (>=0.0.0)",
|
"warcio (>=0.0.0)",
|
||||||
"jsonlines (>=0.0.0)",
|
"jsonlines (>=0.0.0)",
|
||||||
"pysubs2 (>=0.0.0)",
|
"pysubs2 (>=0.0.0)",
|
||||||
"minify-html (>=0.0.0)",
|
|
||||||
"retrying (>=0.0.0)",
|
"retrying (>=0.0.0)",
|
||||||
"tsp-client (>=0.0.0)",
|
"tsp-client (>=0.0.0)",
|
||||||
"certvalidator (>=0.0.0)",
|
"certvalidator (>=0.0.0)",
|
||||||
"toml (>=0.10.2,<0.11.0)",
|
"filetype (>=1.2.0,<2.0.0)",
|
||||||
"filetype (>=1.2.0,<2.0.0)"
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import json, os, traceback
|
import json, os, traceback
|
||||||
import tiktok_downloader
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ class Metadata:
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
self.set("_processed_at", datetime.datetime.utcnow())
|
self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
|
||||||
|
|
||||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -65,7 +65,7 @@ class GsheetsDb(Database):
|
||||||
media: Media = item.get_final_media()
|
media: Media = item.get_final_media()
|
||||||
if hasattr(media, "urls"):
|
if hasattr(media, "urls"):
|
||||||
batch_if_valid('archive', "\n".join(media.urls))
|
batch_if_valid('archive', "\n".join(media.urls))
|
||||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||||
batch_if_valid('title', item.get_title())
|
batch_if_valid('title', item.get_title())
|
||||||
batch_if_valid('text', item.get("content", ""))
|
batch_if_valid('text', item.get("content", ""))
|
||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
|
|
|
@ -55,5 +55,5 @@ class MetaEnricher(Enricher):
|
||||||
def enrich_archive_duration(self, to_enrich):
|
def enrich_archive_duration(self, to_enrich):
|
||||||
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
|
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
|
||||||
|
|
||||||
archive_duration = datetime.datetime.utcnow() - to_enrich.get("_processed_at")
|
archive_duration = datetime.datetime.now(datetime.timezone.utc) - to_enrich.get("_processed_at")
|
||||||
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
to_enrich.set("archive_duration_seconds", archive_duration.seconds)
|
|
@ -4,7 +4,7 @@ import mimetypes, os, pathlib
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import minify_html, json
|
import json
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from ..version import __version__
|
from ..version import __version__
|
||||||
|
@ -47,7 +47,6 @@ class HtmlFormatter(Formatter):
|
||||||
metadata=item.metadata,
|
metadata=item.metadata,
|
||||||
version=__version__
|
version=__version__
|
||||||
)
|
)
|
||||||
content = minify_html.minify(content, minify_js=False, minify_css=True)
|
|
||||||
|
|
||||||
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
|
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
|
||||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from .test_archiver_base import TestArchiverBase
|
||||||
|
from auto_archiver.archivers.tiktok_archiver import TiktokArchiver
|
||||||
|
|
||||||
|
class TestBlueskyArchiver(TestArchiverBase):
|
||||||
|
|
||||||
|
archiver_class = TiktokArchiver
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
@pytest.mark.xfail(reason="Tiktok API is not working")
|
||||||
|
@pytest.mark.download
|
||||||
|
def test_download_video(self, make_item):
|
||||||
|
# cat video
|
||||||
|
url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en"
|
||||||
|
item = self.archiver.download(make_item(url))
|
||||||
|
assert item.success
|
Ładowanie…
Reference in New Issue