# auto-archiver/src/auto_archiver/core/orchestrator.py

from __future__ import annotations
from typing import Union, Dict, List
from dataclasses import dataclass
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .media import Media
from .metadata import Metadata
import tempfile, time, traceback
from loguru import logger
"""
how not to couple the different pieces of logic
due to the use of constants for the metadata keys?
perhaps having methods on the Metadata level that can be used to fetch a limited number of
keys, never using strings but rather methods?
eg: m = Metadata()
m.get("screenshot") vs m.get_all()
m.get_url()
m.get_hash()
m.get_main_file().get_title()
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
# maybe there is a way for Archivers and Enrichers and Storages to add their own methdods
# which raises still the Q of how the database, eg., knows they exist?
# maybe there's a function to fetch them all, and each Database can register wathever they get
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
# and if it's there: great, otherwise business as usual.
# and a MongoDatabase could register all data, for example.
#
How are Orchestrators created? from a configuration file?
orchestrator = ArchivingOrchestrator(config)
# Config contains 1 URL, or URLs, from the command line
# OR a feeder which is described in the config file
# config.get_feeder() # if called as docker run --url "http...." then the uses the default filter
# if config.yaml says config
orchestrator.start()
Example applications:
1. auto-archiver for GSheets
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
3. archiver backend for a UI that implements a REST API, the API calls CLI
Cisticola considerations:
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        self.feeder: Feeder = config.feeder
        self.formatter: Formatter = config.formatter
        self.enrichers = config.enrichers
        self.archivers: List[Archiver] = config.archivers
        self.databases: List[Database] = config.databases
        self.storages: List[Storage] = config.storages
        for a in self.archivers: a.setup()

    def feed(self) -> None:
        for item in self.feeder:
            self.feed_item(item)

    def feed_item(self, item: Metadata) -> Metadata:
        print("ARCHIVING", item)
        try:
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                item.set_tmp_dir(tmp_dir)
                return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
            logger.warning(f"caught interrupt on {item=}")
            for d in self.databases: d.aborted(item)
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.databases: d.failed(item)
    # how does this handle parameters like `folder`, which can be different for each archiver?
    # the storage needs to know where to archive!!
    # solution: feeders have context: extra metadata that they can read or ignore,
    # all of it should have sensible defaults (eg: folder)
    # default feeder is a list with 1 element
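    # Illustrative sketch of the "feeder context" idea above (set_context/get_context are
    # assumed method names for this illustration, not necessarily the current Metadata API):
    #
    #   item = Metadata().set_url("https://example.com/post/1")
    #   item.set_context("folder", "investigation-42")          # set by the feeder
    #   folder = item.get_context("folder", default="default")  # read (or ignored) by a storage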
    def archive(self, result: Metadata) -> Union[Metadata, None]:
        url = result.get_url()
        # TODO: clean urls
        for a in self.archivers:
            url = a.clean_url(url)
        result.set_url(url)

        # should_archive = False
        # for d in self.databases: should_archive |= d.should_process(url)
        # should storages also be able to check?
        # for s in self.storages: should_archive |= s.should_process(url)
        # if not should_archive:
        #     print("skipping")
        #     return "skipping"

        # signal to the DBs that archiving has started
        # and propagate an already-archived result if one exists
        cached_result = None
        for d in self.databases:
            # should the databases decide whether to archive?
            # they can simply return True by default, otherwise they can avoid duplicates.
            # should this logic be more granular, for example at the archiver level? a tweet will
            # not need to be scraped twice, whereas an Instagram profile might, and the archiver
            # cannot decide from the link alone which parts to archive;
            # Instagram profile example: it would always re-archive everything
            # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
            d.started(result)
            if (local_result := d.fetch(result)):
                cached_result = (cached_result or Metadata()).merge(local_result)
        if cached_result and not cached_result.rearchivable:
            for d in self.databases:
                d.done(cached_result)
            return cached_result
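        # Commented sketch of the hash/key deduplication idea from the comment above
        # (DedupDatabase and self.lookup are assumptions for illustration, not existing classes):
        #
        #   import hashlib
        #
        #   class DedupDatabase(Database):
        #       def fetch(self, item: Metadata) -> Union[Metadata, None]:
        #           key = hashlib.sha256(item.get_url().encode()).hexdigest()
        #           return self.lookup(key)  # a previously archived Metadata, or None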
        # vk, telethon, ...
        for a in self.archivers:
            # with automatic try/catch in download + archived (+ the other ops below)
            # should the archivers come with the config already? are there configs which change at runtime?
            # think not, so there is no need to pass config as a parameter
            # do they need to be refreshed with every execution?
            # this is where the hashes come from, the place with access to all content
            # the archiver does not have access to storage
            # a.download(result) # TODO: refactor so there's no merge here
            logger.info(f"Trying archiver {a.name}")
            result.merge(a.download(result))
            # TODO: fix logic to halt when done
            if result.is_success(): break
        # what if an archiver returns multiple entries and one is to be part of the HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
        # eg: if it is enabled: generate an HTML page with all the returned media; should it include enrichers? yes
        # then how to execute it last? should there also be post-processors? are there other examples?
        # maybe as a PDF? or a Markdown file
        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
        for e in self.enrichers:
            e.enrich(result)
        # store media
        for s in self.storages:
            for m in result.media:
                s.store(m, result)  # modifies media
                # Media can be inside media properties, examples include transformations on original media
                for prop in m.properties.values():
                    if isinstance(prop, Media):
                        s.store(prop, result)
                    if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
                        for prop_media in prop:
                            s.store(prop_media, result)
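        # Example of such nesting, as a commented sketch (the filenames and the "thumbnail"/"frames"
        # property names are purely illustrative; it only assumes Media has a filename and a
        # properties dict, as the loop above implies): an enricher might attach derived Media to the
        # original one via properties, and the loop above makes sure they get stored too:
        #
        #   original = Media(filename="video.mp4")
        #   original.properties["thumbnail"] = Media(filename="video_thumb.jpg")
        #   original.properties["frames"] = [Media(filename="frame_000.jpg")]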
        # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
        # TODO: should there only be 1 formatter?
        # for f in self.formatters:
        #     result.merge(f.format(result))

        # final format and store it
        if (final_media := self.formatter.format(result)):
            for s in self.storages:
                s.store(final_media, result)
            result.set_final_media(final_media)

        # signal completion to databases (DBs, Google Sheets, CSV, ...)
        # a hash registration service could be one database: forensic archiving
        result.cleanup()
        for d in self.databases: d.done(result)
        return result
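
# A minimal usage sketch, assuming a config object that already wires up feeder, archivers,
# enrichers, databases, storages and formatter (how that wiring happens, e.g. from a YAML
# file or CLI flags, is outside this module; load_config is a hypothetical helper):
#
#   config = load_config("orchestration.yaml")
#   orchestrator = ArchivingOrchestrator(config)
#   orchestrator.feed()  # pulls items from the feeder and runs archive() on each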