# auto-archiver/src/auto_archiver/core/orchestrator.py

from __future__ import annotations
from typing import Generator, Union, List
from .context import ArchivingContext
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .metadata import Metadata
import tempfile, traceback
from loguru import logger


class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        self.feeder: Feeder = config.feeder
        self.formatter: Formatter = config.formatter
        self.enrichers: List[Enricher] = config.enrichers
        self.archivers: List[Archiver] = config.archivers
        self.databases: List[Database] = config.databases
        self.storages: List[Storage] = config.storages
        ArchivingContext.set("storages", self.storages, keep_on_reset=True)

        for a in self.archivers: a.setup()

    def feed(self) -> Generator[Metadata, None, None]:
        for item in self.feeder:
            yield self.feed_item(item)

    def feed_item(self, item: Metadata) -> Union[Metadata, None]:
        try:
            ArchivingContext.reset()
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                ArchivingContext.set_tmp_dir(tmp_dir)
                return self.archive(item)
        except KeyboardInterrupt:
            # catch keyboard interruptions for a clean exit
            logger.warning(f"caught interrupt on {item=}")
            for d in self.databases: d.aborted(item)
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.databases: d.failed(item)

    # how does this handle parameters like folder, which can differ per archiver?
    # the storage needs to know where to archive!
    # solution: feeders have context: extra metadata that they can read or ignore,
    # all of which should have sensible defaults (eg: folder)
    # the default feeder is a list with a single element
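    #
    # Illustrative sketch (an assumption, not part of this file): a minimal
    # Feeder that attaches such context. `Metadata.set_url` and `.set` are used
    # below in archive(); the `urls` attribute and the "folder" key are hypothetical.
    #
    #   class ListFeeder(Feeder):
    #       name = "list_feeder"
    #       urls = ["https://example.com"]
    #       def __iter__(self):
    #           for url in self.urls:
    #               item = Metadata()
    #               item.set_url(url)
    #               item.set("folder", "default")  # context the storages may read
    #               yield item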

    def archive(self, result: Metadata) -> Union[Metadata, None]:
        original_url = result.get_url()

        # 1 - cleanup
        # each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
        for a in self.archivers: url = a.sanitize_url(url)
        result.set_url(url)
        if original_url != url: result.set("original_url", original_url)

        # 2 - notify the DBs that archiving has started
        # and propagate an already-archived entry if one exists
        cached_result = None
        for d in self.databases:
            # should the databases decide whether to archive?
            # they can simply return True by default, otherwise they can avoid duplicates;
            # should this logic be more granular, eg. at the archiver level? a tweet does
            # not need to be scraped twice, whereas an instagram profile might, and the
            # archiver cannot tell from the link alone which parts to archive, so an
            # instagram profile would always be re-archived in full.
            # maybe the database/storage could use a hash/key to decide whether to
            # re-archive (see the Database sketch below)
            d.started(result)
            if (local_result := d.fetch(result)):
                cached_result = (cached_result or Metadata()).merge(local_result)

        if cached_result:
            logger.debug("Found a previously archived entry")
            for d in self.databases:
                d.done(cached_result, cached=True)
            return cached_result
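
        # Illustrative sketch (an assumption, not part of this file): the Database
        # contract exercised in this class: started/fetch here, done below, and
        # failed/aborted in feed_item; fetch returns a previously archived
        # Metadata, or a falsy value to force a fresh archive.
        #
        #   class ExampleDatabase(Database):
        #       name = "example_db"
        #       def started(self, item: Metadata) -> None: ...
        #       def failed(self, item: Metadata) -> None: ...
        #       def aborted(self, item: Metadata) -> None: ...
        #       def fetch(self, item: Metadata) -> Union[Metadata, None]: return None
        #       def done(self, item: Metadata, cached: bool = False) -> None: ...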

        # 3 - call archivers until one succeeds
        for a in self.archivers:
            logger.info(f"Trying archiver {a.name} for {url}")
            try:
                # Q: should this be refactored so it's just a.download(result)?
                result.merge(a.download(result))
                if result.is_success(): break
            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
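
        # Illustrative sketch (an assumption, not part of this file): the Archiver
        # contract this loop relies on: `setup` (called in __init__), `sanitize_url`
        # (step 1), and `download`, whose return value is merged into `result`.
        #
        #   class ExampleArchiver(Archiver):
        #       name = "example_archiver"
        #       def setup(self) -> None: ...
        #       def sanitize_url(self, url: str) -> str: return url
        #       def download(self, item: Metadata) -> Union[Metadata, None]:
        #           ...  # fetch item.get_url(), add Media, set a success status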

        # what if an archiver returns multiple entries and one is meant for the HTMLgenerator?
        # should it call the HTMLgenerator as if it were not an enrichment?
        # eg: if enabled, it generates an HTML page with all the returned media; should it include enrichers? yes
        # but then how to make it run last? should there also be post-processors? are there other examples?
        # maybe a PDF? or a Markdown file?

        # 4 - call enrichers: they have access to the archived content and can generate metadata and Media
        # eg: screenshot, wacz, webarchive, thumbnails
        for e in self.enrichers:
            try: e.enrich(result)
            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
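
        # Illustrative sketch (an assumption, not part of this file): an Enricher
        # only needs a `name` and an `enrich` method that mutates the Metadata
        # in place; the key and value below are hypothetical.
        #
        #   class ExampleEnricher(Enricher):
        #       name = "example_enricher"
        #       def enrich(self, to_enrich: Metadata) -> None:
        #           to_enrich.set("example_note", "added by enricher")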

        # 5 - store media
        # looks for Media in result.media and also in result.media[x].properties (as list or dict values)
        result.store()

        # 6 - format, and store the formatted output if needed
        # enrichers typically need access to already stored URLs etc
        if (final_media := self.formatter.format(result)):
            final_media.store(url=url)
            result.set_final_media(final_media)

        if result.is_empty():
            result.status = "nothing archived"

        # signal completion to the databases (DBs, Google Sheets, CSV, ...)
        for d in self.databases: d.done(result)
        return result
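

# Illustrative usage sketch (an assumption, not part of this file): how a caller
# might drive the orchestrator. The config object only needs the attributes read
# in __init__ above; `load_config` and the YAML file name are hypothetical.
#
#   config = load_config("orchestration.yaml")
#   orchestrator = ArchivingOrchestrator(config)
#   for result in orchestrator.feed():
#       if result: logger.info(f"{result.get_url()} -> {result.status}")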