mirrored from https://github.com/bellingcat/auto-archiver
orchestrator design structure
parent 04263094ad
commit 6a0ce5ced1

@@ -0,0 +1,48 @@
steps:
  # only 1 feeder allowed
  # a feeder could be in an "infinite loop", for example a gsheets_infinite feeder that keeps waiting -> this could be an easy logic addition, changing the "for each" into "while not feeder.done()" if it becomes necessary
  feeder: gsheets_feeder # default -> only expects URL from CLI
  archivers: # order matters
    - tiktok
    - telethon
    - twitter
    - instagram
    - webarchive # this way it runs as a failsafe only
  enrichments:
    - screenshot
    - wacz
    - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
    - thumbnails
  formatters:
    - HTMLFormatter
    - PDFFormatter
  storages:
    - local_storage
    - s3
  databases:
    - gsheets_db
    - mongo_db

configurations:
  gsheets_feeder:
    sheet: "Auto archiver"
    header: "" # defaults to 1 in GSheetsFeeder
    service_account: "secrets/service_account.json"
  tiktok:
    username: "abc"
    password: "123"
    token: "here"
  screenshot:
    width: 1280
    height: 720
  wacz:
    profile: secrets/profile.tar.gz
  webarchive:
    api_key: "12345"
  s3:
    bucket: 123
    region: "nyc3"
    cdn: "{region}{bucket}"

@@ -0,0 +1,215 @@

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Union

"""
|
||||||
|
how not to couple the different pieces of logic
|
||||||
|
due to the use of constants for the metadata keys?
|
||||||
|
perhaps having methods on the Metadata level that can be used to fetch a limited number of
|
||||||
|
keys, never using strings but rather methods?
|
||||||
|
eg: m = Metadata()
|
||||||
|
m.get("screenshot") vs m.get_all()
|
||||||
|
m.get_url()
|
||||||
|
m.get_hash()
|
||||||
|
m.get_main_file().get_title()
|
||||||
|
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
|
||||||
|
# maybe there is a way for Archivers and Enrichers and Storages to add their own methdods
|
||||||
|
# which raises still the Q of how the database, eg., knows they exist?
|
||||||
|
# maybe there's a function to fetch them all, and each Database can register wathever they get
|
||||||
|
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
|
||||||
|
# and if it's there: great, otherwise business as usual.
|
||||||
|
# and a MongoDatabase could register all data, for example.
|
||||||
|
#
|
||||||
|
How are Orchestrators created? from a configuration file?
|
||||||
|
orchestrator = ArchivingOrchestrator(config)
|
||||||
|
# Config contains 1 URL, or URLs, from the command line
|
||||||
|
# OR a feeder which is described in the config file
|
||||||
|
# config.get_feeder() # if called as docker run --url "http...." then the uses the default filter
|
||||||
|
# if config.yaml says config
|
||||||
|
orchestrator.start()
|
||||||
|
|
||||||
|
|
||||||
|
Example applications:
|
||||||
|
1. auto-archiver for GSheets
|
||||||
|
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
|
||||||
|
3. archiver backend for a UI that implements a REST API, the API calls CLI
|
||||||
|
|
||||||
|
Cisticola considerations:
|
||||||
|
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
|
||||||
|
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||||
|
"""
|
||||||
|
|
||||||
|
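# --- Sketch (not part of the design above): a minimal Feeder / CLIFeeder, only to make the
# feeder idea concrete. The class names follow the notes (Feeder, CLIFeeder); the
# __iter__/done() interface is an assumption, eg. done() would support the
# "while not feeder.done()" variant mentioned in the orchestration YAML.
# The .init(name, config) factory used by the orchestrator below is omitted here.
class Feeder:
    def __iter__(self):
        # yields one URL (or one item of context) at a time
        raise NotImplementedError

    def done(self) -> bool:
        # optional hook for "infinite" feeders such as a gsheets_infinite feeder
        return True


class CLIFeeder(Feeder):
    # default feeder: a list with 1 (or a few) elements coming from --urls="u1,u2"
    def __init__(self, urls: str) -> None:
        self.urls = [u.strip() for u in urls.split(",") if u.strip()]

    def __iter__(self):
        yield from self.urls
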
@dataclass
class Metadata:
    # does not handle files, only primitives
    # the only pieces of logic that handle files are the archiver, enricher, and storage
    status: str
    # title: str
    # url: str
    # hash: str
    main_file: Metadata
    metadata: Dict[str, Metadata]

    @staticmethod
    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        # should return a merged version of the Metadata
        # will work for archived() and enriched()
        # what if 2 metadatas contain the same keys? only one can remain!: overwrite_left
        pass

    def get(self, key) -> Union[Metadata, str]:
        # goes through metadata and returns the Metadata available
        pass

    def as_json(self) -> str:
        # converts all metadata and data into JSON
        pass

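# --- Sketch: one possible reading of Metadata.merge's overwrite_left flag (the method above
# is intentionally left as a stub). Shown on plain dicts only to illustrate the semantics;
# the real method would operate on Metadata.metadata and also reconcile status/main_file.
def _merge_dicts(left: Dict[str, Metadata], right: Dict[str, Metadata], overwrite_left: bool = True) -> Dict[str, Metadata]:
    # when both sides contain the same key only one value can remain:
    # overwrite_left=True means the right-hand value wins, otherwise the left one is kept
    merged = dict(left)
    for key, value in right.items():
        if overwrite_left or key not in merged:
            merged[key] = value
    return merged
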
"""
|
||||||
|
@dataclass
|
||||||
|
class ArchiveResult:
|
||||||
|
# maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
|
||||||
|
status: str
|
||||||
|
url: str
|
||||||
|
metadata: Metadata
|
||||||
|
# title, url, hash, other={}
|
||||||
|
# cdn_url: str = None
|
||||||
|
# thumbnail: str = None
|
||||||
|
# thumbnail_index: str = None
|
||||||
|
# duration: float = None
|
||||||
|
# title: str = None
|
||||||
|
# timestamp: datetime.datetime = None
|
||||||
|
# screenshot: str = None
|
||||||
|
# wacz: str = None
|
||||||
|
# hash: str = None
|
||||||
|
# media: list = field(default_factory=list)
|
||||||
|
|
||||||
|
def __init__(self) -> None: pass
|
||||||
|
|
||||||
|
def update(self, metadata) -> None:
|
||||||
|
# receive a Metadata instance and update itself with it!
|
||||||
|
pass
|
||||||
|
|
||||||
|
def as_json(self) -> str:
|
||||||
|
# converts all metadata and data into JSON
|
||||||
|
pass
|
||||||
|
"""
|
||||||
|
|
||||||
|
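# --- Sketch: one possible answer to the "how does a database know which keys exist?" question
# from the notes at the top. Both helpers are hypothetical illustrations, not part of the design:
# a GSheets-style database only picks the metadata keys matching its column names, while a
# Mongo-style database would simply register everything.
def _pick_for_gsheets(result: Metadata, column_names: list) -> Dict[str, Union[Metadata, str]]:
    # register based on the available column names: if a key is there, great,
    # otherwise business as usual
    return {col: result.get(col) for col in column_names if result.get(col) is not None}


def _pick_for_mongo(result: Metadata) -> Dict[str, Metadata]:
    # a MongoDatabase could register all data
    return dict(result.metadata)
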
"""
|
||||||
|
There is a Superclass for:
|
||||||
|
* Database (should_process)
|
||||||
|
|
||||||
|
How can GSheets work? it needs to feed from a READER (GSheets Feeder)
|
||||||
|
|
||||||
|
Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
|
||||||
|
The context metadata should include a temporary folder (maybe a LocalStorage instance?)
|
||||||
|
"""
|
||||||
|
|
||||||
|
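# --- Sketch: what the Database superclass mentioned above might look like. The method names
# (init, should_process, started, exists, fetch, done) are the ones the orchestrator below
# calls; the __subclasses__-based lookup follows the "use Archiver.__subclasses__" note and is
# an assumption, not a settled design.
class Database:
    name = "database"

    @classmethod
    def init(cls, name: str, config: dict) -> "Database":
        # find the concrete subclass registered under `name` (eg: gsheets_db, mongo_db)
        for sub in cls.__subclasses__():
            if sub.name == name:
                return sub(config or {})
        raise ValueError(f"unknown database: {name}")

    def __init__(self, config: dict) -> None:
        self.config = config

    def should_process(self, url: str) -> bool: return True  # default: archive everything
    def started(self, url: str) -> None: pass                 # signal that archiving began
    def exists(self, url: str) -> bool: return False          # already archived?
    def fetch(self, url: str): return None                    # return the previously archived result
    def done(self, result) -> None: pass                      # register the final result
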
class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        # in config.py we should test that the archivers exist and log mismatches (blocking execution)
        # identify each formatter, storage, database, etc
        self.feeder = Feeder.init(config.feeder, config.get(config.feeder))

        # Is it possible to overwrite config.yaml values? it could be useful: share a config file and modify gsheets_feeder.sheet via the CLI
        # where does that update/processing happen? in config.py
        # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
        self.archivers = [
            Archiver.init(a, config.get(a))
            for a in config.archivers
        ]

        self.enrichments = [
            Enrichment.init(e, config.get(e))
            for e in config.enrichments
        ]

        self.formatters = [
            Formatter.init(f, config.get(f))
            for f in config.formatters
        ]

        self.storages = [
            Storage.init(s, config.get(s))
            for s in config.storages
        ]

        self.databases = [
            Database.init(d, config.get(d))
            for d in config.databases
        ]

        # these rules are checked in config.py
        assert len(self.archivers) >= 1, "there needs to be at least one Archiver"

    def feed(self, feeder: Feeder) -> list[ArchiveResult]:
        for next_item in feeder:
            self.archive(next_item)
            # how does this handle parameters like folder, which can be different for each archiver?
            # the storage needs to know where to archive!!
            # solution: feeders have context: extra metadata that they can read or ignore,
            # all of it should have sensible defaults (eg: folder)
            # default feeder is a list with 1 element

    def archive(self, url: str) -> Union[ArchiveResult, None]:
        url = clear_url(url)
        result = Metadata(url=url)

        should_archive = True
        for d in self.databases: should_archive &= d.should_process(url)
        # should storages also be able to check?
        for s in self.storages: should_archive &= s.should_process(url)

        if not should_archive:
            return "skipping"

        # signal to the DBs that archiving has started
        for d in self.databases:
            # are the databases the ones to decide whether to archive?
            # they can simply return True by default, otherwise they can avoid duplicates.
            # should this logic be more granular, for example on the archiver level: a tweet will not need to be scraped twice,
            # whereas an instagram profile might; the archiver cannot decide from the link alone which parts to archive,
            # instagram profile example: it would always re-archive everything
            # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
            if d.should_process(url):
                d.started(url)
            elif d.exists(url):
                return d.fetch(url)
            else:
                print("Skipping url")
                return

        # vk, telethon, ...
        for a in self.archivers:
            # with automatic try/catch in download + archived (+ the other ops below)
            # should the archivers come with the config already? are there configs which change at runtime?
            # think not, so no need to pass config as a parameter
            # do they need to be refreshed with every execution?
            # this is where the hashes come from, the place with access to all content
            # the archiver does not have access to storage
            result.update(a.download(url))
            if result.is_success(): break

        # what if an archiver returns multiple entries and one is to be part of the HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
        # eg: if it is enabled: it generates an HTML with all the returned media; should it include enrichments? yes
        # then how to execute it last? should there also be post-processors? are there other examples?
        # maybe as a PDF? or a Markdown file
        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
        for e in self.enrichments:
            result.update(e.enrich(result))

        # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"></li>
        for p in self.formatters:
            result.update(p.process(result))

        # storages
        for s in self.storages:
            for m in result.media:
                m.update(s.store(m))

        # signal completion to the databases (DBs, Google Sheets, CSV, ...)
        # a hash registration service could be one database: forensic archiving
        for d in self.databases: d.done(result)

        return result
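
# --- Sketch: how the orchestrator could be created from the orchestration YAML above.
# The Config class here is hypothetical (the notes say this logic lives in config.py); only the
# "steps" / "configurations" layout mirrors the YAML file in this commit. Requires PyYAML.
def _example_boot(path: str = "orchestration.yaml") -> ArchivingOrchestrator:
    import yaml

    with open(path) as f:
        raw = yaml.safe_load(f)

    class Config:
        def __init__(self, raw: dict) -> None:
            steps = raw.get("steps", {})
            self.feeder = steps.get("feeder", "cli_feeder")
            self.archivers = steps.get("archivers", [])
            self.enrichments = steps.get("enrichments", [])
            self.formatters = steps.get("formatters", [])
            self.storages = steps.get("storages", [])
            self.databases = steps.get("databases", [])
            self.configurations = raw.get("configurations", {})

        def get(self, name: str) -> dict:
            # per-step configuration, eg: configurations.gsheets_feeder
            return self.configurations.get(name, {})

    return ArchivingOrchestrator(Config(raw))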