diff --git a/orchestrate.yaml b/orchestrate.yaml
new file mode 100644
index 0000000..9a4ec42
--- /dev/null
+++ b/orchestrate.yaml
@@ -0,0 +1,48 @@
+steps:
+  # only 1 feeder allowed
+  # a feeder could be in an "infinite loop", for example a gsheets_infinite feeder which holds -> this could be an easy logic addition by changing the for-each to while not feeder.done() if it becomes necessary
+  feeder: gsheets_feeder # default -> only expects URL from CLI
+  archivers: # order matters
+    - tiktok
+    - telethon
+    - twitter
+    - instagram
+    - webarchive # this way it runs as a failsafe only
+  enrichments:
+    - screenshot
+    - wacz
+    - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
+    - thumbnails
+  formatters:
+    - HTMLFormater
+    - PDFFormater
+  storages:
+    - local_storage
+    - s3
+  databases:
+    - gsheets_db
+    - mongo_db
+
+
+
+configurations:
+  gsheets_feeder:
+    - sheet: "Auto archiver"
+    - header: "" # defaults to 1 in GSheetsFeeder
+    - service_account: "secrets/service_account.json"
+  tiktok:
+    username: "abc"
+    password: "123"
+    token: "here"
+  screenshot:
+    width: 1280
+    height: 720
+  wacz:
+    profile: secrets/profile.tar.gz
+  webarchive:
+    api_key: "12345"
+  s3:
+    - bucket: 123
+    - region: "nyc3"
+    - cdn: "{region}{bucket}"
+
diff --git a/src/orchestrator.py b/src/orchestrator.py
new file mode 100644
index 0000000..30f7b3c
--- /dev/null
+++ b/src/orchestrator.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+from typing import Union, Dict
+from dataclasses import dataclass
+
+"""
+how not to couple the different pieces of logic
+due to the use of constants for the metadata keys?
+perhaps having methods on the Metadata level that can be used to fetch a limited number of
+keys, never using strings but rather methods?
+eg: m = Metadata()
+    m.get("screenshot") vs m.get_all()
+    m.get_url()
+    m.get_hash()
+    m.get_main_file().get_title()
+    m.get_screenshot() # this method should only exist because of the Screenshot Enricher
+    # maybe there is a way for Archivers and Enrichers and Storages to add their own methods
+    # which still raises the Q of how the database, e.g., knows they exist?
+    # maybe there's a function to fetch them all, and each Database can register whatever they get
+    # for eg the GoogleSheets will only register based on the available column names, it knows what it wants
+    # and if it's there: great, otherwise business as usual.
+    # and a MongoDatabase could register all data, for example.
+    #
+How are Orchestrators created? from a configuration file?
+    orchestrator = ArchivingOrchestrator(config)
+    # Config contains 1 URL, or URLs, from the command line
+    # OR a feeder which is described in the config file
+    # config.get_feeder() # if called as docker run --url "http...." then it uses the default feeder
+    # if config.yaml says config
+    orchestrator.start()
+
+
+Example applications:
+1. auto-archiver for GSheets
+2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
+3. archiver backend for a UI that implements a REST API, the API calls the CLI
+
+Cisticola considerations:
+1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
+2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
+"""
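+
+# --- editor sketch (not part of this change) ---------------------------------------------
+# The orchestrator below builds its pieces via Feeder.init(...), Archiver.init(...), etc.,
+# and one of its comments suggests resolving config names via Archiver.__subclasses__.
+# A minimal, hypothetical shape for that factory could be the following (all class and
+# attribute names here are assumptions, nothing in this PR defines them):
+#
+#   class Step:
+#       name: str = ""
+#
+#       @classmethod
+#       def init(cls, name: str, config: dict) -> "Step":
+#           # pick the direct subclass whose declared name matches the config entry
+#           for sub in cls.__subclasses__():
+#               if sub.name == name:
+#                   return sub(config)
+#           raise ValueError(f"no {cls.__name__} called {name!r}")
+#
+#   class Archiver(Step): ...
+#   class TiktokArchiver(Archiver): name = "tiktok"  # found by Archiver.init("tiktok", cfg)
+# ------------------------------------------------------------------------------------------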
+
+@dataclass
+class Metadata:
+    # does not handle files, only primitives
+    # the only piece of logic to handle files is the archiver, enricher, and storage
+    status: str
+    # title: str
+    # url: str
+    # hash: str
+    main_file: Metadata
+    metadata: Dict[str, Metadata]
+
+    @staticmethod
+    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
+        # should return a merged version of the Metadata
+        # will work for archived() and enriched()
+        # what if 2 metadatas contain the same keys? only one can remain!: overwrite_left
+        pass
+
+    def get(self, key) -> Union[Metadata, str]:
+        # goes through metadata and returns the Metadata available
+        pass
+
+    def as_json(self) -> str:
+        # converts all metadata and data into JSON
+        pass
+
+
+"""
+@dataclass
+class ArchiveResult:
+    # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
+    status: str
+    url: str
+    metadata: Metadata
+    # title, url, hash, other={}
+    # cdn_url: str = None
+    # thumbnail: str = None
+    # thumbnail_index: str = None
+    # duration: float = None
+    # title: str = None
+    # timestamp: datetime.datetime = None
+    # screenshot: str = None
+    # wacz: str = None
+    # hash: str = None
+    # media: list = field(default_factory=list)
+
+    def __init__(self) -> None: pass
+
+    def update(self, metadata) -> None:
+        # receive a Metadata instance and update itself with it!
+        pass
+
+    def as_json(self) -> str:
+        # converts all metadata and data into JSON
+        pass
+"""
+
+"""
+There is a Superclass for:
+    * Database (should_process)
+
+How can GSheets work? it needs to feed from a READER (GSheets Feeder)
+
+Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
+The context metadata should include a temporary folder (maybe a LocalStorage instance?)
+"""
+
+class ArchivingOrchestrator:
+    def __init__(self, config) -> None:
+        # in config.py we should test that the archivers exist and log mismatches (blocking execution)
+        # identify each formatter, storage, database, etc
+        self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
+
+        # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
+        # where does that update/processing happen? in config.py
+        # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
+        self.archivers = [
+            Archiver.init(a, config.get(a))
+            for a in config.archivers
+        ]
+
+        self.enrichments = [
+            Enrichment.init(e, config.get(e))
+            for e in config.enrichments
+        ]
+
+        self.formatters = [
+            Formatter.init(f, config.get(f))
+            for f in config.formatters
+        ]
+
+        self.storages = [
+            Storage.init(s, config.get(s))
+            for s in config.storages
+        ]
+
+        self.databases = [
+            Database.init(d, config.get(d))
+            for d in config.databases
+        ]
+
+        # these rules are checked in config.py
+        assert len(self.archivers) >= 1, "there needs to be at least one Archiver"
+
+    def feed(self, feeder: Feeder) -> list[ArchiveResult]:
+        for next in feeder:
+            self.archive(next)
+            # how does this handle the parameters like folder which can be different for each archiver?
+            # the storage needs to know where to archive!!
+            # solution: feeders have context: extra metadata that they can read or ignore,
+            # all of it should have sensible defaults (eg: folder)
+            # default feeder is a list with 1 element
+
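+    # --- editor sketch (not part of this change) -----------------------------------------
+    # feed() above only needs the feeder to be iterable over URLs; the "default feeder is
+    # a list with 1 element" case could look like the CLIFeeder mentioned in the module
+    # docstring (class and field names are assumptions, not defined in this PR):
+    #
+    #   class CLIFeeder(Feeder):
+    #       def __init__(self, config):
+    #           self.urls = [u for u in config.urls.split(",") if u]  # --urls="u1,u2"
+    #       def __iter__(self):
+    #           return iter(self.urls)
+    # ---------------------------------------------------------------------------------------
+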
+    def archive(self, url: str) -> Union[ArchiveResult, None]:
+        url = clear_url(url)
+        result = Metadata(url=url)
+
+
+        should_archive = True
+        for d in self.databases: should_archive &= d.should_process(url)
+        # should storages also be able to check?
+        for s in self.storages: should_archive &= s.should_process(url)
+
+        if not should_archive:
+            return None  # skipping
+
+        # signal to DB that archiving has started
+        for d in self.databases:
+            # are the databases the ones to decide whether to archive?
+            # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need to be scraped twice, whereas an instagram profile might. the archiver cannot decide from the link alone which parts to archive,
+            # instagram profile example: it would always re-archive everything
+            # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
+            if d.should_process(url):
+                d.started(url)
+            elif d.exists(url):
+                return d.fetch(url)
+            else:
+                print("Skipping url")
+                return
+
+        # vk, telethon, ...
+        for a in self.archivers:
+            # with automatic try/catch in download + archived (+ the other ops below)
+            # should the archivers come with the config already? are there configs which change at runtime?
+            # think not, so no need to pass config as parameter
+            # do they need to be refreshed with every execution?
+            # this is where the Hashes come from, the place with access to all content
+            # the archiver does not have access to storage
+            result.update(a.download(url))
+            if result.is_success(): break
+
+        # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
+        # should it call the HTMLgenerator as if it's not an enrichment?
+        # eg: if it is enabled: generates an HTML with all the returned media, should it include enrichments? yes
+        # then how to execute it last? should there also be post-processors? are there other examples?
+        # maybe as a PDF? or a Markdown file
+        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
+        for e in self.enrichments:
+            result.update(e.enrich(result))
+
+        # formatters, enrichers, and storages will sometimes look for specific properties: eg