mirrored from https://github.com/bellingcat/auto-archiver
orchestrator design structure
parent 04263094ad
commit 6a0ce5ced1

@@ -0,0 +1,48 @@
steps:
  # only 1 feeder allowed
  # a feeder could be in an "infinite loop", for example a gsheets_infinite feeder that keeps waiting -> this could be an easy logic addition, changing the "for each" into "while not feeder.done()" if it becomes necessary
  feeder: gsheets_feeder # default -> only expects URL from CLI
  archivers: # order matters
    - tiktok
    - telethon
    - twitter
    - instagram
    - webarchive # this way it runs as a failsafe only
  enrichments:
    - screenshot
    - wacz
    - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
    - thumbnails
  formatters:
    - HTMLFormatter
    - PDFFormatter
  storages:
    - local_storage
    - s3
  databases:
    - gsheets_db
    - mongo_db

configurations:
  gsheets_feeder:
    sheet: "Auto archiver"
    header: "" # defaults to 1 in GSheetsFeeder
    service_account: "secrets/service_account.json"
  tiktok:
    username: "abc"
    password: "123"
    token: "here"
  screenshot:
    width: 1280
    height: 720
  wacz:
    profile: secrets/profile.tar.gz
  webarchive:
    api_key: "12345"
  s3:
    bucket: 123
    region: "nyc3"
    cdn: "{region}{bucket}"

@@ -0,0 +1,215 @@

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Union

"""
|
||||||
|
how not to couple the different pieces of logic
|
||||||
|
due to the use of constants for the metadata keys?
|
||||||
|
perhaps having methods on the Metadata level that can be used to fetch a limited number of
|
||||||
|
keys, never using strings but rather methods?
|
||||||
|
eg: m = Metadata()
|
||||||
|
m.get("screenshot") vs m.get_all()
|
||||||
|
m.get_url()
|
||||||
|
m.get_hash()
|
||||||
|
m.get_main_file().get_title()
|
||||||
|
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
|
||||||
|
# maybe there is a way for Archivers and Enrichers and Storages to add their own methdods
|
||||||
|
# which raises still the Q of how the database, eg., knows they exist?
|
||||||
|
# maybe there's a function to fetch them all, and each Database can register wathever they get
|
||||||
|
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
|
||||||
|
# and if it's there: great, otherwise business as usual.
|
||||||
|
# and a MongoDatabase could register all data, for example.
|
||||||
|
#
|
||||||
|
How are Orchestrators created? from a configuration file?
|
||||||
|
orchestrator = ArchivingOrchestrator(config)
|
||||||
|
# Config contains 1 URL, or URLs, from the command line
|
||||||
|
# OR a feeder which is described in the config file
|
||||||
|
# config.get_feeder() # if called as docker run --url "http...." then the uses the default filter
|
||||||
|
# if config.yaml says config
|
||||||
|
orchestrator.start()
|
||||||
|
|
||||||
|
|
||||||
|
Example applications:
|
||||||
|
1. auto-archiver for GSheets
|
||||||
|
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
|
||||||
|
3. archiver backend for a UI that implements a REST API, the API calls CLI
|
||||||
|
|
||||||
|
Cisticola considerations:
|
||||||
|
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
|
||||||
|
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||||
|
"""
|
||||||
|
|
||||||
|
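# --- Sketch (not part of the design above): a minimal Feeder / CLIFeeder, only to make the
# feeder idea concrete. The class names follow the notes (Feeder, CLIFeeder); the
# __iter__/done() interface is an assumption, eg. done() would support the
# "while not feeder.done()" variant mentioned in the orchestration YAML.
# The .init(name, config) factory used by the orchestrator below is omitted here.
class Feeder:
    def __iter__(self):
        # yields one URL (or one item of context) at a time
        raise NotImplementedError

    def done(self) -> bool:
        # optional hook for "infinite" feeders such as a gsheets_infinite feeder
        return True


class CLIFeeder(Feeder):
    # default feeder: a list with 1 (or a few) elements coming from --urls="u1,u2"
    def __init__(self, urls: str) -> None:
        self.urls = [u.strip() for u in urls.split(",") if u.strip()]

    def __iter__(self):
        yield from self.urls
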
@dataclass
class Metadata:
    # does not handle files, only primitives
    # the only pieces of logic that handle files are the archiver, enricher, and storage
    status: str
    # title: str
    # url: str
    # hash: str
    main_file: Metadata
    metadata: Dict[str, Metadata]

    @staticmethod
    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        # should return a merged version of the Metadata
        # will work for archived() and enriched()
        # what if 2 metadatas contain the same keys? only one can remain!: overwrite_left
        pass

    def get(self, key) -> Union[Metadata, str]:
        # goes through metadata and returns the Metadata available
        pass

    def as_json(self) -> str:
        # converts all metadata and data into JSON
        pass

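# --- Sketch: one possible reading of Metadata.merge's overwrite_left flag (the method above
# is intentionally left as a stub). Shown on plain dicts only to illustrate the semantics;
# the real method would operate on Metadata.metadata and also reconcile status/main_file.
def _merge_dicts(left: Dict[str, Metadata], right: Dict[str, Metadata], overwrite_left: bool = True) -> Dict[str, Metadata]:
    # when both sides contain the same key only one value can remain:
    # overwrite_left=True means the right-hand value wins, otherwise the left one is kept
    merged = dict(left)
    for key, value in right.items():
        if overwrite_left or key not in merged:
            merged[key] = value
    return merged
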
"""
|
||||||
|
@dataclass
|
||||||
|
class ArchiveResult:
|
||||||
|
# maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
|
||||||
|
status: str
|
||||||
|
url: str
|
||||||
|
metadata: Metadata
|
||||||
|
# title, url, hash, other={}
|
||||||
|
# cdn_url: str = None
|
||||||
|
# thumbnail: str = None
|
||||||
|
# thumbnail_index: str = None
|
||||||
|
# duration: float = None
|
||||||
|
# title: str = None
|
||||||
|
# timestamp: datetime.datetime = None
|
||||||
|
# screenshot: str = None
|
||||||
|
# wacz: str = None
|
||||||
|
# hash: str = None
|
||||||
|
# media: list = field(default_factory=list)
|
||||||
|
|
||||||
|
def __init__(self) -> None: pass
|
||||||
|
|
||||||
|
def update(self, metadata) -> None:
|
||||||
|
# receive a Metadata instance and update itself with it!
|
||||||
|
pass
|
||||||
|
|
||||||
|
def as_json(self) -> str:
|
||||||
|
# converts all metadata and data into JSON
|
||||||
|
pass
|
||||||
|
"""
|
||||||
|
|
||||||
|
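# --- Sketch: one possible answer to the "how does a database know which keys exist?" question
# from the notes at the top. Both helpers are hypothetical illustrations, not part of the design:
# a GSheets-style database only picks the metadata keys matching its column names, while a
# Mongo-style database would simply register everything.
def _pick_for_gsheets(result: Metadata, column_names: list) -> Dict[str, Union[Metadata, str]]:
    # register based on the available column names: if a key is there, great,
    # otherwise business as usual
    return {col: result.get(col) for col in column_names if result.get(col) is not None}


def _pick_for_mongo(result: Metadata) -> Dict[str, Metadata]:
    # a MongoDatabase could register all data
    return dict(result.metadata)
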
"""
|
||||||
|
There is a Superclass for:
|
||||||
|
* Database (should_process)
|
||||||
|
|
||||||
|
How can GSheets work? it needs to feed from a READER (GSheets Feeder)
|
||||||
|
|
||||||
|
Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
|
||||||
|
The context metadata should include a temporary folder (maybe a LocalStorage instance?)
|
||||||
|
"""
|
||||||
|
|
||||||
|
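# --- Sketch: what the Database superclass mentioned above might look like. The method names
# (init, should_process, started, exists, fetch, done) are the ones the orchestrator below
# calls; the __subclasses__-based lookup follows the "use Archiver.__subclasses__" note and is
# an assumption, not a settled design.
class Database:
    name = "database"

    @classmethod
    def init(cls, name: str, config: dict) -> "Database":
        # find the concrete subclass registered under `name` (eg: gsheets_db, mongo_db)
        for sub in cls.__subclasses__():
            if sub.name == name:
                return sub(config or {})
        raise ValueError(f"unknown database: {name}")

    def __init__(self, config: dict) -> None:
        self.config = config

    def should_process(self, url: str) -> bool: return True  # default: archive everything
    def started(self, url: str) -> None: pass                 # signal that archiving began
    def exists(self, url: str) -> bool: return False          # already archived?
    def fetch(self, url: str): return None                    # return the previously archived result
    def done(self, result) -> None: pass                      # register the final result
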
class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        # in config.py we should test that the archivers exist and log mismatches (blocking execution)
        # identify each formatter, storage, database, etc
        self.feeder = Feeder.init(config.feeder, config.get(config.feeder))

        # Is it possible to overwrite config.yaml values? it could be useful: share a config file and modify gsheets_feeder.sheet via the CLI
        # where does that update/processing happen? in config.py
        # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
        self.archivers = [
            Archiver.init(a, config.get(a))
            for a in config.archivers
        ]

        self.enrichments = [
            Enrichment.init(e, config.get(e))
            for e in config.enrichments
        ]

        self.formatters = [
            Formatter.init(f, config.get(f))
            for f in config.formatters
        ]

        self.storages = [
            Storage.init(s, config.get(s))
            for s in config.storages
        ]

        self.databases = [
            Database.init(d, config.get(d))
            for d in config.databases
        ]

        # these rules are checked in config.py
        assert len(self.archivers) >= 1, "there needs to be at least one Archiver"

    def feed(self, feeder: Feeder) -> list[ArchiveResult]:
        for next_item in feeder:
            self.archive(next_item)
            # how does this handle parameters like folder, which can be different for each archiver?
            # the storage needs to know where to archive!!
            # solution: feeders have context: extra metadata that they can read or ignore,
            # all of it should have sensible defaults (eg: folder)
            # default feeder is a list with 1 element

    def archive(self, url: str) -> Union[ArchiveResult, None]:
        url = clear_url(url)
        result = Metadata(url=url)

        should_archive = True
        for d in self.databases: should_archive &= d.should_process(url)
        # should storages also be able to check?
        for s in self.storages: should_archive &= s.should_process(url)

        if not should_archive:
            return "skipping"

        # signal to the DBs that archiving has started
        for d in self.databases:
            # are the databases the ones to decide whether to archive?
            # they can simply return True by default, otherwise they can avoid duplicates.
            # should this logic be more granular, for example on the archiver level: a tweet will not need to be scraped twice,
            # whereas an instagram profile might; the archiver cannot decide from the link alone which parts to archive,
            # instagram profile example: it would always re-archive everything
            # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
            if d.should_process(url):
                d.started(url)
            elif d.exists(url):
                return d.fetch(url)
            else:
                print("Skipping url")
                return

        # vk, telethon, ...
        for a in self.archivers:
            # with automatic try/catch in download + archived (+ the other ops below)
            # should the archivers come with the config already? are there configs which change at runtime?
            # think not, so no need to pass config as a parameter
            # do they need to be refreshed with every execution?
            # this is where the hashes come from, the place with access to all content
            # the archiver does not have access to storage
            result.update(a.download(url))
            if result.is_success(): break

        # what if an archiver returns multiple entries and one is to be part of the HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
        # eg: if it is enabled: it generates an HTML with all the returned media; should it include enrichments? yes
        # then how to execute it last? should there also be post-processors? are there other examples?
        # maybe as a PDF? or a Markdown file
        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
        for e in self.enrichments:
            result.update(e.enrich(result))

        # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"></li>
        for p in self.formatters:
            result.update(p.process(result))

        # storages
        for s in self.storages:
            for m in result.media:
                m.update(s.store(m))

        # signal completion to the databases (DBs, Google Sheets, CSV, ...)
        # a hash registration service could be one database: forensic archiving
        for d in self.databases: d.done(result)

        return result
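
# --- Sketch: how the orchestrator could be created from the orchestration YAML above.
# The Config class here is hypothetical (the notes say this logic lives in config.py); only the
# "steps" / "configurations" layout mirrors the YAML file in this commit. Requires PyYAML.
def _example_boot(path: str = "orchestration.yaml") -> ArchivingOrchestrator:
    import yaml

    with open(path) as f:
        raw = yaml.safe_load(f)

    class Config:
        def __init__(self, raw: dict) -> None:
            steps = raw.get("steps", {})
            self.feeder = steps.get("feeder", "cli_feeder")
            self.archivers = steps.get("archivers", [])
            self.enrichments = steps.get("enrichments", [])
            self.formatters = steps.get("formatters", [])
            self.storages = steps.get("storages", [])
            self.databases = steps.get("databases", [])
            self.configurations = raw.get("configurations", {})

        def get(self, name: str) -> dict:
            # per-step configuration, eg: configurations.gsheets_feeder
            return self.configurations.get(name, {})

    return ArchivingOrchestrator(Config(raw))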