Mirror of https://github.com/bellingcat/auto-archiver
gsheet feeder + db WIP
parent 96845305a3
commit bb512b36c9
@@ -1,7 +1,7 @@
 steps:
   # only 1 feeder allowed
   # a feeder could be in an "infinite loop", for example a gsheets_infinite feeder which holds -> this could be an easy logic addition by modifying the for-each to "while not feeder.done()" if it becomes necessary
-  feeder: gsheets_feeder # default -> only expects URL from CLI
+  feeder: gsheet_feeder # default -> only expects URL from CLI
   archivers: # order matters
     - telethon
     # - tiktok
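The comment above hints at how an always-on feeder could fit the pipeline. A minimal sketch of that idea, assuming a hypothetical done() method that is not part of this commit:

# hypothetical sketch: run a feeder until it reports completion;
# feeder.done() is an assumed method, not part of this commit
def run_feeder(feeder, process):
    items = iter(feeder)
    while not feeder.done():
        try:
            item = next(items)   # an "infinite" feeder could block here until new work arrives
        except StopIteration:
            break                # finite feeders simply exhaust their iterator
        process(item)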
@@ -28,7 +28,7 @@ steps:
 configurations:
   global:
     - save_logs: False
-  gsheets_feeder:
+  gsheet_feeder:
     sheet: my-auto-archiver
     header: 2 # defaults to 1 in GSheetsFeeder
     service_account: "secrets/service_account.json"
@@ -23,6 +23,10 @@ class Archiverv2(Step):
         # used when archivers need to login or do other one-time setup
         pass

+    def clean_url(self, url: str) -> str:
+        # used to clean unnecessary URL parameters
+        return url
+
     def _guess_file_type(self, path: str) -> str:
         """
         Receives a URL or filename and returns global mimetype like 'image' or 'video'
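clean_url is a no-op hook by default; a subclass can override it to strip tracking parameters before archiving. A sketch under that assumption (the subclass is hypothetical, not in this commit):

from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

from archivers import Archiverv2

class UtmStrippingArchiver(Archiverv2):  # hypothetical subclass for illustration
    name = "utm_stripping_archiver"

    def clean_url(self, url: str) -> str:
        # drop utm_* tracking query parameters, keep everything else
        parts = urlparse(url)
        query = [(k, v) for k, v in parse_qsl(parts.query) if not k.startswith("utm_")]
        return urlunparse(parts._replace(query=urlencode(query)))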
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
 from typing import List
 from archivers import Archiverv2
 from feeders import Feeder
+from databases import Database
+from storages import StorageV2
 from steps.step import Step
 from enrichers import Enricher
 from collections import defaultdict
@@ -13,10 +15,13 @@ from collections import defaultdict
 @dataclass
 class ConfigV2:
     # TODO: should Config inherit from Step so it can have its own configurations?
+    # these are only detected if they are put into the respective __init__.py
     configurable_parents = [
         Feeder,
         Enricher,
         Archiverv2,
+        Database,
+        StorageV2
         # Util
     ]
     feeder: Step  # TODO: = BaseFeeder
@@ -24,14 +29,14 @@ class ConfigV2:
     enrichers: List[Enricher] = field(default_factory=[])
     formatters: List[Step] = field(default_factory=[])  # TODO: fix type
     storages: List[Step] = field(default_factory=[])  # TODO: fix type
-    databases: List[Step] = field(default_factory=[])  # TODO: fix type
+    databases: List[Database] = field(default_factory=[])

     def __init__(self) -> None:
         self.defaults = {}
         self.cli_ops = {}
         self.config = {}

-    # TODO: make this work for nested props like gsheets_feeder.columns.url = "URL"
+    # TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
     def parse(self):
         # 1. parse CLI values
         parser = argparse.ArgumentParser(
@@ -84,10 +89,12 @@ class ConfigV2:
         self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
         self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
         self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
+        self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]

         print("feeder", self.feeder)
         print("enrichers", [e for e in self.enrichers])
         print("archivers", [e for e in self.archivers])
+        print("databases", [e for e in self.databases])

     def validate(self):
         pass
@@ -0,0 +1,2 @@
+from .database import Database
+from .gsheet_db import GsheetsDb
@@ -1,9 +1,11 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod, ABC
+from typing import Union
 from metadata import Metadata
 from steps.step import Step


 @dataclass
 class Database(Step, ABC):
     name = "database"
@@ -12,10 +14,29 @@ class Database(Step, ABC):
         # without this STEP.__init__ is not called
         super().__init__(config)

-    # only for typing...
     def init(name: str, config: dict) -> Database:
+        # only for typing...
         return Step.init(name, config, Database)

     @abstractmethod
-    def enrich(self, item: Metadata) -> Metadata: pass
+    def started(self, item: Metadata) -> None:
+        """signals the DB that the given item archival has started"""
+        pass
+
+    def failed(self, item: Metadata) -> None:
+        """update DB accordingly for failure"""
+        pass
+
+    def aborted(self, item: Metadata) -> None:
+        """abort notification if user cancelled after start"""
+        pass
+
+    # @abstractmethod
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check if the given item has been archived already"""
+        return False
+
+    @abstractmethod
+    def done(self, item: Metadata) -> None:
+        """archival result ready - should be saved to DB"""
+        pass
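Only started and done are marked @abstractmethod; failed, aborted, and fetch come with defaults, so a minimal concrete database is small. A sketch (the class is hypothetical, not part of this commit):

from databases import Database
from metadata import Metadata

class ConsoleDb(Database):  # hypothetical example
    name = "console_db"

    def __init__(self, config: dict) -> None:
        # without this Step.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        return {}

    def started(self, item: Metadata) -> None:
        print(f"started {item.get_url()}")

    def done(self, item: Metadata) -> None:
        print(f"done {item.get_url()}")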
@@ -0,0 +1,64 @@
+from typing import Union, Tuple
+import gspread
+
+# from metadata import Metadata
+from loguru import logger
+
+# from . import Enricher
+from databases import Database
+from metadata import Metadata
+from steps.gsheet import Gsheets
+from utils import GWorksheet
+
+
+class GsheetsDb(Database):
+    """
+    NB: only works if GsheetFeeder is used.
+    could be updated in the future to support non-GsheetFeeder metadata
+    """
+    name = "gsheet_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {}
+
+    def started(self, item: Metadata) -> None:
+        logger.warning(f"STARTED {item}")
+        gw, row = self._retrieve_gsheet(item)
+        gw.set_cell(row, 'status', 'Archive in progress')
+
+    def failed(self, item: Metadata) -> None:
+        logger.error(f"FAILED {item}")
+        self._safe_status_update(item, 'Archive failed')
+
+    def aborted(self, item: Metadata) -> None:
+        logger.warning(f"ABORTED {item}")
+        self._safe_status_update(item, '')
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check if the given item has been archived already"""
+        # TODO: this should not be done at the feeder stage then!
+        return False
+
+    def done(self, item: Metadata) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item}")
+        gw, row = self._retrieve_gsheet(item)
+        self._safe_status_update(item, 'done')
+        pass
+
+    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
+        try:
+            gw, row = self._retrieve_gsheet(item)
+            gw.set_cell(row, 'status', new_status)
+        except Exception as e:
+            logger.debug(f"Unable to update sheet: {e}")
+
+    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
+        gw: GWorksheet = item.get("gsheet").get("worksheet")
+        row: int = item.get("gsheet").get("row")
+        return gw, row
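_retrieve_gsheet relies on a contract with GsheetsFeeder: the feeder stores the worksheet and row as temporary metadata under the "gsheet" key (see the feeder hunk further down), and the database reads them back:

# producer side (GsheetsFeeder.__iter__, later in this commit):
item = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)

# consumer side (GsheetsDb._retrieve_gsheet):
gw = item.get("gsheet").get("worksheet")
row = item.get("gsheet").get("row")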
@@ -1,2 +1,2 @@
 from .enricher import Enricher
-from .enricher_screenshot import ScreenshotEnricher
+from .screenshot_enricher import ScreenshotEnricher
@@ -1,2 +1,2 @@
 from .feeder import Feeder
-from .feeder_gsheet import GsheetsFeeder
+from .gsheet_feeder import GsheetsFeeder
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod
-# from metadata import Metadata
+from metadata import Metadata
 from steps.step import Step

@@ -17,7 +17,5 @@ class Feeder(Step):
         # only for code typing
         return Step.init(name, config, Feeder)

-    # def feed(self, item: Metadata) -> Metadata: pass
-
     @abstractmethod
-    def __iter__(self) -> Feeder: return None
+    def __iter__(self) -> Metadata: return None
@@ -1,16 +1,17 @@
-import json, gspread
+import gspread

 # from metadata import Metadata
 from loguru import logger

 # from . import Enricher
 from feeders import Feeder
+from metadata import Metadata
 from steps.gsheet import Gsheets
 from utils import GWorksheet


 class GsheetsFeeder(Gsheets, Feeder):
-    name = "gsheets_feeder"
+    name = "gsheet_feeder"

     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
|
@ -35,7 +36,7 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
def __iter__(self) -> str:
|
def __iter__(self) -> Metadata:
|
||||||
sh = self.gsheets_client.open(self.sheet)
|
sh = self.gsheets_client.open(self.sheet)
|
||||||
for ii, wks in enumerate(sh.worksheets()):
|
for ii, wks in enumerate(sh.worksheets()):
|
||||||
if not self.should_process_sheet(wks.title):
|
if not self.should_process_sheet(wks.title):
|
||||||
|
@ -52,17 +53,16 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||||
url = gw.get_cell(row, 'url').strip()
|
url = gw.get_cell(row, 'url').strip()
|
||||||
if not len(url): continue
|
if not len(url): continue
|
||||||
# TODO: gsheet_db should check later if this is supposed to be archived
|
|
||||||
# static_status = gw.get_cell(row, 'status')
|
|
||||||
# status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
|
|
||||||
# All checks done - archival process starts here
|
|
||||||
yield url
|
|
||||||
logger.success(f'Finished worksheet {wks.title}')
|
|
||||||
|
|
||||||
# GWorksheet(self.sheet)
|
original_status = gw.get_cell(row, 'status')
|
||||||
print(self.sheet)
|
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||||
for u in ["url1", "url2"]:
|
# TODO: custom status parser(?) aka should_retry_from_status
|
||||||
yield u
|
if status not in ['', None]: continue
|
||||||
|
|
||||||
|
# All checks done - archival process starts here
|
||||||
|
yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
|
||||||
|
|
||||||
|
logger.success(f'Finished worksheet {wks.title}')
|
||||||
|
|
||||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
|
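With this change the feeder yields Metadata items instead of bare URL strings, so downstream steps receive the worksheet context along with the URL. A hedged consumption sketch, assuming a configured GsheetsFeeder instance:

for item in feeder:                  # feeder: a configured GsheetsFeeder
    url = item.get_url()
    ctx = item.get("gsheet")         # {"row": ..., "worksheet": ...}, stored with is_tmp=True
    print(f"archiving row {ctx['row']}: {url}")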
@@ -1,6 +1,6 @@

 from __future__ import annotations
-from ast import List
+from ast import List, Set
 from typing import Any, Union, Dict
 from dataclasses import dataclass, field
 from datetime import datetime
|
@ -13,7 +13,13 @@ from media import Media
|
||||||
class Metadata:
|
class Metadata:
|
||||||
status: str = ""
|
status: str = ""
|
||||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
|
rearchivable: bool = False
|
||||||
|
|
||||||
|
# def __init__(self, url, metadata = {}) -> None:
|
||||||
|
# self.set_url(url)
|
||||||
|
# self.metadata = metadata
|
||||||
|
|
||||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||||
"""
|
"""
|
||||||
|
@@ -21,6 +27,7 @@ class Metadata:
         """
         if overwrite_left:
             self.status = right.status
+            self.rearchivable |= right.rearchivable
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:
|
@ -33,8 +40,10 @@ class Metadata:
|
||||||
return right.merge(self)
|
return right.merge(self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set(self, key: str, val: Any) -> Metadata:
|
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
|
||||||
|
# if not self.metadata: self.metadata = {}
|
||||||
self.metadata[key] = val
|
self.metadata[key] = val
|
||||||
|
if is_tmp: self.tmp_keys.add(key)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
||||||
|
@@ -75,3 +84,12 @@ class Metadata:
     # # converts all metadata and data into JSON
     # return json.dumps(self.metadata)
     # #TODO: datetime is not serializable
+
+    def cleanup(self) -> Metadata:
+        # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
+        # the code below leads to errors if the database needs tmp_keys after they are removed
+        # """removes temporary metadata fields, ideally called after all ops except writing"""
+        # for tmp_key in self.tmp_keys:
+        #     self.metadata.pop(tmp_key, None)
+        # self.tmp_keys = set()
+        pass
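Together, set(..., is_tmp=True), rearchivable, and cleanup() give Metadata a notion of transient state. A small usage sketch (values are illustrative):

m = Metadata()
m.set_url("https://example.com/post/1")
m.set("title", "example post")               # regular metadata, kept for DBs
m.set("tmp_dir", "/tmp/archive-abc", True)   # is_tmp=True -> tracked in tmp_keys
assert "tmp_dir" in m.tmp_keys
m.cleanup()  # currently a no-op: dropping tmp_keys too early breaks databases that still need them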
@@ -5,8 +5,11 @@ from dataclasses import dataclass
 from archivers.archiver import Archiverv2

 from enrichers.enricher import Enricher
+from databases.database import Database
 from metadata import Metadata
-import tempfile, time
+import tempfile, time, traceback
+from loguru import logger


 """
 how not to couple the different pieces of logic
@@ -119,7 +122,7 @@ class ArchivingOrchestrator:
         # identify each formatter, storage, database, etc
         # self.feeder = Feeder.init(config.feeder, config.get(config.feeder))

-        # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
+        # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
         # where does that update/processing happen? in config.py
         # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
         # self.archivers = [
@@ -129,12 +132,12 @@ class ArchivingOrchestrator:
         self.feeder = config.feeder
         self.enrichers = config.enrichers
         self.archivers: List[Archiverv2] = config.archivers
+        self.databases: List[Database] = config.databases

         for a in self.archivers: a.setup()

         self.formatters = []
         self.storages = []
-        self.databases = []
         # self.formatters = [
         #     Formatter.init(f, config)
         #     for f in config.formatters
@@ -154,51 +157,61 @@ class ArchivingOrchestrator:
         # assert len(archivers) > 1, "there needs to be at least one Archiver"

     def feed(self) -> list(Metadata):
-        for url in self.feeder:
-            print("ARCHIVING", url)
-            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                result = self.archive(url, tmp_dir)
-                print(type(result))
-                print(result)
-                # print(result.as_json())
-            print("holding on")
+        for item in self.feeder:
+            print("ARCHIVING", item)
+            try:
+                with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
+                    item.set("tmp_dir", tmp_dir, True)
+                    result = self.archive(item)
+                    print(result)
+            except KeyboardInterrupt:
+                # catches keyboard interruptions to do a clean exit
+                logger.warning(f"caught interrupt on {item=}")
+                for d in self.databases: d.aborted(item)
+                exit()
+            except Exception as e:
+                logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
+                for d in self.databases: d.failed(item)
+
+            print("holding on 5min")
             time.sleep(300)

    # how does this handle the parameters like folder which can be different for each archiver?
    # the storage needs to know where to archive!!
    # solution: feeders have context: extra metadata that they can read or ignore,
    # all of it should have sensible defaults (eg: folder)
    # default feeder is a list with 1 element

-    def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]:
-        # TODO:
-        # url = clear_url(url) # should we save if they differ?
-        # result = Metadata(url=url)
-        result = Metadata()
+    def archive(self, result: Metadata) -> Union[Metadata, None]:
+        url = result.get_url()
+        # TODO: clean urls
+        for a in self.archivers:
+            url = a.clean_url(url)
         result.set_url(url)
-        result.set("tmp_dir", tmp_dir)
-        should_archive = True
-        for d in self.databases: should_archive &= d.should_process(url)
+        # should_archive = False
+        # for d in self.databases: should_archive |= d.should_process(url)
         # should storages also be able to check?
-        for s in self.storages: should_archive &= s.should_process(url)
+        # for s in self.storages: should_archive |= s.should_process(url)

-        if not should_archive:
-            print("skipping")
-            return "skipping"
+        # if not should_archive:
+        #     print("skipping")
+        #     return "skipping"

         # signal to DB that archiving has started
+        # and propagate already archived if it exists
+        cached_result = None
         for d in self.databases:
             # are the databases to decide whether to archive?
             # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need to be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
             # instagram profile example: it would always re-archive everything
             # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
-            if d.should_process(url):
-                d.started(url)
-            elif d.exists(url):
-                return d.fetch(url)
-            else:
-                print("Skipping url")
-                return
+            d.started(result)
+            if (local_result := d.fetch(result)):
+                cached_result = (cached_result or Metadata()).merge(local_result)
+        if cached_result and not cached_result.rearchivable:
+            for d in self.databases:
+                d.done(cached_result)
+            return cached_result

         # vk, telethon, ...
         for a in self.archivers:
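The net effect is a full per-item lifecycle for databases: started on entry, fetch to short-circuit with a cached result unless it is rearchivable, aborted/failed from the feed loop's exception handlers, and done on completion. A condensed restatement of the control flow above (a sketch, not new behavior):

from metadata import Metadata

def lifecycle(item: Metadata, databases: list) -> None:
    """Condensed restatement of the feed()/archive() database flow above."""
    try:
        for d in databases: d.started(item)       # GsheetsDb: status -> "Archive in progress"
        cached = None
        for d in databases:
            if (hit := d.fetch(item)):            # a DB may hand back a previously archived result
                cached = (cached or Metadata()).merge(hit)
        if cached and not cached.rearchivable:
            for d in databases: d.done(cached)    # reuse the cached result, skip archiving
            return
        # ... archivers / enrichers / formatters / storages run here ...
        for d in databases: d.done(item)          # GsheetsDb: status -> "done"
    except KeyboardInterrupt:
        for d in databases: d.aborted(item)       # GsheetsDb: clears the status cell
    except Exception:
        for d in databases: d.failed(item)        # GsheetsDb: status -> "Archive failed"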
@@ -209,6 +222,7 @@ class ArchivingOrchestrator:
             # this is where the Hashes come from, the place with access to all content
             # the archiver does not have access to storage
             result.merge(a.download(result))
+            # TODO: fix logic
             if True or result.is_success(): break

         # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
@@ -224,13 +238,14 @@ class ArchivingOrchestrator:
         for f in self.formatters:
             result.merge(f.format(result))

-        # storages
+        # storage
         for s in self.storages:
             for m in result.media:
-                m.merge(s.store(m))
+                result.merge(s.store(m))

         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         # a hash registration service could be one database: forensic archiving
+        result.cleanup()
         for d in self.databases: d.done(result)

         return result
@@ -14,7 +14,7 @@ class Step(ABC):
     def __init__(self, config: dict) -> None:
         # reads the configs into object properties
         # self.config = config[self.name]
-        for k, v in config[self.name].items():
+        for k, v in config.get(self.name, {}).items():
             self.__setattr__(k, v)

     @staticmethod
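The switch to config.get(self.name, {}) means a step with no section in the config file no longer raises a KeyError during __init__; it simply gets no extra attributes. For example:

config = {"gsheet_feeder": {"sheet": "my-auto-archiver"}}

name = "gsheet_db"                           # step enabled, but no config section provided
for k, v in config.get(name, {}).items():   # old code: config[name] -> KeyError
    print(k, v)                              # no iterations; the step keeps its defaults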
@@ -5,3 +5,4 @@ from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage

 from .storage import StorageV2
+from .s3 import S3StorageV2