fix config parsing in manifests, remove module level configs

pull/183/head
erinhmclark 2025-01-24 13:33:12 +00:00
parent 0453d95f56
commit 024fe58377
16 changed files with 23 additions and 191 deletions
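In short: this commit deletes the per-module `configs()` static methods (several of them already commented out) so that configuration is declared only in each module's manifest, and it fixes the manifests so that `type` entries are dotted-path strings rather than imported callables. A minimal sketch of the manifest shape these hunks imply, modelled on the removed `CSVFeeder.configs()`; the surrounding keys and layout are assumptions, not code from this diff:

# Hypothetical manifest sketch; shape inferred from the hunks below.
# "type" is a plain string, so the manifest parses without importing utils.
{
    "name": "csv_feeder",
    "configs": {
        "files": {
            "default": None,
            "help": "Path to the input file(s) to read the URLs from, comma separated.",
            "type": "auto_archiver.utils.parse_csv_to_set",
        },
        "column": {
            "default": None,
            "help": "Column number or name to read the URLs from, 0-indexed",
        },
    },
}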

View file

@@ -15,10 +15,6 @@ class AtlosStorage(Storage):
    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        return dict(Storage.configs(), **get_atlos_config_options())

    def get_cdn_url(self, _media: Media) -> str:
        # It's not always possible to provide an exact URL, because it's
        # possible that the media once uploaded could have been copied to

View file

@@ -22,11 +22,6 @@ class AtlosDb(Database):
        # without this STEP.__init__ is not called
        super().__init__(config)
        # TODO

    @staticmethod
    def configs() -> dict:
        return get_atlos_config_options()

    def failed(self, item: Metadata, reason: str) -> None:
        """Update DB accordingly for failure"""
        # If the item has no Atlos ID, there's nothing for us to do

View file

@@ -15,11 +15,6 @@ class AtlosFeeder(Feeder):
        if type(self.api_token) != str:
            raise Exception("Atlos Feeder did not receive an Atlos API token")
        # TODO

    @staticmethod
    def configs() -> dict:
        return get_atlos_config_options()

    def __iter__(self) -> Metadata:
        # Get all the urls from the Atlos API
        count = 0

View file

@@ -13,16 +13,6 @@ class CLIFeeder(Feeder):
        if type(self.urls) != list or len(self.urls) == 0:
            raise Exception("CLI Feeder did not receive any URL to process")

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "urls": {
    #             "default": None,
    #             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
    #             "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
    #         },
    #     }

    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing {url}")

View file

@@ -9,23 +9,6 @@ class CSVFeeder(Feeder):
    name = "csv_feeder"

    @staticmethod
    def configs() -> dict:
        return {
            "files": {
                "default": None,
                "help": "Path to the input file(s) to read the URLs from, comma separated. \
                        Input files should be formatted with one URL per line",
                "type": "auto_archiver.utils.parse_csv_to_set",
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            }
        }

    def __iter__(self) -> Metadata:
        url_column = self.column or 0
        for file in self.files:

View file

@@ -58,16 +58,6 @@ class GDriveStorage(Storage):
        self.service = build('drive', 'v3', credentials=creds)

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
            })

    def get_cdn_url(self, media: Media) -> str:
        """
        only support files saved in a folder for GD

View file

@@ -14,7 +14,7 @@
        "block_worksheets": {
            "default": set(),
            "help": "(CSV) explicitly block some worksheets from being processed",
            "type": auto_archiver.utils.parse_csv_to_set,
            "type": "auto_archiver.utils.parse_csv_to_set",
        },
        "use_sheet_names_in_stored_paths": {
            "default": True,

View file

@@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder):
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)

    # @staticmethod
    # def configs() -> dict:
    #     return dict(
    #         Gsheets.configs(),
    #         ** {
    #             "allow_worksheets": {
    #                 "default": set(),
    #                 "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #             },
    #             "block_worksheets": {
    #                 "default": set(),
    #                 "help": "(CSV) explicitly block some worksheets from being processed",
    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #             },
    #             "use_sheet_names_in_stored_paths": {
    #                 "default": True,
    #                 "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
    #             }
    #         })

    def __iter__(self) -> Metadata:
        sh = self.open_sheet()
        for ii, wks in enumerate(sh.worksheets()):

View file

@@ -28,12 +28,6 @@ class HtmlFormatter(Formatter):
        })
        self.template = self.environment.get_template("html_template.html")

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
    #     }

    def format(self, item: Metadata) -> Media:
        url = item.get_url()
        if item.is_empty():

View file

@@ -15,15 +15,6 @@ class LocalStorage(Storage):
        super().__init__(config)
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
            })

    def get_cdn_url(self, media: Media) -> str:
        # TODO: is this viable with Storage.configs on path/filename?
        dest = os.path.join(self.save_to, media.key)

View file

@@ -26,27 +26,6 @@ class S3Storage(Storage):
        if self.random_no_duplicate:
            logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "bucket": {"default": None, "help": "S3 bucket name"},
                "region": {"default": None, "help": "S3 region name"},
                "key": {"default": None, "help": "S3 API key"},
                "secret": {"default": None, "help": "S3 API secret"},
                "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
                "endpoint_url": {
                    "default": 'https://{region}.digitaloceanspaces.com',
                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
                },
                "cdn_url": {
                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
                },
                "private": {"default": False, "help": "if true S3 files will not be readable online"},
            })

    def get_cdn_url(self, media: Media) -> str:
        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
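As the help strings above say, `{bucket}`, `{region}` and `{key}` are inserted at runtime, which is exactly what `get_cdn_url` does with `str.format`. For example, with illustrative values:

# Illustrative values only, applied to the default cdn_url template:
"https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}".format(
    bucket="my-archive", region="fra1", key="2025/01/example.png"
)
# -> "https://my-archive.fra1.cdn.digitaloceanspaces.com/2025/01/example.png"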

View file

@@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        super().__init__(config)
        # TODO?

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "width": {"default": 1280, "help": "width of the screenshots"},
    #         "height": {"default": 720, "help": "height of the screenshots"},
    #         "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
    #         "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
    #         "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
    #         "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
    #         "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
    #     }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

View file

@@ -15,13 +15,28 @@
    "configs": {
        "tsa_urls": {
            "default": [
                "http://timestamp.digicert.com",
                "http://timestamp.identrust.com",
                "http://timestamp.globalsign.com/tsa/r6advanced1",
                "http://tss.accv.es:8318/tsa"
            ],
                # [Adobe Approved Trust List] and [Windows Cert Store]
                "http://timestamp.digicert.com",
                "http://timestamp.identrust.com",
                # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
                # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
                # [Adobe: European Union Trusted Lists].
                # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
                # [Windows Cert Store]
                "http://timestamp.globalsign.com/tsa/r6advanced1",
                # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
                # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
                # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
                # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
                # "http://tsa.sep.bg", # self-signed certificate in certificate chain
                # "http://tsa.izenpe.com", #unable to get local issuer certificate
                # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
                "http://tss.accv.es:8318/tsa",
            ],
            "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
            "type": auto_archiver.utils.parse_csv_to_set,
            "type": "auto_archiver.utils.parse_csv_to_set",
        }
    },
    "description": """

View file

@@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        super().__init__(config)

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "tsa_urls": {
    #             "default": [
    #                 # [Adobe Approved Trust List] and [Windows Cert Store]
    #                 "http://timestamp.digicert.com",
    #                 "http://timestamp.identrust.com",
    #                 # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
    #                 # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
    #
    #                 # [Adobe: European Union Trusted Lists].
    #                 # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
    #
    #                 # [Windows Cert Store]
    #                 "http://timestamp.globalsign.com/tsa/r6advanced1",
    #
    #                 # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
    #                 # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
    #                 # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
    #                 # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
    #                 # "http://tsa.sep.bg", # self-signed certificate in certificate chain
    #                 # "http://tsa.izenpe.com", #unable to get local issuer certificate
    #                 # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
    #                 "http://tss.accv.es:8318/tsa",
    #             ],
    #             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
    #             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #         }
    #     }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        logger.debug(f"RFC3161 timestamping existing files for {url=}")

View file

@@ -12,7 +12,7 @@
    "configs": {
        "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
        "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
            "type": auto_archiver.utils.parse_csv_to_set,},
            "type": "auto_archiver.utils.parse_csv_to_set",},
        "consumer_key": {"default": None, "help": "twitter API consumer_key"},
        "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
        "access_token": {"default": None, "help": "twitter API access_token"},

View file

@@ -16,35 +16,6 @@ class Gsheets(Step):
        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
        assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."

    @staticmethod
    def configs() -> dict:
        return {
            "sheet": {"default": None, "help": "name of the sheet to archive"},
            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
            "columns": {
                "default": {
                    'url': 'link',
                    'status': 'archive status',
                    'folder': 'destination folder',
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'pdq_hash': 'perceptual hashes',
                    'wacz': 'wacz',
                    'replaywebpage': 'replaywebpage',
                },
                "help": "names of columns in the google sheet (stringified JSON object)",
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            },
        }

    def open_sheet(self):
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
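Note the removed `columns` entry relied on a `cli_set` lambda, which cannot live in a declarative manifest; that is presumably part of why these blocks had to go. What that lambda did, written out as a named function (`merge_columns` is a hypothetical name for illustration):

import json

def merge_columns(cli_val: str, cur_val: dict) -> dict:
    # overlay the user-supplied JSON object from the CLI onto the default mapping
    return dict(cur_val, **json.loads(cli_val))

# e.g. merge_columns('{"url": "source link"}', {"url": "link", "status": "archive status"})
# -> {"url": "source link", "status": "archive status"}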