mirror of https://github.com/bellingcat/auto-archiver
fix config parsing in manifests, remove module level configs
parent 0453d95f56
commit 024fe58377
@@ -15,10 +15,6 @@ class AtlosStorage(Storage):
     def __init__(self, config: dict) -> None:
         super().__init__(config)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(Storage.configs(), **get_atlos_config_options())
-
     def get_cdn_url(self, _media: Media) -> str:
         # It's not always possible to provide an exact URL, because it's
         # possible that the media once uploaded could have been copied to

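This hunk establishes the pattern repeated through the rest of the commit: per-class configs() staticmethods are deleted because each module's options now live in its manifest. As a rough sketch, an equivalent manifest entry could look like the following, modelled on the manifests edited further down; the Atlos-specific option names here are assumptions, not taken from the repository:

    # Illustrative manifest snippet only -- the structure mirrors the gsheet
    # and twitter manifests edited later in this commit; the Atlos options
    # shown are assumptions.
    manifest = {
        "name": "atlos_storage",
        "configs": {
            "api_token": {"default": None, "help": "Atlos API token"},
            # only plain values and dotted-path strings are allowed, e.g.
            # "type": "auto_archiver.utils.parse_csv_to_set" -- no live
            # callables, so the manifest parses without importing the package
        },
    }
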
@@ -22,11 +22,6 @@ class AtlosDb(Database):
         # without this STEP.__init__ is not called
         super().__init__(config)
 
-    # TODO
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()
-
     def failed(self, item: Metadata, reason: str) -> None:
         """Update DB accordingly for failure"""
         # If the item has no Atlos ID, there's nothing for us to do

@@ -15,11 +15,6 @@ class AtlosFeeder(Feeder):
         if type(self.api_token) != str:
             raise Exception("Atlos Feeder did not receive an Atlos API token")
 
-    # TODO
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()
-
     def __iter__(self) -> Metadata:
         # Get all the urls from the Atlos API
         count = 0

@@ -13,16 +13,6 @@ class CLIFeeder(Feeder):
         if type(self.urls) != list or len(self.urls) == 0:
             raise Exception("CLI Feeder did not receive any URL to process")
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "urls": {
-    #             "default": None,
-    #             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-    #             "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
-    #         },
-    #     }
-
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")

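The deleted cli_set lambdas are why these blocks could not move into manifests verbatim: a lambda has no declarative representation, so CSV handling is delegated to an importable, named helper instead. Judging from the lambda it replaces, auto_archiver.utils.parse_csv_to_set plausibly behaves like this sketch (an assumption, not the verified implementation):

    # Assumed behaviour of parse_csv_to_set, inferred from the removed
    # lambda cli_val, cur_val: list(set(cli_val.split(","))):
    def parse_csv_to_set(cli_val: str) -> set:
        return set(cli_val.split(","))

    assert parse_csv_to_set("url1,url2,url1") == {"url1", "url2"}
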
@@ -9,23 +9,6 @@ class CSVFeeder(Feeder):
 
     name = "csv_feeder"
 
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "files": {
-                "default": None,
-                "help": "Path to the input file(s) to read the URLs from, comma separated. \
-                    Input files should be formatted with one URL per line",
-                "type": "auto_archiver.utils.parse_csv_to_set",
-            },
-            "column": {
-                "default": None,
-                "help": "Column number or name to read the URLs from, 0-indexed",
-            }
-        }
-
-
     def __iter__(self) -> Metadata:
         url_column = self.column or 0
         for file in self.files:

@@ -58,16 +58,6 @@ class GDriveStorage(Storage):
 
         self.service = build('drive', 'v3', credentials=creds)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
-                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
-                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         """
         only support files saved in a folder for GD

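For context on the retained build() call: the creds it receives are typically constructed from the service_account file configured above. A hedged sketch using google-auth; the exact scopes and construction used by auto-archiver are not shown in this diff:

    from google.oauth2 import service_account
    from googleapiclient.discovery import build

    # build credentials from the configured service account JSON file,
    # then construct the Drive v3 client exactly as the retained line does
    creds = service_account.Credentials.from_service_account_file(
        "secrets/service_account.json",
        scopes=["https://www.googleapis.com/auth/drive"],
    )
    service = build('drive', 'v3', credentials=creds)
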
@@ -14,7 +14,7 @@
         "block_worksheets": {
             "default": set(),
             "help": "(CSV) explicitly block some worksheets from being processed",
-            "type": auto_archiver.utils.parse_csv_to_set,
+            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "use_sheet_names_in_stored_paths": {
             "default": True,

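This two-line change is the "fix config parsing in manifests" part of the commit: the type field becomes a dotted-path string rather than a live function reference, so the manifest can be read without importing auto_archiver first. A minimal sketch of turning such a string back into its callable at parse time; resolve_type is a hypothetical helper name, not the project's actual API:

    from importlib import import_module

    def resolve_type(dotted_path: str):
        """Resolve 'pkg.module.attr' to the attribute it names."""
        module_name, _, attr = dotted_path.rpartition(".")
        return getattr(import_module(module_name), attr)

    # stdlib demo, runnable without auto_archiver installed:
    assert resolve_type("json.loads")("[1, 2]") == [1, 2]
    # with the package installed, the manifest value resolves the same way:
    # parse_csv_to_set = resolve_type("auto_archiver.utils.parse_csv_to_set")
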
@@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder):
         super().__init__(config)
         self.gsheets_client = gspread.service_account(filename=self.service_account)
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return dict(
-    #         Gsheets.configs(),
-    #         ** {
-    #             "allow_worksheets": {
-    #                 "default": set(),
-    #                 "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #             },
-    #             "block_worksheets": {
-    #                 "default": set(),
-    #                 "help": "(CSV) explicitly block some worksheets from being processed",
-    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #             },
-    #             "use_sheet_names_in_stored_paths": {
-    #                 "default": True,
-    #                 "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
-    #             }
-    #         })
-
     def __iter__(self) -> Metadata:
         sh = self.open_sheet()
         for ii, wks in enumerate(sh.worksheets()):

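The allow/block help strings above imply a filter where a non-empty allow set overrides the block set and an empty allow set permits everything. A small sketch of that rule, inferred from the help text rather than copied from the implementation:

    # should_process is a hypothetical helper illustrating the documented
    # semantics: allow overrides block; empty allow means all are allowed
    def should_process(name: str, allow: set, block: set) -> bool:
        if allow:
            return name in allow
        return name not in block

    assert should_process("Sheet1", set(), {"Sheet2"})       # not blocked
    assert not should_process("Sheet1", {"Sheet2"}, set())   # not in allow
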
@@ -28,12 +28,6 @@ class HtmlFormatter(Formatter):
         })
         self.template = self.environment.get_template("html_template.html")
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
-    #     }
-
     def format(self, item: Metadata) -> Media:
         url = item.get_url()
         if item.is_empty():

@@ -15,15 +15,6 @@ class LocalStorage(Storage):
         super().__init__(config)
         os.makedirs(self.save_to, exist_ok=True)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         # TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)

@@ -26,27 +26,6 @@ class S3Storage(Storage):
         if self.random_no_duplicate:
             logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "bucket": {"default": None, "help": "S3 bucket name"},
-                "region": {"default": None, "help": "S3 region name"},
-                "key": {"default": None, "help": "S3 API key"},
-                "secret": {"default": None, "help": "S3 API secret"},
-                "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
-                "endpoint_url": {
-                    "default": 'https://{region}.digitaloceanspaces.com',
-                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
-                },
-                "cdn_url": {
-                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-                },
-                "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
 

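The surviving get_cdn_url is a plain str.format over the configured template. With the default cdn_url and made-up bucket/region/key values:

    # bucket, region and key values below are invented for illustration
    cdn_url = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    print(cdn_url.format(bucket="my-archive", region="fra1", key="media/image.png"))
    # -> https://my-archive.fra1.cdn.digitaloceanspaces.com/media/image.png
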
@@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher):
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
-        # TODO?
-
-
-
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "width": {"default": 1280, "help": "width of the screenshots"},
-    #         "height": {"default": 720, "help": "height of the screenshots"},
-    #         "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-    #         "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
-    #         "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-    #         "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-    #         "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
-    #     }
 
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()

@@ -15,13 +15,28 @@
     "configs": {
         "tsa_urls": {
             "default": [
-                "http://timestamp.digicert.com",
-                "http://timestamp.identrust.com",
-                "http://timestamp.globalsign.com/tsa/r6advanced1",
-                "http://tss.accv.es:8318/tsa"
-            ],
+                # [Adobe Approved Trust List] and [Windows Cert Store]
+                "http://timestamp.digicert.com",
+                "http://timestamp.identrust.com",
+                # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
+                # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
+
+                # [Adobe: European Union Trusted Lists].
+                # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
+
+                # [Windows Cert Store]
+                "http://timestamp.globalsign.com/tsa/r6advanced1",
+                # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
+                # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
+                # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
+                # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
+                # "http://tsa.sep.bg", # self-signed certificate in certificate chain
+                # "http://tsa.izenpe.com", # unable to get local issuer certificate
+                # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
+                "http://tss.accv.es:8318/tsa",
+            ],
             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-            "type": auto_archiver.utils.parse_csv_to_set,
+            "type": "auto_archiver.utils.parse_csv_to_set",
         }
     },
     "description": """

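Several authorities are configured because individual TSAs fail in practice; the commented-out entries record the observed failure modes (not valid for timestamping, rate limits, broken certificate chains). A caller would plausibly try each URL until one succeeds, as in this sketch, where timestamp_with is a placeholder for the enricher's real RFC3161 client call (not shown in this diff):

    from hashlib import sha256

    def timestamp_with(tsa_url: str, digest: bytes) -> bytes:
        # placeholder: a real client would POST an RFC3161 TimeStampReq to
        # tsa_url and return the TimeStampResp token
        raise NotImplementedError(tsa_url)

    def first_successful_timestamp(data: bytes, tsa_urls: list) -> bytes:
        digest = sha256(data).digest()
        for url in tsa_urls:
            try:
                return timestamp_with(url, digest)
            except Exception as e:
                print(f"TSA {url} failed: {e}")  # fall through to the next TSA
        raise RuntimeError("no TSA succeeded")
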
@@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher):
     def __init__(self, config: dict) -> None:
         super().__init__(config)
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "tsa_urls": {
-    #             "default": [
-    #                 # [Adobe Approved Trust List] and [Windows Cert Store]
-    #                 "http://timestamp.digicert.com",
-    #                 "http://timestamp.identrust.com",
-    #                 # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
-    #                 # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
-    #
-    #                 # [Adobe: European Union Trusted Lists].
-    #                 # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
-    #
-    #                 # [Windows Cert Store]
-    #                 "http://timestamp.globalsign.com/tsa/r6advanced1",
-    #
-    #                 # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
-    #                 # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
-    #                 # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
-    #                 # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
-    #                 # "http://tsa.sep.bg", # self-signed certificate in certificate chain
-    #                 # "http://tsa.izenpe.com", # unable to get local issuer certificate
-    #                 # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
-    #                 "http://tss.accv.es:8318/tsa",
-    #             ],
-    #             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-    #             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #         }
-    #     }
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
         logger.debug(f"RFC3161 timestamping existing files for {url=}")

@@ -12,7 +12,7 @@
     "configs": {
         "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
         "bearer_tokens": {"default": [], "help": "a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
-                          "type": auto_archiver.utils.parse_csv_to_set,},
+                          "type": "auto_archiver.utils.parse_csv_to_set",},
         "consumer_key": {"default": None, "help": "twitter API consumer_key"},
         "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
         "access_token": {"default": None, "help": "twitter API access_token"},

@@ -16,35 +16,6 @@ class Gsheets(Step):
         assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
         assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
 
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "sheet": {"default": None, "help": "name of the sheet to archive"},
-            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
-            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
-            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
-            "columns": {
-                "default": {
-                    'url': 'link',
-                    'status': 'archive status',
-                    'folder': 'destination folder',
-                    'archive': 'archive location',
-                    'date': 'archive date',
-                    'thumbnail': 'thumbnail',
-                    'timestamp': 'upload timestamp',
-                    'title': 'upload title',
-                    'text': 'text content',
-                    'screenshot': 'screenshot',
-                    'hash': 'hash',
-                    'pdq_hash': 'perceptual hashes',
-                    'wacz': 'wacz',
-                    'replaywebpage': 'replaywebpage',
-                },
-                "help": "names of columns in the google sheet (stringified JSON object)",
-                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
-            },
-        }
-
     def open_sheet(self):
         if self.sheet:
             return self.gsheets_client.open(self.sheet)

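The removed columns cli_set documents the merge semantics for overriding column names from the command line: the CLI value is a stringified JSON object merged over the defaults, with CLI keys winning. A runnable illustration with made-up values:

    import json

    cur_val = {"url": "link", "status": "archive status"}       # defaults (subset)
    cli_val = '{"status": "status", "folder": "dest folder"}'   # hypothetical CLI input
    merged = dict(cur_val, **json.loads(cli_val))
    assert merged == {"url": "link", "status": "status", "folder": "dest folder"}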