auto-archiver/configs/config.py

276 wiersze
13 KiB
Python

import argparse, yaml, json
import gspread
from loguru import logger
from selenium import webdriver
from dataclasses import asdict
from selenium.common.exceptions import TimeoutException
from utils import GWorksheet, getattr_or
from .wayback_config import WaybackConfig
from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from .twitter_api_config import TwitterApiConfig
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
class Config:
    """
    Controls the current execution parameters and manages API configurations.

    Usage:
        c = Config()  # initializes the argument parser
        c.parse()     # parses the values and initializes the Services and API clients
        # you can then access the Services and APIs like 'c.s3_config'

    All the configurations available as cmd line options, when included, will
    override the configurations in the config.yaml file.

    Configurations are split between:
        1. "secrets" containing API keys for generating services - the raw
           section is removed from ``self.config`` once the services are built
        2. "execution" containing specific execution configurations
    """

    AVAILABLE_STORAGES = {"s3", "gd", "local"}

    def __init__(self):
        """Build the argument parser; nothing is parsed or read until `parse()`."""
        self.parser = self.get_argument_parser()
        self.folder = ""

    def parse(self):
        """Parse the command line, then load the YAML file named by --config."""
        self.args = self.parser.parse_args()
        logger.success('Command line arguments parsed successfully')
        self.config_file = self.args.config
        self.read_config_yaml()
        logger.info(f'APIs and Services initialized:\n{self}')

    def read_config_yaml(self):
        """
        Read `self.config_file` and populate execution settings and service configs.

        Command line values (from `self.args`) take precedence over the YAML
        values. NOTE(review): `getattr_or` is defined in `utils`; it is assumed
        to return the attribute when it exists and is not None, else the default.

        Raises:
            AssertionError: if no sheet is configured, or the selected storage
                has no matching entry under "secrets".
        """
        with open(self.config_file, "r", encoding="utf-8") as inf:
            # safe_load returns None for an empty document; normalise to a dict
            self.config = yaml.safe_load(inf) or {}

        # ----------------------EXECUTION - execution configurations
        # `or {}` also covers a key that is present but empty (parsed as None)
        execution = self.config.get("execution") or {}

        self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
        assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"

        def ensure_set(value):
            # always returns a set of non-empty strings; accepts a single
            # string or a list/tuple/set of them, anything else is dropped
            values = value if isinstance(value, (list, tuple, set)) else [value]
            return {x for x in values if isinstance(x, str) and x}

        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))

        self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
        self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
        # store_true flags default to False, so `or` lets the YAML value apply
        self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
        if self.save_logs:
            self.set_log_files()
        self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)

        # Column names come from config and can be overwritten by CMD
        # in the end all are considered as lower case
        config_column_names = execution.get("column_names", {})
        self.column_names = {}
        for k, default_name in GWorksheet.COLUMN_NAMES.items():
            self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, default_name)).lower()

        # selenium driver
        selenium_configs = execution.get("selenium", {})
        self.selenium_config = SeleniumConfig(
            timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
            window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
            window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
        )
        # string placeholder until recreate_webdriver() attaches a real driver
        self.webdriver = "not initialized"

        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets") or {}

        # assert selected storage credentials exist
        for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
            assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"

        # google sheets config
        self.gsheets_client = gspread.service_account(
            filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
        )

        # facebook config
        self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)

        # s3 config
        if "s3" in secrets:
            s3 = secrets["s3"]
            self.s3_config = S3Config(
                bucket=s3["bucket"],
                region=s3["region"],
                key=s3["key"],
                secret=s3["secret"],
                endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
                cdn_url=s3.get("cdn_url", S3Config.cdn_url),
                key_path=s3.get("key_path", S3Config.key_path),
                # argparse stores '--s3-private' as 's3_private'; a set flag
                # wins, otherwise the YAML value (or class default) applies
                private=getattr(self.args, "s3_private", False) or s3.get("private", S3Config.private)
            )

        # GDrive config
        if "google_drive" in secrets:
            gd = secrets["google_drive"]
            self.gd_config = GDConfig(
                root_folder_id=gd.get("root_folder_id"),
                service_account=gd.get("service_account", GDConfig.service_account)
            )

        if "local" in secrets:
            self.local_config = LocalConfig(
                save_to=secrets["local"].get("save_to", LocalConfig.save_to),
            )

        # wayback machine config
        if "wayback" in secrets:
            self.wayback_config = WaybackConfig(
                key=secrets["wayback"]["key"],
                secret=secrets["wayback"]["secret"],
            )
        else:
            self.wayback_config = None
            logger.debug(f"'wayback' key not present in the {self.config_file=}")

        # telethon config
        if "telegram" in secrets:
            self.telegram_config = TelethonConfig(
                api_id=secrets["telegram"]["api_id"],
                api_hash=secrets["telegram"]["api_hash"],
                bot_token=secrets["telegram"].get("bot_token", None)
            )
        else:
            self.telegram_config = None
            logger.debug(f"'telegram' key not present in the {self.config_file=}")

        # twitter config
        if "twitter" in secrets:
            self.twitter_config = TwitterApiConfig(
                bearer_token=secrets["twitter"].get("bearer_token"),
                consumer_key=secrets["twitter"].get("consumer_key"),
                consumer_secret=secrets["twitter"].get("consumer_secret"),
                access_token=secrets["twitter"].get("access_token"),
                access_secret=secrets["twitter"].get("access_secret"),
            )
        else:
            self.twitter_config = None
            logger.debug(f"'twitter' key not present in the {self.config_file=}")

        # vk config
        if "vk" in secrets:
            self.vk_config = VkConfig(
                username=secrets["vk"]["username"],
                password=secrets["vk"]["password"]
            )
        else:
            self.vk_config = None
            logger.debug(f"'vk' key not present in the {self.config_file=}")

        # delete to prevent leaks; pop() tolerates a YAML without "secrets"
        self.config.pop("secrets", None)

    def set_log_files(self):
        """Register one log file sink per level under logs/."""
        # called only when config.execution.save_logs=true
        logger.add("logs/1trace.log", level="TRACE")
        logger.add("logs/2info.log", level="INFO")
        logger.add("logs/3success.log", level="SUCCESS")
        logger.add("logs/4warning.log", level="WARNING")
        logger.add("logs/5error.log", level="ERROR")

    def get_argument_parser(self):
        """
        Creates the CMD line arguments. 'python auto_archive.py --help'

        Returns:
            argparse.ArgumentParser with the high-level options plus one
            '--col-<name>' option per entry in GWorksheet.COLUMN_NAMES.
        """
        parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')

        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
        # sorted() keeps the choices listed in --help in a stable order
        parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=sorted(Config.AVAILABLE_STORAGES))
        parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
        parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
        parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [execution.check_if_exists]')
        parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [execution.save_logs]')
        parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')

        for k, v in GWorksheet.COLUMN_NAMES.items():
            # "url" and "folder" are read from the sheet, the rest are written
            if k in ["url", "folder"]:
                help_text = f"the name of the column to READ {k} FROM (default='{v}')"
            else:
                help_text = f"the name of the column to FILL WITH {k} (default='{v}')"
            parser.add_argument(f'--col-{k}', action='store', dest=k, help=help_text)

        return parser

    def set_folder(self, folder):
        """
        update the folder in each of the storages

        Sets `self.folder` and the `folder` attribute of every storage
        config/storage object that currently exists on this instance.
        """
        self.folder = folder
        for attr in ("s3_config", "s3_storage",
                     "gd_config", "gd_storage",
                     "local_config", "local_storage"):
            if hasattr(self, attr):
                getattr(self, attr).folder = folder

    def get_storage(self):
        """
        returns the configured type of storage, creating if needed

        The storage object is created on first use and cached on the instance,
        so repeated calls return the same object.

        Raises:
            ValueError: if `self.storage` is not one of AVAILABLE_STORAGES.
        """
        if self.storage == "s3":
            if getattr(self, "s3_storage", None) is None:
                self.s3_storage = S3Storage(self.s3_config)
            return self.s3_storage
        elif self.storage == "gd":
            if getattr(self, "gd_storage", None) is None:
                self.gd_storage = GDStorage(self.gd_config)
            return self.gd_storage
        elif self.storage == "local":
            if getattr(self, "local_storage", None) is None:
                self.local_storage = LocalStorage(self.local_config)
            return self.local_storage
        # a bare string cannot be raised in Python 3; raise a real exception
        raise ValueError(f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}")

    def _has_webdriver(self):
        # True only when a real driver object is attached, i.e. the attribute
        # exists and is neither None nor the string placeholder
        driver = getattr(self, "webdriver", None)
        return driver is not None and not isinstance(driver, str)

    def destroy_webdriver(self):
        """Close and quit the current webdriver, if any; safe to call repeatedly."""
        if self._has_webdriver():
            self.webdriver.close()
            self.webdriver.quit()
            del self.webdriver

    def recreate_webdriver(self):
        """
        Start a new headless Firefox driver and swap it in for the current one.

        The previous driver is destroyed only after the new one was created.
        A TimeoutException is logged and not re-raised.
        """
        options = webdriver.FirefoxOptions()
        options.headless = True
        options.set_preference('network.protocol-handler.external.tg', False)
        try:
            new_webdriver = webdriver.Firefox(options=options)
            # only destroy if creation is successful
            self.destroy_webdriver()
            self.webdriver = new_webdriver
            self.webdriver.set_window_size(self.selenium_config.window_width,
                                           self.selenium_config.window_height)
            self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

    def __str__(self) -> str:
        """JSON summary of the settings; services are reported as booleans only."""
        return json.dumps({
            "config_file": self.config_file,
            "sheet": self.sheet,
            "worksheet_allow": list(self.worksheet_allow),
            "worksheet_block": list(self.worksheet_block),
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            # the string placeholder does not count as a live driver
            "selenium_webdriver": self._has_webdriver(),
            "s3_config": hasattr(self, "s3_config"),
            "s3_private": getattr(getattr(self, "s3_config", None), "private", None),
            "gd_config": hasattr(self, "gd_config"),
            "local_config": hasattr(self, "local_config"),
            "wayback_config": self.wayback_config is not None,
            "telegram_config": self.telegram_config is not None,
            "twitter_config": self.twitter_config is not None,
            "vk_config": self.vk_config is not None,
            "gsheets_client": self.gsheets_client is not None,
            "column_names": self.column_names,
        }, ensure_ascii=False, indent=4)