kopia lustrzana https://github.com/bellingcat/auto-archiver
Separate setup() and module_setup().
rodzic
2c3d1f591f
commit
e97ccf8a73
|
@ -14,7 +14,7 @@ class BaseModule(ABC):
|
||||||
Base module class. All modules should inherit from this class.
|
Base module class. All modules should inherit from this class.
|
||||||
|
|
||||||
The exact methods a class implements will depend on the type of module it is,
|
The exact methods a class implements will depend on the type of module it is,
|
||||||
however all modules have a .setup(config: dict) method to run any setup code
|
however modules can have a .setup() method to run any setup code
|
||||||
(e.g. logging in to a site, spinning up a browser etc.)
|
(e.g. logging in to a site, spinning up a browser etc.)
|
||||||
|
|
||||||
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
||||||
|
@ -60,7 +60,7 @@ class BaseModule(ABC):
|
||||||
def storages(self) -> list:
|
def storages(self) -> list:
|
||||||
return self.config.get('storages', [])
|
return self.config.get('storages', [])
|
||||||
|
|
||||||
def setup(self, config: dict):
|
def config_setup(self, config: dict):
|
||||||
|
|
||||||
authentication = config.get('authentication', {})
|
authentication = config.get('authentication', {})
|
||||||
# extract out concatenated sites
|
# extract out concatenated sites
|
||||||
|
@ -80,7 +80,7 @@ class BaseModule(ABC):
|
||||||
for key, val in config.get(self.name, {}).items():
|
for key, val in config.get(self.name, {}).items():
|
||||||
setattr(self, key, val)
|
setattr(self, key, val)
|
||||||
|
|
||||||
def module_setup(self):
|
def setup(self):
|
||||||
# For any additional setup required by modules, e.g. authentication
|
# For any additional setup required by modules, e.g. authentication
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
|
||||||
|
|
||||||
This has all the information about the module, but does not load the module itself or its dependencies
|
This has all the information about the module, but does not load the module itself or its dependencies
|
||||||
|
|
||||||
To load an actual module, call .setup() on a laz module
|
To load an actual module, call .setup() on a lazy module
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if module_name in _LAZY_LOADED_MODULES:
|
if module_name in _LAZY_LOADED_MODULES:
|
||||||
|
@ -241,8 +241,8 @@ class LazyBaseModule:
|
||||||
# merge the default config with the user config
|
# merge the default config with the user config
|
||||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||||
config[self.name] = default_config | config.get(self.name, {})
|
config[self.name] = default_config | config.get(self.name, {})
|
||||||
instance.setup(config)
|
instance.config_setup(config)
|
||||||
instance.module_setup()
|
instance.setup()
|
||||||
return instance
|
return instance
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
|
|
@ -19,7 +19,7 @@ from auto_archiver.core import Storage
|
||||||
|
|
||||||
class GDriveStorage(Storage):
|
class GDriveStorage(Storage):
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.scopes = ['https://www.googleapis.com/auth/drive']
|
self.scopes = ['https://www.googleapis.com/auth/drive']
|
||||||
# Initialize Google Drive service
|
# Initialize Google Drive service
|
||||||
self._setup_google_drive_service()
|
self._setup_google_drive_service()
|
||||||
|
|
|
@ -21,7 +21,7 @@ from . import GWorksheet
|
||||||
|
|
||||||
class GsheetsFeeder(Feeder):
|
class GsheetsFeeder(Feeder):
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||||
# TODO mv to validators
|
# TODO mv to validators
|
||||||
assert self.sheet or self.sheet_id, (
|
assert self.sheet or self.sheet_id, (
|
||||||
|
|
|
@ -17,7 +17,7 @@ class HtmlFormatter(Formatter):
|
||||||
environment: Environment = None
|
environment: Environment = None
|
||||||
template: any = None
|
template: any = None
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
"""Sets up the Jinja2 environment and loads the template."""
|
"""Sets up the Jinja2 environment and loads the template."""
|
||||||
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
|
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
|
||||||
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
||||||
|
|
|
@ -32,7 +32,7 @@ class InstagramAPIExtractor(Extractor):
|
||||||
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
||||||
)
|
)
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
if self.api_endpoint[-1] == "/":
|
if self.api_endpoint[-1] == "/":
|
||||||
self.api_endpoint = self.api_endpoint[:-1]
|
self.api_endpoint = self.api_endpoint[:-1]
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ class InstagramExtractor(Extractor):
|
||||||
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
|
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
|
||||||
# TODO: links to stories
|
# TODO: links to stories
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
self.insta = instaloader.Instaloader(
|
self.insta = instaloader.Instaloader(
|
||||||
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||||
|
|
|
@ -27,7 +27,7 @@ class InstagramTbotExtractor(Extractor):
|
||||||
https://t.me/instagram_load_bot
|
https://t.me/instagram_load_bot
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
"""
|
"""
|
||||||
1. makes a copy of session_file that is removed in cleanup
|
1. makes a copy of session_file that is removed in cleanup
|
||||||
2. checks if the session file is valid
|
2. checks if the session file is valid
|
||||||
|
|
|
@ -13,7 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/"
|
||||||
|
|
||||||
class S3Storage(Storage):
|
class S3Storage(Storage):
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.s3 = boto3.client(
|
self.s3 = boto3.client(
|
||||||
's3',
|
's3',
|
||||||
region_name=self.region,
|
region_name=self.region,
|
||||||
|
|
|
@ -18,7 +18,7 @@ class TelethonExtractor(Extractor):
|
||||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||||
|
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
1. makes a copy of session_file that is removed in cleanup
|
1. makes a copy of session_file that is removed in cleanup
|
||||||
|
|
|
@ -15,7 +15,7 @@ class TwitterApiExtractor(Extractor):
|
||||||
|
|
||||||
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.api_index = 0
|
self.api_index = 0
|
||||||
self.apis = []
|
self.apis = []
|
||||||
if len(self.bearer_tokens):
|
if len(self.bearer_tokens):
|
||||||
|
|
|
@ -12,7 +12,7 @@ class VkExtractor(Extractor):
|
||||||
Currently only works for /wall posts
|
Currently only works for /wall posts
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
|
|
|
@ -18,7 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||||
When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
|
When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
||||||
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
||||||
|
|
|
@ -13,7 +13,7 @@ class WhisperEnricher(Enricher):
|
||||||
Only works if an S3 compatible storage is used
|
Only works if an S3 compatible storage is used
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def module_setup(self) -> None:
|
def setup(self) -> None:
|
||||||
self.stores = self.config['steps']['storages']
|
self.stores = self.config['steps']['storages']
|
||||||
self.s3 = get_module("s3_storage", self.config)
|
self.s3 = get_module("s3_storage", self.config)
|
||||||
if not "s3_storage" in self.stores:
|
if not "s3_storage" in self.stores:
|
||||||
|
|
Ładowanie…
Reference in New Issue