Mirror of https://github.com/bellingcat/auto-archiver
Merge branch 'load_modules' into add_module_tests (pull/189/head)

# Conflicts:
#	src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
commit c8cd7ea63c
@@ -15,15 +15,9 @@ from .module import BaseModule
 from typing import Any, List, Type, Tuple

-yaml: YAML = YAML()
+_yaml: YAML = YAML()

-b = yaml.load("""
-# This is a comment
-site.com,site2.com:
-  key: value
-  key2: value2
-""")
-
-EMPTY_CONFIG = yaml.load("""
+EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined

@@ -149,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
     config = None
     try:
         with open(yaml_filename, "r", encoding="utf-8") as inf:
-            config = yaml.load(inf)
+            config = _yaml.load(inf)
     except FileNotFoundError:
         pass

@@ -166,4 +160,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
     config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
-        yaml.dump(config_to_save, outf)
+        _yaml.dump(config_to_save, outf)
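The rename to a module-private `_yaml` instance is easier to follow with a round-trip example. A minimal sketch, assuming only `ruamel.yaml` (the `steps`/`feeder` keys are illustrative, not from this branch):

```python
import sys
from ruamel.yaml import YAML

_yaml: YAML = YAML()  # round-trip mode by default: comments and key order survive

doc = _yaml.load("""
# this comment is part of the loaded document
steps:
  feeder: cli_feeder  # inline comments are kept too
""")
doc["steps"]["feeder"] = "gsheet_feeder"
_yaml.dump(doc, sys.stdout)  # prints the edited YAML with both comments intact
```

Sharing one underscored instance across `EMPTY_CONFIG`, `read_yaml`, and `store_yaml` keeps that round-trip behavior consistent and signals that the instance is module-internal.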
@@ -44,6 +44,7 @@ class Metadata:
         if overwrite_left:
             if right.status and len(right.status):
                 self.status = right.status
+            self._context.update(right._context)
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:
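The one added line means a merge now carries over the right-hand side's context alongside its status and metadata. A toy sketch of those semantics with simplified stand-in fields (not the project's full `Metadata` class, whose container handling is richer):

```python
from dataclasses import dataclass, field

@dataclass
class Meta:
    status: str = ""
    metadata: dict = field(default_factory=dict)
    _context: dict = field(default_factory=dict)

    def merge(self, right: "Meta") -> "Meta":
        if right.status:
            self.status = right.status
        self._context.update(right._context)  # the behavior this hunk adds
        for k, v in right.metadata.items():
            self.metadata[k] = v  # right-hand side wins for plain values
        return self
```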
@@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter
 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule

@@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction):
             auth_dict = json.load(f)
         except json.JSONDecodeError:
             # maybe it's yaml, try that
-            auth_dict = yaml.load(f)
+            auth_dict = _yaml.load(f)
         except:
             pass

@@ -424,8 +424,8 @@ class ArchivingOrchestrator:
         cached_result = None
         for d in self.databases:
             d.started(result)
-            if (local_result := d.fetch(result)):
-                cached_result = (cached_result or Metadata()).merge(local_result)
+            if local_result := d.fetch(result):
+                cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
         if cached_result:
             logger.debug("Found previously archived entry")
             for d in self.databases:
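The substantive part of the last hunk is the trailing `.merge(result)`: because later merges overwrite earlier values, the in-flight `result` now takes precedence over whatever the database cache returned. A hedged toy illustration (not the real `Metadata` implementation):

```python
class M:
    """Tiny stand-in whose merge mirrors 'right-hand side wins'."""
    def __init__(self, **kv):
        self.kv = dict(kv)
    def merge(self, other: "M") -> "M":
        self.kv.update(other.kv)
        return self

cached = M(url="https://t.me/example/1", status="archived last year")
live = M(url="https://t.me/example/1", status="requested now")
print(M().merge(cached).merge(live).kv["status"])  # -> "requested now"
```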
@@ -1,7 +1,7 @@
 {
     "name": "Auto-Archiver API Database",
     "type": ["database"],
-    "entry_point": "api_db:AAApiDb",
+    "entry_point": "api_db::AAApiDb",
     "requires_setup": True,
     "dependencies": {
         "python": ["requests", "loguru"],

@@ -23,7 +23,7 @@
             "default": None,
             "help": "which group of users have access to the archive in case public=false as author",
         },
-        "allow_rearchive": {
+        "use_api_cache": {
             "default": True,
             "type": "bool",
             "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",

@@ -43,7 +43,7 @@
 ### Features
 - **API Integration**: Supports querying for existing archives and submitting results.
-- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
+- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
 - **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
 - **Tagging and Metadata**: Adds tags and manages metadata for archives.
 - **Optional Storage**: Archives results conditionally based on configuration.
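The `:` to `::` change in `entry_point` suggests a `<module file>::<class name>` separator (the `atlos_db` manifest below gets the same treatment). A sketch of how a loader might split it; the function, its signature, and the package prefix are assumptions for illustration, not the project's actual loader:

```python
import importlib

def load_entry_point(entry_point: str, package_prefix: str = "auto_archiver.modules"):
    """Split an assumed '<module>::<Class>' entry point and import the class."""
    module_name, class_name = entry_point.split("::")
    module = importlib.import_module(f"{package_prefix}.{module_name}.{module_name}")
    return getattr(module, class_name)

# load_entry_point("api_db::AAApiDb") would resolve the AAApiDb class
```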
@@ -15,11 +15,11 @@ class AAApiDb(Database):
         """ query the database for the existence of this item.
         Helps avoid re-archiving the same URL multiple times.
         """
-        if not self.allow_rearchive: return
+        if not self.use_api_cache: return

         params = {"url": item.get_url(), "limit": 15}
         headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
-        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
+        response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)

         if response.status_code == 200:
             if len(response.json()):

@@ -30,8 +30,7 @@ class AAApiDb(Database):
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
         return False

-
-    def done(self, item: Metadata, cached: bool=False) -> None:
+    def done(self, item: Metadata, cached: bool = False) -> None:
         """archival result ready - should be saved to DB"""
         if not self.store_results: return
         if cached:

@@ -39,12 +38,18 @@ class AAApiDb(Database):
             return
         logger.debug(f"saving archive of {item.get_url()} to the AA API.")

-        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
+        payload = {
+            'author_id': self.author_id,
+            'url': item.get_url(),
+            'public': self.public,
+            'group_id': self.group_id,
+            'tags': list(self.tags),
+            'result': item.to_json(),
+        }
         headers = {"Authorization": f"Bearer {self.api_token}"}
-        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
+        response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)

-        if response.status_code == 200:
+        if response.status_code == 201:
             logger.success(f"AA API: {response.json()}")
         else:
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
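Taken together, the `api_db` hunks move the cache lookup to `url/search`, move submission to `interop/submit-archive`, and expect `201 Created` for submissions. A self-contained sketch of the two calls; the base URL, token, and payload values are placeholders:

```python
import os
import requests

api_endpoint = "https://api.example.com"  # placeholder, not the real endpoint
headers = {"Authorization": "Bearer TOKEN", "accept": "application/json"}

# cache lookup: GET <endpoint>/url/search?url=...&limit=15 -> 200 with a JSON list
search = requests.get(os.path.join(api_endpoint, "url/search"),
                      params={"url": "https://t.me/example/1", "limit": 15},
                      headers=headers)

# submission: POST <endpoint>/interop/submit-archive -> 201 on success
submit = requests.post(os.path.join(api_endpoint, "interop/submit-archive"),
                       json={"url": "https://t.me/example/1", "result": "{}",
                             "public": False, "tags": []},
                       headers=headers)
print(search.status_code, submit.status_code)
```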
@@ -1,7 +1,7 @@
 {
     "name": "Atlos Database",
     "type": ["database"],
-    "entry_point": "atlos_db:AtlosDb",
+    "entry_point": "atlos_db::AtlosDb",
     "requires_setup": True,
     "dependencies":
         {"python": ["loguru",
@@ -109,6 +109,6 @@ class GsheetsDb(Database):
             gw: GWorksheet = gsheet.get("worksheet")
             row: int = gsheet.get("row")
         elif self.sheet_id:
-            print(self.sheet_id)
+            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")

         return gw, row
@@ -13,7 +13,7 @@
 The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
 It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
 and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
-is advised for more comprehensive functionality.
+is advised for more comprehensive functionality, and higher quality media extraction.

 ### Features
 - Extracts images and videos from public Telegram message links (`t.me`).
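To make the `Metadata`/`Media` contract described above concrete, here is a minimal hedged sketch of an extractor returning structured output for a `t.me` link; the import path and helper methods are assumptions based on the description, not verified against this branch:

```python
from auto_archiver.core import Extractor, Metadata, Media  # assumed import path

class MinimalTelegramLikeExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata | bool:
        url = item.get_url()
        if "t.me" not in url:
            return False  # not a Telegram message link: let other extractors try
        result = Metadata().set_url(url)
        # the real module would fetch the message's embedded image/video here
        result.add_media(Media(filename="downloaded_media.jpg"))
        return result.success("telegram")
```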