Merge branch 'load_modules' into timestamping_rewrite

pull/224/head
Patrick Robertson 2025-02-11 15:21:31 +00:00
commit 7bb4d68a22
122 changed files with 3281 additions and 1011 deletions

108
poetry.lock (generated)

@ -1025,7 +1025,7 @@ version = "0.7.3"
description = "Python logging made (stupidly) simple"
optional = false
python-versions = "<4.0,>=3.5"
groups = ["main"]
groups = ["main", "dev"]
files = [
{file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"},
{file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"},
@ -1750,6 +1750,24 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-loguru"
version = "0.4.0"
description = "Pytest Loguru"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pytest_loguru-0.4.0-py3-none-any.whl", hash = "sha256:3cc7b9c6b22cb158209ccbabf0d678dacd3f3c7497d6f46f1c338c13bee1ac77"},
{file = "pytest_loguru-0.4.0.tar.gz", hash = "sha256:0d9e4e72ae9bfd92f774c666e7353766af11b0b78edd59c290e89be116050f03"},
]
[package.dependencies]
loguru = "*"
[package.extras]
test = ["pytest", "pytest-cov"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -1818,7 +1836,7 @@ version = "6.0.2"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
groups = ["main", "docs"]
groups = ["docs"]
files = [
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@ -2086,6 +2104,82 @@ files = [
[package.dependencies]
pyasn1 = ">=0.1.3"
[[package]]
name = "ruamel-yaml"
version = "0.18.10"
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"},
{file = "ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"},
]
[package.dependencies]
"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""}
[package.extras]
docs = ["mercurial (>5.7)", "ryd"]
jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"]
[[package]]
name = "ruamel-yaml-clib"
version = "0.2.12"
description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml"
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_python_implementation == \"CPython\""
files = [
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
]
[[package]]
name = "s3transfer"
version = "0.11.2"
@ -2956,7 +3050,7 @@ version = "1.2.0"
description = "A small Python utility to set file creation time on Windows"
optional = false
python-versions = ">=3.5"
groups = ["main"]
groups = ["main", "dev"]
markers = "sys_platform == \"win32\""
files = [
{file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"},
@ -2983,14 +3077,14 @@ h11 = ">=0.9.0,<1"
[[package]]
name = "yt-dlp"
version = "2025.1.12"
version = "2025.1.26"
description = "A feature-rich command-line audio/video downloader"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"},
{file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"},
{file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"},
{file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"},
]
[package.extras]
@ -3006,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "d1af74e7fc7c919eda55dd383208edab906508353b4a9eff8e979967484823f8"
content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0"


@ -37,7 +37,6 @@ dependencies = [
"pdqhash (>=0.0.0)",
"pillow (>=0.0.0)",
"python-slugify (>=0.0.0)",
"pyyaml (>=0.0.0)",
"dateparser (>=0.0.0)",
"python-twitter-v2 (>=0.0.0)",
"instaloader (>=0.0.0)",
@ -47,7 +46,7 @@ dependencies = [
"cryptography (>=41.0.0,<42.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (==2025.1.12)",
"yt-dlp (>=2025.1.26,<2026.0.0)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
@ -58,11 +57,13 @@ dependencies = [
"tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)",
"rich-argparse (>=1.6.0,<2.0.0)",
"ruamel-yaml (>=0.18.10,<0.19.0)",
]
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
autopep8 = "^2.3.1"
pytest-loguru = "^0.4.0"
[tool.poetry.group.docs.dependencies]
sphinx = "^8.1.3"


@ -12,7 +12,7 @@ from googleapiclient.errors import HttpError
# Code below from https://developers.google.com/drive/api/quickstart/python
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
SCOPES = ['https://www.googleapis.com/auth/drive']
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
@click.command(
@ -23,7 +23,7 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
"-c",
type=click.Path(exists=True),
help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials",
required=True
required=True,
)
@click.option(
"--token",
@ -31,59 +31,62 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
type=click.Path(exists=False),
default="gd-token.json",
help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json",
required=True
required=True,
)
def main(credentials, token):
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first time.
creds = None
if os.path.exists(token):
with open(token, 'r') as stream:
with open(token, "r") as stream:
creds_json = json.load(stream)
# creds = Credentials.from_authorized_user_file(creds_json, SCOPES)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds_json["refresh_token"] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
print('Requesting new token')
print("Requesting new token")
creds.refresh(Request())
else:
print('First run through so putting up login dialog')
print("First run through so putting up login dialog")
# credentials.json downloaded from https://console.cloud.google.com/apis/credentials
flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES)
creds = flow.run_local_server(port=55192)
# Save the credentials for the next run
with open(token, 'w') as token:
print('Saving new token')
with open(token, "w") as token:
print("Saving new token")
token.write(creds.to_json())
else:
print('Token valid')
print("Token valid")
try:
service = build('drive', 'v3', credentials=creds)
service = build("drive", "v3", credentials=creds)
# About the user
results = service.about().get(fields="*").execute()
emailAddress = results['user']['emailAddress']
emailAddress = results["user"]["emailAddress"]
print(emailAddress)
# Call the Drive v3 API and return some files
results = service.files().list(
pageSize=10, fields="nextPageToken, files(id, name)").execute()
items = results.get('files', [])
results = (
service.files()
.list(pageSize=10, fields="nextPageToken, files(id, name)")
.execute()
)
items = results.get("files", [])
if not items:
print('No files found.')
print("No files found.")
return
print('Files:')
print("Files:")
for item in items:
print(u'{0} ({1})'.format(item['name'], item['id']))
print("{0} ({1})".format(item["name"], item["id"]))
except HttpError as error:
print(f'An error occurred: {error}')
print(f"An error occurred: {error}")
if __name__ == '__main__':
if __name__ == "__main__":
main()


@ -0,0 +1,29 @@
"""
This script is used to create a new session file for the Telegram client.
To do this you must first create a Telegram application at https://my.telegram.org/apps
And store your id and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH.
Create a .env file, or add the following to your environment :
```
export TELEGRAM_API_ID=[YOUR_ID_HERE]
export TELEGRAM_API_HASH=[YOUR_HASH_HERE]
```
Then run this script to create a new session file.
You will need to provide your phone number and a 2FA code the first time you run this script.
"""
import os
from telethon.sync import TelegramClient
from loguru import logger
# Create a new Telegram session using the API credentials
API_ID = os.getenv("TELEGRAM_API_ID")
API_HASH = os.getenv("TELEGRAM_API_HASH")
SESSION_FILE = "secrets/anon-insta"
os.makedirs("secrets", exist_ok=True)
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
logger.success(f"New session file created: {SESSION_FILE}.session")


@ -1,8 +1,9 @@
""" Entry point for the auto_archiver package. """
from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run()
ArchivingOrchestrator().run(sys.argv[1:])
if __name__ == "__main__":
main()


@ -4,7 +4,6 @@
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator


@ -0,0 +1,146 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from loguru import logger
class BaseModule(ABC):
"""
Base module class. All modules should inherit from this class.
The exact methods a class implements will depend on the type of module it is,
however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API keys or installing additional software
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
@property
def storages(self) -> list:
return self.config.get('storages', [])
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
config = deepcopy(config)
authentication = deepcopy(config.pop('authentication', {}))
self.authentication = authentication
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
def setup(self):
# For any additional setup required by modules, e.g. authentication
pass
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
"""
Returns the authentication information for a given site. This is used to authenticate
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling can speed up processing if you don't actually need the cookie jar)
Currently, the dict can have keys of the following types:
- username: str - the username to use for login
- password: str - the password to use for login
- api_key: str - the API key to use for login
- api_secret: str - the API secret to use for login
- cookie: str - a cookie string to use for login (specific to this site)
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
site = UrlUtil.domain_for_url(site)
# add the 'www' version of the site to the list of sites to check
authdict = {}
for to_try in [site, f"www.{site}"]:
if to_try in self.authentication:
authdict.update(self.authentication[to_try])
break
# do a fuzzy string match just to print a warning - don't use it since it's insecure
if not authdict:
for key in self.authentication.keys():
if key in site or site in key:
logger.debug(f"Could not find exact authentication information for site '{site}'. \
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")
def get_ytdlp_cookiejar(args):
import yt_dlp
from yt_dlp import parse_options
logger.debug(f"Extracting cookies from settings: {args[1]}")
# parse_options returns a named tuple as follows, we only need the ydl_options part
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
# get the cookies jar, prefer the browser cookies than the file
if 'cookies_from_browser' in self.authentication:
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
elif 'cookies_file' in self.authentication:
authdict['cookies_file'] = self.authentication['cookies_file']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
return authdict
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"


@ -11,20 +11,39 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger
from copy import deepcopy
from .module import MODULE_TYPES
from .module import BaseModule
from typing import Any, List, Type, Tuple
yaml = YAML()
_yaml: YAML = YAML()
EMPTY_CONFIG = yaml.load("""
EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
"""
# Global configuration
# Authentication
# a dictionary of authentication information that can be used by extractors to log in to websites.
# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
# Common login 'types' are username/password, cookie, api key/token.
# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
# Some Examples:
# facebook.com:
# username: "my_username"
# password: "my_password"
# or for a site that uses an API key:
# twitter.com,x.com:
# api_key
# api_secret
# youtube.com:
# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
authentication: {}
# These are the global configurations that are used by the modules
logging:
@ -48,6 +67,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
"""
for action in self._actions:
if not namespace or action.dest not in namespace:
# for actions that are required and already have a default value, remove the 'required' check
if action.required and action.default is not None:
action.required = False
if action.default is not None:
try:
self._check_value(action, action.default)
@ -120,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
config = None
try:
with open(yaml_filename, "r", encoding="utf-8") as inf:
config = yaml.load(inf)
config = _yaml.load(inf)
except FileNotFoundError:
pass
@ -132,12 +155,9 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
# TODO: make this tidier/find a way to notify of which keys should not be stored
def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
for key1, key2 in do_not_store_keys:
if key1 in config_to_save and key2 in config_to_save[key1]:
del config_to_save[key1][key2]
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)
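
config.py now keeps a private `_yaml = YAML()` instance; ruamel.yaml's default round-trip mode is what lets `read_yaml`/`store_yaml` re-save a user's config without losing comments or key order. A minimal, self-contained sketch of that round-trip behaviour (the sample YAML is illustrative):

```
from io import StringIO
from ruamel.yaml import YAML

yaml = YAML()  # round-trip mode by default: comments and ordering survive a load/dump cycle

source = """\
# Global configuration
steps:
  feeders: []   # which feeders to run
"""

config = yaml.load(source)      # returns a CommentedMap, not a plain dict
config["steps"]["feeders"] = ["cli_feeder"]

out = StringIO()
yaml.dump(config, out)
print(out.getvalue())           # the '# Global configuration' comment is still present
```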


@ -1,64 +0,0 @@
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
This singleton class allows for:
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
- Marking certain values to persist across resets using `keep_on_reset`.
- Managing temporary directories and other shared data used during the archiving process.
### Key Features:
- Creates a single global instance.
- Reset functionality allows for clearing configurations, with options for partial or full resets.
- Custom getters and setters for commonly used context values like temporary directories.
"""
class ArchivingContext:
"""
Singleton context class for managing global configurations and temporary data.
ArchivingContext._get_instance() to retrieve it if needed
otherwise just
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
reset(full_reset=True) will recreate everything including the keep_on_reset status
"""
_instance = None
def __init__(self):
self.configs = {}
self.keep_on_reset = set()
@staticmethod
def get_instance():
if ArchivingContext._instance is None:
ArchivingContext._instance = ArchivingContext()
return ArchivingContext._instance
@staticmethod
def set(key, value, keep_on_reset: bool = False):
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None):
return ArchivingContext.get_instance().configs.get(key, default)
@staticmethod
def reset(full_reset: bool = False):
ac = ArchivingContext.get_instance()
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
@staticmethod
def set_tmp_dir(tmp_dir: str):
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
@staticmethod
def get_tmp_dir() -> str:
return ArchivingContext.get_instance().configs.get("tmp_dir")


@ -1,12 +1,9 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from abc import abstractmethod
from typing import Union
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Database(BaseModule):
def started(self, item: Metadata) -> None:


@ -9,11 +9,9 @@ the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from abc import abstractmethod
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Enricher(BaseModule):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""


@ -11,20 +11,23 @@ from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes, requests
import mimetypes
import requests
from loguru import logger
from retrying import retry
import re
from ..core import Metadata, ArchivingContext, BaseModule
from auto_archiver.core import Metadata, BaseModule
@dataclass
class Extractor(BaseModule):
"""
Base class for implementing extractors in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
valid_url: re.Pattern = None
def cleanup(self) -> None:
# called when extractors are done, or upon errors, cleanup any resources
pass
@ -32,13 +35,20 @@ class Extractor(BaseModule):
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url
def match_link(self, url: str) -> re.Match:
return self.valid_url.match(url)
def suitable(self, url: str) -> bool:
"""
Returns True if this extractor can handle the given URL
Should be overridden by subclasses
"""
if self.valid_url:
return self.match_link(url) is not None
return True
def _guess_file_type(self, path: str) -> str:
@ -60,7 +70,7 @@ class Extractor(BaseModule):
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
@ -85,5 +95,11 @@ class Extractor(BaseModule):
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod
def download(self, item: Metadata) -> Metadata:
def download(self, item: Metadata) -> Metadata | False:
"""
Downloads the media from the given URL and returns a Metadata object with the downloaded media.
If the URL is not supported or the download fails, this method should return False.
"""
pass
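
To make the new `valid_url`/`suitable()` contract concrete, a hedged sketch of a platform extractor; the class name, regex and exact import paths are illustrative assumptions, not a module shipped in this commit:

```
from __future__ import annotations
import re

from auto_archiver.core import Metadata
from auto_archiver.core.extractor import Extractor  # import path assumed for this sketch


class ExampleExtractor(Extractor):
    # suitable(url) in the base class returns True only when this pattern matches
    valid_url: re.Pattern = re.compile(r"https?://(www\.)?example\.com/\S+")

    def download(self, item: Metadata) -> Metadata | False:
        url = item.get_url()
        # download media into self.tmp_dir (assigned by the orchestrator before archiving),
        # attach it to `item`, and return it; return False if nothing could be downloaded
        return item
```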


@ -1,11 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from auto_archiver.core import Metadata
from auto_archiver.core import BaseModule
@dataclass
class Feeder(BaseModule):
@abstractmethod


@ -1,10 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from auto_archiver.core import Metadata, Media, BaseModule
@dataclass
class Formatter(BaseModule):
@abstractmethod


@ -11,8 +11,6 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
from .context import ArchivingContext
from loguru import logger
@ -36,12 +34,11 @@ class Media:
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
# 'Any' typing for metadata to avoid circular imports. Stores the media
# into the provided/available storages [Storage]; repeats the process for
# its properties, in case they have inner media themselves. For now it
# only goes down 1 level, but it's easy to make it recursive if needed.
storages = override_storages or ArchivingContext.get("storages")
if not len(storages):
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
return
@ -66,8 +63,9 @@ class Media:
for inner_media in prop_media.all_inner_media(include_self=True):
yield inner_media
def is_stored(self) -> bool:
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
def is_stored(self, in_storage) -> bool:
# checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value


@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
from loguru import logger
from .media import Media
from .context import ArchivingContext
@dataclass_json # annotation order matters
@dataclass
@ -32,6 +30,7 @@ class Metadata:
def __post_init__(self):
self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
self._context = {}
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@ -45,6 +44,7 @@ class Metadata:
if overwrite_left:
if right.status and len(right.status):
self.status = right.status
self._context.update(right._context)
for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata:
@ -57,12 +57,11 @@ class Metadata:
return right.merge(self)
return self
def store(self: Metadata, override_storages: List = None):
def store(self, storages=[]):
# calls .store for all contained media. storages [Storage]
self.remove_duplicate_media_by_hash()
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages, url=self.get_url(), metadata=self)
media.store(url=self.get_url(), metadata=self, storages=storages)
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val
@ -206,3 +205,10 @@ class Metadata:
if len(r.media) > len(most_complete.media): most_complete = r
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
return most_complete
def set_context(self, key: str, val: Any) -> Metadata:
self._context[key] = val
return self
def get_context(self, key: str, default: Any = None) -> Any:
return self._context.get(key, default)
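
With `ArchivingContext` removed, per-item state now travels on the `Metadata` object itself via `set_context`/`get_context`, and `merge()` carries `_context` across. A short sketch, with illustrative keys and values:

```
from auto_archiver.core import Metadata

item = Metadata().set_url("https://example.com/post/1")
item.set_context("scratch_note", "per-item state")   # stored in item._context, not in item.metadata

# later, e.g. in an enricher or database module
note = item.get_context("scratch_note", default=None)

# merge() also copies _context, so context set in one step survives into the merged result
```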


@ -7,59 +7,70 @@ from __future__ import annotations
from dataclasses import dataclass
from typing import List
from abc import ABC
import shutil
import ast
import copy
import sys
from importlib.util import find_spec
import os
from os.path import join, dirname
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
_LAZY_LOADED_MODULES = {}
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
_DEFAULT_MANIFEST = {
'name': '',
'author': 'Bellingcat',
'type': [],
'requires_setup': True,
'description': '',
'dependencies': {},
'entry_point': '',
'version': '1.0',
'configs': {}
}
class BaseModule(ABC):
config: dict
name: str
def setup_paths(paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
def setup(self, config: dict):
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule:
# sort based on the length of the path, so that the longest path comes first in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return get_module_lazy(module_name).load(config)
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .load() on the lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0]
_LAZY_LOADED_MODULES[module_name] = module
return module
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
@ -67,10 +78,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
default_path = [join(dirname(dirname((__file__))), "modules")]
all_modules = []
for module_folder in default_path + additional_paths:
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
@ -85,8 +95,13 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
all_modules.append(LazyBaseModule(possible_module, possible_module_path))
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
@ -97,8 +112,14 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
@dataclass
class LazyBaseModule:
"""
A lazy module class, which only loads the manifest and does not load the module itself.
This is useful for getting information about a module without actually loading it.
"""
name: str
display_name: str
type: list
description: str
path: str
@ -129,6 +150,10 @@ class LazyBaseModule:
@property
def requires_setup(self) -> bool:
return self.manifest['requires_setup']
@property
def display_name(self) -> str:
return self.manifest['name']
@property
def manifest(self) -> dict:
@ -136,7 +161,7 @@ class LazyBaseModule:
return self._manifest
# print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f:
try:
@ -145,7 +170,6 @@ class LazyBaseModule:
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
self._manifest = manifest
self.display_name = manifest['name']
self.type = manifest['type']
self._entry_point = manifest['entry_point']
self.description = manifest['description']
@ -153,7 +177,7 @@ class LazyBaseModule:
return manifest
def load(self) -> BaseModule:
def load(self, config) -> BaseModule:
if self._instance:
return self._instance
@ -161,11 +185,31 @@ class LazyBaseModule:
# check external dependencies are installed
def check_deps(deps, check):
for dep in deps:
if not len(dep):
# clear out any empty strings that a user may have erroneously added
continue
if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1)
check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep))
def check_python_dep(dep):
# first check if it's a module:
try:
m = get_module_lazy(dep, suppress_warnings=True)
try:
# we must now load this module and set it up with the config
m.load(config)
return True
except:
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
return False
except IndexError:
# not a module, continue
pass
return find_spec(dep)
check_deps(self.dependencies.get('python', []), check_python_dep)
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
@ -184,9 +228,8 @@ class LazyBaseModule:
sub_qualname = f'{qualname}.{file_name}'
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance
instance = getattr(sys.modules[sub_qualname], class_name)()
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
@ -194,6 +237,12 @@ class LazyBaseModule:
instance.display_name = self.display_name
self._instance = instance
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)
instance.setup()
return instance
def __repr__(self):
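
A hedged sketch of how the reworked loading functions fit together: `setup_paths` registers extra module directories, `get_module_lazy` reads only the manifest, and `get_module` imports, instantiates and configures the class. The module name, path and config values are hypothetical:

```
from auto_archiver.core.module import setup_paths, get_module_lazy, get_module

# make any extra module directories importable under auto_archiver.modules
setup_paths(["./my_modules"])                    # hypothetical path

lazy = get_module_lazy("example_module")         # manifest only: cheap, nothing imported yet
print(lazy.display_name, lazy.requires_setup)

config = {"example_module": {"timeout": 30}}     # user values merged over the manifest defaults
instance = get_module("example_module", config)  # imports, instantiates, runs config_setup() and setup()
```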


@ -5,30 +5,61 @@
"""
from __future__ import annotations
from typing import Generator, Union, List
from typing import Generator, Union, List, Type
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback
from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module
from . import validators
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
import tempfile, traceback
from loguru import logger
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
setattr(namespace, self.dest, json.loads(values))
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string, treat it as a path to a JSON/YAML file
try:
with open(auth_dict, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
except:
pass
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
setattr(namespace, self.dest, auth_dict)
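
The two JSON-parsing actions above turn a JSON string (or, for authentication, a path to a JSON/YAML file) into a dict at parse time. A standalone sketch of the underlying argparse pattern, deliberately independent of the project's classes:

```
import argparse
import json


class JsonAction(argparse.Action):
    """Parse a JSON string passed on the command line into a Python object."""
    def __call__(self, parser, namespace, values, option_string=None):
        try:
            setattr(namespace, self.dest, json.loads(values))
        except json.JSONDecodeError as e:
            raise argparse.ArgumentTypeError(f"Invalid JSON for '{self.dest}': {e}")


parser = argparse.ArgumentParser()
parser.add_argument("--authentication", action=JsonAction, default={})
args = parser.parse_args(['--authentication', '{"example.com": {"username": "u", "password": "p"}}'])
print(args.authentication["example.com"]["username"])   # prints: u
```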
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
@ -39,10 +70,16 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator:
_do_not_store_keys = []
feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]]
databases: List[Type[Database]]
storages: List[Type[Storage]]
formatters: List[Type[Formatter]]
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
add_help=False,
description="""
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
@ -51,14 +88,16 @@ class ArchivingOrchestrator:
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
# override the default 'help' so we can inject all the configs and show those
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
self.basic_parser = parser
return parser
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
parser = DefaultValidatingParser(
@ -76,18 +115,22 @@ class ArchivingOrchestrator:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(config.get(f"{module_type}s", []))
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
for module_type in MODULE_TYPES:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser)
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
for module_type in module.type:
@ -115,7 +158,7 @@ class ArchivingOrchestrator:
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
store_yaml(self.config, basic_config.config_file)
return self.config
@ -123,28 +166,37 @@ class ArchivingOrchestrator:
if not parser:
parser = self.parser
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
default={},
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
# additional modules
parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None):
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
module: LazyBaseModule
for module in modules:
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
@ -153,54 +205,54 @@ class ArchivingOrchestrator:
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items():
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
do_not_store = kwargs.pop('do_not_store', False)
if do_not_store:
self._do_not_store_keys.append((module.name, name))
if not kwargs.get('metavar', None):
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
kwargs['metavar'] = name.upper()
if kwargs.get('required', False):
# required args shouldn't have a 'default' value, remove it
kwargs.pop('default', None)
kwargs.pop('cli_set', None)
should_store = kwargs.pop('should_store', False)
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
try:
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
except AttributeError:
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
except KeyError:
kwargs['type'] = getattr(validators, kwargs['type'])
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
arg.should_store = should_store
def show_help(self):
def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help
# add configs as arg parser arguments
self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser)
self.basic_parser.print_help()
exit()
self.basic_parser.exit()
def setup_logging(self):
# setup loguru logging
logger.remove() # remove the default logger
logger.remove(0) # remove the default logger
logging_config = self.config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self):
def install_modules(self, modules_by_type):
"""
Swaps out the previous 'strings' in the config with the actual modules
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
are loaded, the program will exit with an error message.
"""
invalid_modules = []
for module_type in MODULE_TYPES:
for module_type in BaseModule.MODULE_TYPES:
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
modules_to_load = modules_by_type[f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
def check_steps_ok():
if not len(step_items):
@ -214,14 +266,37 @@ class ArchivingOrchestrator:
exit()
for module in modules_to_load:
if module == 'cli_feeder':
# pseudo module, don't load it
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
pseudo_module = type('CLIFeeder', (Feeder,), {
'name': 'cli_feeder',
'display_name': 'CLI Feeder',
'__iter__': feed
})()
pseudo_module.__iter__ = feed
step_items.append(pseudo_module)
continue
if module in invalid_modules:
continue
loaded_module: BaseModule = get_module(module).load()
try:
loaded_module.setup(self.config)
loaded_module: BaseModule = get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
if module_type == 'extractor':
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
loaded_module.cleanup()
exit()
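
The `cli_feeder` branch above builds a one-off Feeder subclass at runtime with `type()`, so URLs passed on the command line behave like any other feeder. A minimal standalone illustration of that dynamic-class pattern (the stand-in `Feeder` class and URLs are illustrative, not the project's code):

```
from typing import Generator


class Feeder:
    """Stand-in for the project's Feeder base class."""


def feed(self) -> Generator[str, None, None]:
    for url in ["https://example.com/a", "https://example.com/b"]:
        yield url


# type(name, bases, namespace) creates a class on the fly; instantiating it gives a pseudo feeder
PseudoFeeder = type("CLIFeeder", (Feeder,), {"name": "cli_feeder", "__iter__": feed})
for url in PseudoFeeder():
    print(url)
```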
@ -230,59 +305,58 @@ class ArchivingOrchestrator:
continue
if loaded_module:
step_items.append(loaded_module)
# TODO temp solution
if module_type == "storage":
ArchivingContext.set("storages", step_items, keep_on_reset=True)
check_steps_ok()
self.config['steps'][f"{module_type}s"] = step_items
setattr(self, f"{module_type}s", step_items)
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
self.config['steps'][f"{module_type}s"] = step_items
return read_yaml(config_file)
def run(self) -> None:
def run(self, args: list) -> None:
self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args)
# load the config file to get the list of enabled items
basic_config, unused_args = self.basic_parser.parse_known_args()
# setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths)
# if help flag was called, then show the help
if basic_config.help:
self.show_help()
self.show_help(basic_config)
# load the config file
yaml_config = {}
if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
yaml_config = read_yaml(basic_config.config_file)
yaml_config = self.load_config(basic_config.config_file)
self.setup_complete_parser(basic_config, yaml_config, unused_args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules()
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
for module_type in BaseModule.MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
for item in self.feed():
for _ in self.feed():
pass
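A hypothetical invocation of the reworked `run()` flow, assuming the package exposes an `auto-archiver` console entry point (the `--config` flag and positional URLs follow the code and the CLI Feeder manifest; the command name itself is an assumption):
```sh
auto-archiver --config orchestration.yaml https://example.com/some-post
```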
def cleanup(self)->None:
logger.info("Cleaning up")
for e in self.config['steps']['extractors']:
for e in self.extractors:
e.cleanup()
def feed(self) -> Generator[Metadata]:
for feeder in self.config['steps']['feeders']:
url_count = 0
for feeder in self.feeders:
for item in feeder:
yield self.feed_item(item)
url_count += 1
logger.success(f"Processed {url_count} URL(s)")
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:
@ -291,22 +365,33 @@ class ArchivingOrchestrator:
- catches keyboard interruptions to do a clean exit
- catches any unexpected error, logs it, and does a clean exit
"""
tmp_dir: TemporaryDirectory = None
try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
tmp_dir = TemporaryDirectory(dir="./")
# set tmp_dir on all modules
for m in self.all_modules:
m.tmp_dir = tmp_dir.name
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
for d in self.config['steps']['databases']: d.aborted(item)
for d in self.databases:
d.aborted(item)
self.cleanup()
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.config['steps']['databases']:
if type(e) == AssertionError: d.failed(item, str(e))
else: d.failed(item, reason="unexpected error")
for d in self.databases:
if type(e) == AssertionError:
d.failed(item, str(e))
else:
d.failed(item, reason="unexpected error")
finally:
if tmp_dir:
# remove the tmp_dir from all modules
for m in self.all_modules:
m.tmp_dir = None
tmp_dir.cleanup()
def archive(self, result: Metadata) -> Union[Metadata, None]:
@ -319,31 +404,38 @@ class ArchivingOrchestrator:
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
"""
original_url = result.get_url().strip()
self.assert_valid_url(original_url)
try:
self.assert_valid_url(original_url)
except AssertionError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
for a in self.extractors:
url = a.sanitize_url(url)
result.set_url(url)
if original_url != url: result.set("original_url", original_url)
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
cached_result = None
for d in self.config["steps"]["databases"]:
for d in self.databases:
d.started(result)
if (local_result := d.fetch(result)):
cached_result = (cached_result or Metadata()).merge(local_result)
if local_result := d.fetch(result):
cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
if cached_result:
logger.debug("Found previously archived entry")
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call extractors until one succeeds
for a in self.config["steps"]["extractors"]:
for a in self.extractors:
logger.info(f"Trying extractor {a.name} for {url}")
try:
result.merge(a.download(result))
@ -352,24 +444,25 @@ class ArchivingOrchestrator:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.config["steps"]["enrichers"]:
for e in self.enrichers:
try: e.enrich(result)
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
result.store()
result.store(storages=self.storages)
# 6 - format and store formatted if needed
if final_media := self.config["steps"]["formatters"][0].format(result):
final_media.store(url=url, metadata=result)
final_media: Media
if final_media := self.formatters[0].format(result):
final_media.store(url=url, metadata=result, storages=self.storages)
result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases and archivers
for d in self.config["steps"]["databases"]:
for d in self.databases:
try: d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
@ -394,4 +487,11 @@ class ArchivingOrchestrator:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# Helper Properties
@property
def all_modules(self) -> List[Type[BaseModule]]:
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
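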
@ -1,25 +1,23 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from typing import IO, Optional
from typing import IO
import os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from loguru import logger
from slugify import slugify
from auto_archiver.utils.misc import random_str
@dataclass
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
if media.is_stored():
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
if media.is_stored(in_storage=self):
logger.debug(f"{media.key} already stored, skipping")
return
self.set_key(media, url)
self.set_key(media, url, metadata)
self.upload(media, metadata=metadata)
media.add_url(self.get_cdn_url(media))
@ -30,34 +28,35 @@ class Storage(BaseModule):
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url) -> None:
def set_key(self, media: Media, url, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = ArchivingContext.get("folder", "")
folder = metadata.get_context('folder', '')
filename, ext = os.path.splitext(media.filename)
# Handle path_generator logic
path_generator = ArchivingContext.get("path_generator", "url")
path_generator = self.config.get("path_generator", "url")
if path_generator == "flat":
path = ""
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url":
path = slugify(url)
elif path_generator == "random":
path = ArchivingContext.get("random_path", random_str(24), True)
path = self.config.get("random_path", random_str(24), True)
else:
raise ValueError(f"Invalid path_generator: {path_generator}")
# Handle filename_generator logic
filename_generator = ArchivingContext.get("filename_generator", "random")
filename_generator = self.config.get("filename_generator", "random")
if filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:
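As a side note on the `path_generator` / `filename_generator` options read here, a hedged example of how a storage module might be configured (option names and choices come from the storage manifests in this PR; the `gdrive_storage` key is just one example):
```yaml
gdrive_storage:
  path_generator: url         # one of: flat, url, random
  filename_generator: static  # one of: random, static (hash-based)
```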
@ -1,7 +1,19 @@
# used as validators for config values.
# used as validators for config values. Should raise an exception if the value is invalid.
from pathlib import Path
import argparse
def example_validator(value):
return "example" in value
if "example" not in value:
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
return value
def positive_number(value):
return value > 0
if value < 0:
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
return value
def valid_file(value):
if not Path(value).is_file():
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
return value
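For context, a hypothetical manifest excerpt showing how a config option can point at one of these validators by name, mirroring the `"type": "valid_file"` usage in the CSV Feeder manifest later in this diff (the lookup mechanism is assumed):
```python
# hypothetical __manifest__.py excerpt
{
    "configs": {
        "files": {
            "required": True,
            "type": "valid_file",  # resolved to validators.valid_file when parsing args
            "nargs": "+",
            "help": "input file(s), one URL per line",
        },
    },
}
```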
@ -1 +1 @@
from api_db import AAApiDb
from .api_db import AAApiDb
@ -1,28 +1,49 @@
{
"name": "Auto-Archiver API Database",
"type": ["database"],
"entry_point": "api_db:AAApiDb",
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
"loguru"],
"dependencies": {
"python": ["requests", "loguru"],
},
"configs": {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
"tags": {"default": [], "help": "what tags to add to the archived URL",}
"api_endpoint": {
"required": True,
"help": "API endpoint where calls are made to",
},
"api_token": {"default": None,
"help": "API Bearer token."},
"public": {
"default": False,
"type": "bool",
"help": "whether the URL should be publicly available via the API",
},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {
"default": None,
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,
"type": "bool",
"help": "when set, will send the results to the API database.",
},
"tags": {
"default": [],
"help": "what tags to add to the archived URL",
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
- **Optional Storage**: Archives results conditionally based on configuration.
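A hedged example of the corresponding orchestration config for this module (keys mirror the manifest above; the endpoint URL and token are placeholders):
```yaml
api_db:
  api_endpoint: https://aa-api.example.com   # required
  api_token: "<bearer token>"
  public: false
  use_api_cache: true
  store_results: true
  tags: [my-project]
```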
@ -1,5 +1,7 @@
from typing import Union
import requests, os
import os
import requests
from loguru import logger
from auto_archiver.core import Database
@ -7,27 +9,17 @@ from auto_archiver.core import Metadata
class AAApiDb(Database):
"""
Connects to auto-archiver-api instance
"""
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint")
"""Connects to auto-archiver-api instance"""
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item.
Helps avoid re-archiving the same URL multiple times.
"""
if not self.allow_rearchive: return
if not self.use_api_cache: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
if response.status_code == 200:
if len(response.json()):
@ -38,21 +30,26 @@ class AAApiDb(Database):
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
if not self.store_results: return
if cached:
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
payload = {
'author_id': self.author_id,
'url': item.get_url(),
'public': self.public,
'group_id': self.group_id,
'tags': list(self.tags),
'result': item.to_json(),
}
headers = {"Authorization": f"Bearer {self.api_token}"}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
if response.status_code == 200:
if response.status_code == 201:
logger.success(f"AA API: {response.json()}")
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
@ -1 +0,0 @@
from .atlos import AtlosStorage
@ -1,40 +0,0 @@
{
"name": "atlos_storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {"python": ["loguru", "requests"], "bin": [""]},
"configs": {
"path_generator": {
"default": "url",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
},
"filename_generator": {
"default": "random",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
},
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str",
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str",
},
},
"description": """
AtlosStorage: A storage module for saving media files to the Atlos platform.
### Features
- Uploads media files to Atlos using Atlos-specific APIs.
- Automatically calculates SHA-256 hashes of media files for integrity verification.
- Skips uploads for files that already exist on Atlos with the same hash.
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
- Provides CDN-like URLs for accessing uploaded media.
### Notes
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
""",
}
@ -1,9 +1,9 @@
{
"name": "Atlos Database",
"type": ["database"],
"entry_point": "atlos_db:AtlosDb",
"entry_point": "atlos_db::AtlosDb",
"requires_setup": True,
"external_dependencies":
"dependencies":
{"python": ["loguru",
""],
"bin": [""]},
@ -1,14 +1,10 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options
class AtlosDb(Database):
@ -2,14 +2,14 @@
"name": "Atlos Feeder",
"type": ["feeder"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"api_token": {
"default": None,
"type": "str",
"required": True,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
@ -1,19 +1,12 @@
from loguru import logger
import requests
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options
from auto_archiver.core import Metadata
class AtlosFeeder(Feeder):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
if type(self.api_token) != str:
raise Exception("Atlos Feeder did not receive an Atlos API token")
def __iter__(self) -> Metadata:
# Get all the urls from the Atlos API
count = 0
@ -47,5 +40,3 @@ class AtlosFeeder(Feeder):
if len(data["results"]) == 0 or cursor is None:
break
logger.success(f"Processed {count} URL(s)")
@ -1,12 +1,12 @@
import os
from typing import IO, List, Optional
from loguru import logger
import requests
import hashlib
import os
from typing import IO, Optional
import requests
from loguru import logger
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):
@ -1 +0,0 @@
from .cli_feeder import CLIFeeder
@ -1,27 +0,0 @@
{
"name": "CLI Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
"required": True,
"do_not_store": True,
"metavar": "INPUT URLS",
},
},
"description": """
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
### Features
- Takes a single URL or a list of URLs provided via the command line.
- Converts each URL into a `Metadata` object and yields it for processing.
- Ensures URLs are processed only if they are explicitly provided.
"""
}
@ -1,15 +0,0 @@
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")
@ -2,7 +2,7 @@
"name": "Console Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """
@ -2,7 +2,7 @@
"name": "CSV Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {"python": ["loguru"]
"dependencies": {"python": ["loguru"]
},
'entry_point': 'csv_db::CSVDb',
"configs": {
@ -2,7 +2,7 @@
"name": "CSV Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": [""]
},
@ -13,6 +13,9 @@
"default": None,
"help": "Path to the input file(s) to read the URLs from, comma separated. \
Input files should be formatted with one URL per line",
"required": True,
"type": "valid_file",
"nargs": "+",
},
"column": {
"default": None,
@ -26,9 +29,9 @@
- Supports reading URLs from multiple input files, specified as a comma-separated list.
- Allows specifying the column number or name to extract URLs from.
- Skips header rows if the first value is not a valid URL.
- Integrates with the `ArchivingContext` to manage URL feeding.
### Setup
- Input files should be formatted with one URL per line.
### Setup
- Input files should be formatted with one URL per line, with or without a header row.
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
"""
}
@ -2,24 +2,37 @@ from loguru import logger
import csv
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
column = None
def __iter__(self) -> Metadata:
url_column = self.column or 0
for file in self.files:
with open(file, "r") as f:
reader = csv.reader(f)
first_row = next(reader)
if not(url_or_none(first_row[url_column])):
# it's a header row, skip it
url_column = self.column or 0
if isinstance(url_column, str):
try:
url_column = first_row.index(url_column)
except ValueError:
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
return
elif not(url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already
logger.debug(f"Skipping header row: {first_row}")
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
else:
# first row isn't a header row, rewind the file
f.seek(0)
logger.success(f"Processed {len(self.urls)} URL(s)")
for row in reader:
if not url_or_none(row[url_column]):
logger.warning(f"Not a valid URL in row: {row}, skipping")
continue
url = row[url_column]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
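To illustrate the header handling above, a hypothetical input file and matching config (the file name and header are made up; `files` and `column` follow the manifest and the attributes used in this code):
```yaml
# urls.csv:
#   link,notes
#   https://example.com/post/1,first item
#   https://example.com/post/2,second item
csv_feeder:
  files: [urls.csv]
  column: link   # a header name; a numeric index also works when there is no header
```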
@ -1,14 +1,14 @@
{
"name": "Google Drive Storage",
"type": ["storage"],
"author": "Dave Mateer",
"entry_point": "gdrive_storage::GDriveStorage",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"google-api-python-client",
"google-auth",
"google-auth-oauthlib",
"google-auth-httplib2"
"googleapiclient",
"google",
],
},
"configs": {
@ -18,17 +18,23 @@
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"root_folder_id": {"required": True,
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None,
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
},
"description": """
GDriveStorage: A storage module for saving archived content to Google Drive.
Author: Dave Mateer (and maintained by: )
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
### Features
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
- Supports OAuth token-based authentication or service account credentials for API access.
@ -39,5 +45,55 @@
- Requires setup with either a Google OAuth token or a service account JSON file.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
- Automatically handles Google Drive API token refreshes for long-running jobs.
"""
## Overview
This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
## Features
- Saves files to Google Drive, organizing them into structured folders.
- Supports both **service account** and **OAuth token** authentication.
- Automatically creates folders if they don't exist.
- Generates public URLs for easy file sharing.
## Setup Guide
1. **Enable Google Drive API**
- Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
- Enable the **Google Drive API**.
2. **Set Up a Google Drive Folder**
- Create a folder in **Google Drive** and copy its **folder ID** from the URL.
- Add the **folder ID** to your configuration (`orchestration.yaml`):
```yaml
root_folder_id: "FOLDER_ID"
```
3. **Authentication Options**
- **Option 1: Service Account (Recommended)**
- Create a **service account** in Google Cloud IAM.
- Download the JSON key file and save it as:
```
secrets/service_account.json
```
- **Share your Drive folder** with the service account's `client_email` (found in the JSON file).
- **Option 2: OAuth Token (User Authentication)**
- Create OAuth **Desktop App credentials** in Google Cloud.
- Save the credentials as:
```
secrets/oauth_credentials.json
```
- Generate an OAuth token by running:
```sh
python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
```
Notes on the OAuth token:
Tokens are refreshed after 1 hour but keep working for 7 days (tbc),
so as long as the job doesn't run for longer than 7 days, this method of refreshing only once per run will work.
See this link for details on the token:
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
}
@ -1,68 +1,67 @@
import shutil, os, time, json
import json
import os
import time
from typing import IO
from loguru import logger
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.auth.transport.requests import Request
from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
class GDriveStorage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.scopes = ['https://www.googleapis.com/auth/drive']
# Initialize Google Drive service
self._setup_google_drive_service()
SCOPES = ['https://www.googleapis.com/auth/drive']
if self.oauth_token is not None:
"""
Tokens are refreshed after 1 hour
however keep working for 7 days (tbc)
so as long as the job doesn't last for 7 days
then this method of refreshing only once per run will work
see this link for details on the token
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
logger.debug(f'Using GD OAuth token {self.oauth_token}')
# workaround for missing 'refresh_token' in from_authorized_user_file
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
# creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
logger.debug('Requesting new GD OAuth token')
creds.refresh(Request())
else:
raise Exception("Problem with creds - create the token again")
# Save the credentials for the next run
with open(self.oauth_token, 'w') as token:
logger.debug('Saving new GD OAuth token')
token.write(creds.to_json())
else:
logger.debug('GD OAuth Token valid')
def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials."""
if self.oauth_token:
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token()
elif self.service_account:
logger.debug(f"Using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account()
else:
gd_service_account = self.service_account
logger.debug(f'Using GD Service Account {gd_service_account}')
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
self.service = build('drive', 'v3', credentials=creds)
def _initialize_with_oauth_token(self):
"""Initialize Google Drive service with OAuth token."""
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request())
with open(self.oauth_token, 'w') as token_file:
logger.debug("Saving refreshed OAuth token.")
token_file.write(creds.to_json())
elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.")
return build('drive', 'v3', credentials=creds)
def _initialize_with_service_account(self):
"""Initialize Google Drive service with service account."""
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
return build('drive', 'v3', credentials=creds)
def get_cdn_url(self, media: Media) -> str:
"""
only support files saved in a folder for GD
S3 supports folder and all stored in the root
"""
# full_name = os.path.join(self.folder, media.key)
parent_id, folder_id = self.root_folder_id, None
path_parts = media.key.split(os.path.sep)
@ -71,13 +70,16 @@ class GDriveStorage(Storage):
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id
# get id of file inside folder (or sub folder)
file_id = self._get_id_from_parent_and_name(folder_id, filename)
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
if not file_id:
#
logger.info(f"file {filename} not found in folder {folder_id}")
return None
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
"""
1. for each sub-folder in the path check if exists or create
2. upload file to root_id/other_paths.../filename
@ -105,7 +107,13 @@ class GDriveStorage(Storage):
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
def _get_id_from_parent_and_name(self, parent_id: str,
name: str,
retries: int = 1,
sleep_seconds: int = 10,
use_mime_type: bool = False,
raise_on_missing: bool = True,
use_cache=False):
"""
Retrieves the id of a folder or file from its @name and the @parent_id folder
Optionally does multiple @retries and sleeps @sleep_seconds between them
@ -168,8 +176,3 @@ class GDriveStorage(Storage):
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
return gd_folder.get('id')
# def exists(self, key):
# try:
# self.get_cdn_url(key)
# return True
# except: return False
@ -20,6 +20,7 @@ the broader archiving framework.
- Retrieves metadata like titles, descriptions, upload dates, and durations.
- Downloads subtitles and comments when enabled.
- Configurable options for handling live streams, proxies, and more.
- Supports authentication for websites using the 'authentication' settings from your orchestration config.
### Dropins
- For websites supported by `yt-dlp` that also contain posts in addition to videos
@ -29,10 +30,6 @@ custom dropins can be created to handle additional websites and passed to the ar
via the command line using the `--dropins` option (TODO!).
""",
"configs": {
"facebook_cookie": {
"default": None,
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
},
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
"comments": {
"default": False,
@ -67,14 +64,5 @@ via the command line using the `--dropins` option (TODO!).
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"cookies_from_browser": {
"default": None,
"type": "str",
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
},
"cookie_file": {
"default": None,
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
},
},
}
@ -23,19 +23,8 @@ class Bluesky(GenericDropin):
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
# handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
# return ie_instance._extract_post(handle=handle, post_id=video_id)
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
return ie_instance._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 0,
'parentHeight': 0,
})['thread']['post']
return ie_instance._extract_post(handle=handle, post_id=video_id)
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
"""
@ -0,0 +1,17 @@
from .dropin import GenericDropin
class Facebook(GenericDropin):
def extract_post(self, url: str, ie_instance):
video_id = ie_instance._match_valid_url(url).group('id')
ie_instance._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
return post_data
def create_metadata(self, post: dict, ie_instance, archiver, url):
metadata = archiver.create_metadata(url)
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
return metadata
@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.core.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
@ -266,19 +266,30 @@ class GenericExtractor(Extractor):
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
"live_from_start": self.live_from_start, "proxy": self.proxy,
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/password -> api_key -> cookie -> cookie_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
ydl_options['username'] = auth['username']
ydl_options['password'] = auth['password']
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
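A sketch of what the `authentication` settings consumed by `auth_for_site` might look like, inferred only from the keys checked above (the per-site layout and exact schema are assumptions, not confirmed by this diff):
```yaml
authentication:
  twitter.com:
    username: my_user
    password: my_password
  facebook.com:
    cookie: "datr=xxxx"
  youtube.com:
    cookies_file: secrets/youtube_cookies.txt
```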
@ -5,7 +5,7 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
@ -3,8 +3,8 @@
"type": ["database"],
"entry_point": "gsheet_db::GsheetsDb",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"allow_worksheets": {
@ -17,6 +17,7 @@
},
"use_sheet_names_in_stored_paths": {
"default": True,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
},
@ -1,39 +1,38 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.utils.misc import get_current_timestamp
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason:str) -> None:
def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, f'Archive failed {reason}')
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
self._safe_status_update(item, '')
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check if the given item has been archived already"""
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item)
@ -45,23 +44,25 @@ class GsheetsDb(Database):
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
try:
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
cell_updates.append((row, "status", status_message))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
batch_if_valid("archive", "\n".join(media.urls))
batch_if_valid("date", True, get_current_timestamp())
batch_if_valid("title", item.get_title())
batch_if_valid("text", item.get("content", ""))
batch_if_valid("timestamp", item.get_timestamp())
if media:
batch_if_valid("hash", media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []
@ -70,34 +71,44 @@ class GsheetsDb(Database):
if pdq := m.get("pdq_hash"):
pdq_hashes.append(pdq)
if len(pdq_hashes):
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
screenshot, "urls"
):
batch_if_valid("screenshot", "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):
if thumbnail := item.get_first_image("thumbnail"):
if hasattr(thumbnail, "urls"):
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
if (browsertrix := item.get_media_by_id("browsertrix")):
batch_if_valid('wacz', "\n".join(browsertrix.urls))
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
if browsertrix := item.get_media_by_id("browsertrix"):
batch_if_valid("wacz", "\n".join(browsertrix.urls))
batch_if_valid(
"replaywebpage",
"\n".join(
[
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
for wacz in browsertrix.urls
]
),
)
gw.batch_set_cell(cell_updates)
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', new_status)
gw.set_cell(row, "status", new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
if gsheet := ArchivingContext.get("gsheet"):
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
print(self.sheet_id)
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
return gw, row
@ -3,8 +3,8 @@
"type": ["feeder"],
"entry_point": "gsheet_feeder::GsheetsFeeder",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"},
@ -15,14 +15,13 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from . import GWorksheet
class GsheetsFeeder(Feeder):
def setup(self, config: dict):
super().setup(config)
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
@ -37,43 +36,48 @@ class GsheetsFeeder(Feeder):
def __iter__(self) -> Metadata:
sh = self.open_sheet()
for ii, wks in enumerate(sh.worksheets()):
if not self.should_process_sheet(wks.title):
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
continue
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.success(f'Finished worksheet {worksheet.title}')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
# All checks done - archival process starts here
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
else:
ArchivingContext.set("folder", folder, True)
# All checks done - archival process starts here
m = Metadata().set_url(url)
self._set_context(m, gw, row)
yield m
yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
# TODO: Check folder value not being recognised
m.set_context("gsheet", {"row": row, "worksheet": gw})
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
else:
m.set_context("folder", folder)
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
@ -2,7 +2,7 @@
"name": "Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {
@ -11,7 +11,8 @@ import hashlib
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils.misc import calculate_file_hash
class HashEnricher(Enricher):
@ -19,16 +20,6 @@ class HashEnricher(Enricher):
Calculates hashes for Media instances
"""
def __init__(self, config: dict = None):
"""
Initialize the HashEnricher with a configuration dictionary.
"""
super().__init__()
# TODO set these from the manifest?
# Set default values
self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256"
self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@ -39,15 +30,10 @@ class HashEnricher(Enricher):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename) -> str:
hash = None
hash_algo = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256()
hash_algo = hashlib.sha256
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
hash_algo = hashlib.sha3_512
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
return calculate_file_hash(filename, hash_algo, self.chunksize)
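A sketch of the `calculate_file_hash` helper the enricher now delegates to, reconstructed from the inline loop removed above (the actual signature in `auto_archiver.utils.misc` is assumed):
```python
# assumed helper; mirrors the chunked-read loop previously inlined in the enricher
def calculate_file_hash(filename: str, hash_algo, chunksize: int) -> str:
    hasher = hash_algo()
    with open(filename, "rb") as f:
        while True:
            buf = f.read(chunksize)
            if not buf:
                break
            hasher.update(buf)
    return hasher.hexdigest()
```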
@ -2,8 +2,8 @@
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"dependencies": {
"python": ["hash_enricher", "loguru", "jinja2"],
"bin": [""]
},
"configs": {
@ -1,5 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
@ -8,20 +7,18 @@ import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
@dataclass
class HtmlFormatter(Formatter):
environment: Environment = None
template: any = None
def setup(self, config: dict) -> None:
def setup(self) -> None:
"""Sets up the Jinja2 environment and loads the template."""
super().setup(config) # Ensure the base class logic is executed
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
@ -48,12 +45,13 @@ class HtmlFormatter(Formatter):
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")
@ -1,7 +1,8 @@
{
"name": "Instagram API Extractor",
"type": ["extractor"],
"external_dependencies":
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
"dependencies":
{"python": ["requests",
"loguru",
"retrying",
@ -9,24 +10,31 @@
},
"requires_setup": True,
"configs": {
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"access_token": {"default": None,
"help": "a valid instagrapi-api token"},
"api_endpoint": {"required": True,
"help": "API endpoint to use"},
"full_profile": {
"default": False,
"type": "bool",
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
},
"full_profile_max_posts": {
"default": 0,
"type": "int",
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
},
"minimize_json_output": {
"default": True,
"type": "bool",
"help": "if true, will remove empty values from the json output",
},
},
"description": """
Archives various types of Instagram content using the Instagrapi API.
Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
### Features
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
- Supports advanced configuration options, including:
@ -28,20 +28,14 @@ class InstagramAPIExtractor(Extractor):
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
global_pattern = re.compile(
valid_url = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
def setup(self) -> None:
if self.api_endpoint[-1] == "/":
self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
self.minimize_json_output = bool(self.minimize_json_output)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
@ -49,7 +43,7 @@ class InstagramAPIExtractor(Extractor):
url.replace("instagr.com", "instagram.com").replace(
"instagr.am", "instagram.com"
)
insta_matches = self.global_pattern.findall(url)
insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return
@ -1,7 +1,7 @@
{
"name": "Instagram Extractor",
"type": ["extractor"],
"external_dependencies": {
"dependencies": {
"python": [
"instaloader",
"loguru",
@ -9,9 +9,10 @@
},
"requires_setup": True,
"configs": {
"username": {"default": None, "help": "a valid Instagram username"},
"username": {"required": True,
"help": "a valid Instagram username"},
"password": {
"default": None,
"required": True,
"help": "the corresponding Instagram account password",
},
"download_folder": {
@ -25,9 +26,11 @@
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts. Authentication is required via username/password or a session file.
"description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts.
Authentication is required via username/password or a session file.
""",
}
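A hypothetical sketch of configuring this extractor programmatically, following the `get_module(module_name, {module_name: config})` pattern used in the updated test conftest at the end of this diff (all values are made up):
```python
from auto_archiver.core.module import get_module

config = {
    "username": "my_insta_user",
    "password": "my_insta_password",
    "download_folder": "instaloader",
}
extractor = get_module("instagram_extractor", {"instagram_extractor": config})
```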


@ -4,7 +4,7 @@
"""
import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
import instaloader
from loguru import logger
from auto_archiver.core import Extractor
@ -16,19 +16,17 @@ class InstagramExtractor(Extractor):
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/"
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
# TODO: links to stories
def __init__(self, config: dict) -> None:
super().__init__(config)
# TODO: refactor how configuration validation is done
self.assert_valid_string("username")
self.assert_valid_string("password")
self.assert_valid_string("download_folder")
self.assert_valid_string("session_file")
def setup(self) -> None:
self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
)
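For reference, a minimal sketch (not part of this diff; the example URLs are made up) of how the composed `post_pattern` and `profile_pattern` above behave when `valid_url` is the raw pattern string interpolated by the `.format()` calls:
```python
import re

valid_url = r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/"
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))

post_pattern.findall("https://www.instagram.com/p/C1a2B3c4D5e/")   # ['C1a2B3c4D5e']
profile_pattern.findall("https://instagram.com/bellingcat")        # ['bellingcat']
```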


@ -1,15 +1,16 @@
{
"name": "Instagram Telegram Bot Extractor",
"type": ["extractor"],
"external_dependencies": {"python": ["loguru",
"telethon",],
"dependencies": {"python": ["loguru", "telethon",],
},
"requires_setup": True,
"configs": {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
"timeout": {"default": 45,
"type": "int",
"help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
@ -28,6 +29,12 @@ returned as part of a `Metadata` object.
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.
- The session file is created automatically and should be unique for each instance.
- You may need to enter your Telegram credentials (phone number) and a 2FA code sent to you the first time you run the extractor:
```
2025-01-30 00:43:49.348 | INFO | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login...
Please enter your phone (or bot token): +447123456789
Please enter the code you received: 00000
Signed in successfully as E C; remember to not break the ToS or you will risk an account ban!
```
""",
}


@ -16,7 +16,7 @@ from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@ -33,17 +33,30 @@ class InstagramTbotExtractor(Extractor):
2. checks if the session file is valid
"""
logger.info(f"SETUP {self.name} checking login...")
self._prepare_session_file()
self._initialize_telegram_client()
# make a copy of the session that is used exclusively with this archiver instance
def _prepare_session_file(self):
"""
Creates a copy of the session file for exclusive use with this archiver instance.
Ensures that a valid session file exists before proceeding.
"""
new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
if not os.path.exists(f"{self.session_file}.session"):
raise FileNotFoundError(f"Session file {self.session_file}.session not found.")
shutil.copy(self.session_file + ".session", new_session_file)
self.session_file = new_session_file.replace(".session", "")
def _initialize_telegram_client(self):
"""Initializes the Telegram client."""
try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
except OperationalError as e:
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
logger.error(
f"Unable to access the {self.session_file} session. "
"Ensure that you don't use the same session file here and in telethon_extractor. "
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
)
with self.client.start():
logger.success(f"SETUP {self.name} login works.")
@ -58,34 +71,51 @@ class InstagramTbotExtractor(Extractor):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
if "You must enter a URL to a post" in message:
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if message:
result.set_content(message).set_title(message[:128])
return result.success("insta-via-bot")
def _send_url_to_bot(self, url: str):
"""
Sends the URL to the 'instagram_load_bot' and returns (chat, since_id).
"""
chat = self.client.get_entity("instagram_load_bot")
since_message = self.client.send_message(entity=chat, message=url)
return chat, since_message.id
def _process_messages(self, chat, since_id, tmp_dir, result):
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
# Skip known filler message:
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
continue
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
return message.strip()


@ -2,7 +2,7 @@
"name": "Local Storage",
"type": ["storage"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {


@ -2,7 +2,7 @@
"name": "Archive Metadata Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """


@ -2,7 +2,7 @@
"name": "Media Metadata Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": ["exiftool"]
},


@ -2,7 +2,7 @@
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
},
"description": """ Default formatter.
""",


@ -1,11 +1,9 @@
from __future__ import annotations
from dataclasses import dataclass
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
@dataclass
class MuteFormatter(Formatter):
def format(self, item: Metadata) -> Media: return None


@ -2,8 +2,8 @@
"name": "PDQ Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
"dependencies": {
"python": ["loguru", "pdqhash", "numpy", "PIL"],
},
"description": """
PDQ Hash Enricher for generating perceptual hashes of media files.


@ -1 +1 @@
from .s3 import S3Storage
from .s3_storage import S3Storage


@ -2,17 +2,17 @@
"name": "S3 Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["boto3", "loguru"],
"dependencies": {
"python": ["hash_enricher", "boto3", "loguru"],
},
"configs": {
"path_generator": {
"default": "url",
"default": "flat",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
@ -20,7 +20,9 @@
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"random_no_duplicate": {"default": False,
"type": "bool",
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
@ -29,7 +31,9 @@
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
"private": {"default": False,
"type": "bool",
"help": "if true S3 files will not be readable online"},
},
"description": """
S3Storage: A storage module for saving media files to an S3-compatible object storage.
@ -45,5 +49,6 @@
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
- Depends on the `HashEnricher` module for hash calculation.
"""
}
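A minimal sketch of the `random_no_duplicate` scheme described above (an illustration under assumed names, not the module's actual code; see the storage changes below for the real implementation): the file's SHA-256 prefix becomes the folder, so identical content maps to the same path and a second upload can be skipped.
```python
import hashlib
import os
import uuid

NO_DUPLICATES_FOLDER = "no-dups/"

def no_duplicate_key(filename: str) -> str:
    # hash the file in chunks; a prefix of the digest becomes the folder name
    sha256 = hashlib.sha256()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(16_000_000), b""):
            sha256.update(chunk)
    folder = os.path.join(NO_DUPLICATES_FOLDER, sha256.hexdigest()[:24])
    _, ext = os.path.splitext(filename)
    # identical content -> identical folder, so an existing key there means "skip upload"
    return os.path.join(folder, f"{uuid.uuid4().hex[:24]}{ext}")
```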


@ -1,19 +1,19 @@
from typing import IO
import boto3, os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
import boto3
import os
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.s3 = boto3.client(
's3',
region_name=self.region,
@ -21,7 +21,6 @@ class S3Storage(Storage):
aws_access_key_id=self.key,
aws_secret_access_key=self.secret
)
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@ -41,15 +40,13 @@ class S3Storage(Storage):
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
@ -61,8 +58,7 @@ class S3Storage(Storage):
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
# checks if path exists and is not an empty folder
if not path.endswith('/'):


@ -2,7 +2,7 @@
"name": "Screenshot Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["chromedriver"]
},


@ -6,8 +6,8 @@ from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
@ -19,15 +19,17 @@ class ScreenshotEnricher(Enricher):
return
logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))


@ -2,8 +2,8 @@
"name": "SSL Certificate Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "python-slugify"],
"dependencies": {
"python": ["loguru", "slugify"],
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {


@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class SSLEnricher(Enricher):
@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
with open(cert_fn, "w") as f: f.write(cert)
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")


@ -2,7 +2,7 @@
"name": "Telegram Extractor",
"type": ["extractor"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": [
"requests",
"bs4",
@ -13,7 +13,7 @@
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.
is advised for more comprehensive functionality and higher-quality media extraction.
### Features
- Extracts images and videos from public Telegram message links (`t.me`).


@ -1 +1 @@
from .telethon_extractor import TelethonArchiver
from .telethon_extractor import TelethonExtractor


@ -2,7 +2,7 @@
"name": "telethon_extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["telethon",
"loguru",
"tqdm",


@ -6,19 +6,20 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
import re, time, json, os
import re, time, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
class TelethonExtractor(Extractor):
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
def setup(self) -> None:
"""
1. makes a copy of session_file that is removed in cleanup
2. trigger login process for telegram or proceed if already saved in a session file
@ -92,7 +93,7 @@ class TelethonArchiver(Extractor):
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
match = self.valid_url.search(url)
logger.debug(f"TELETHON: {match=}")
if not match: return False
@ -120,7 +121,7 @@ class TelethonArchiver(Extractor):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message


@ -2,8 +2,8 @@
"name": "Thumbnail Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "ffmpeg-python"],
"dependencies": {
"python": ["loguru", "ffmpeg"],
"bin": ["ffmpeg"]
},
"configs": {


@ -10,7 +10,7 @@ import ffmpeg, os
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.utils.misc import random_str
@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
duration = m.get("duration")


@ -2,7 +2,7 @@
"name": "Timestamping Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"slugify",


@ -10,8 +10,7 @@ from asn1crypto.core import Asn1Value
import certifi
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class TimestampingEnricher(Enricher):
"""
@ -33,7 +32,7 @@ class TimestampingEnricher(Enricher):
logger.warning(f"No hashes found in {url=}")
return
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
data_to_sign = "\n".join(hashes)
@ -102,9 +101,9 @@ class TimestampingEnricher(Enricher):
cert_chain = []
for cert in path:
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
with open(cert_fn, "wb") as f:
f.write(cert.dump())
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
return cert_chain
return cert_chain


@ -2,7 +2,7 @@
"name": "Twitter API Extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["requests",
"loguru",
"pytwitter",


@ -9,14 +9,13 @@ from pytwitter import Api
from slugify import slugify
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata,Media
from auto_archiver.core import Metadata, Media
class TwitterApiExtractor(Extractor):
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self, config: dict) -> None:
super().setup(config)
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self) -> None:
self.api_index = 0
self.apis = []
if len(self.bearer_tokens):
@ -54,7 +53,7 @@ class TwitterApiExtractor(Extractor):
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
matches = self.valid_url.findall(url)
if not len(matches): return False, False
username, tweet_id = matches[0] # only one URL supported


@ -3,15 +3,19 @@
"type": ["extractor"],
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
"python": ["loguru",
"vk_url_scraper"],
"dependencies": {
"python": ["loguru", "vk_url_scraper"],
},
"configs": {
"username": {"default": None, "help": "valid VKontakte username"},
"password": {"default": None, "help": "valid VKontakte password"},
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
"username": {"required": True,
"help": "valid VKontakte username"},
"password": {"required": True,
"help": "valid VKontakte password"},
"session_file": {
"default": "secrets/vk_config.v2.json",
"help": "valid VKontakte password",
},
},
"description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
@ -31,6 +35,5 @@ To use the `VkArchiver`, you must provide valid VKontakte login credentials and
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
"""
,
""",
}


@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
@ -12,10 +12,7 @@ class VkExtractor(Extractor):
Currently only works for /wall posts
"""
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("username")
self.assert_valid_string("password")
def setup(self) -> None:
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
def download(self, item: Metadata) -> Metadata:
@ -37,7 +34,7 @@ class VkExtractor(Extractor):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))


@ -1,8 +1,9 @@
{
"name": "WACZ Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"jsonlines",
@ -25,6 +26,7 @@
},
"description": """
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
### Features
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
@ -33,7 +35,7 @@
- Generates metadata from the archived page's content and structure (e.g., titles, text).
### Notes
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
- Requires Docker for running `browsertrix-crawler`.
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
"""
}


@ -5,9 +5,9 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
from auto_archiver.utils import url as UrlUtil, random_str
class WaczExtractorEnricher(Enricher, Extractor):
@ -19,6 +19,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
"""
def setup(self) -> None:
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
@ -49,7 +50,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
@ -152,7 +153,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)


@ -1 +0,0 @@
from .wayback_enricher import WaybackExtractorEnricher


@ -1,30 +0,0 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
},
"entry_point": "wayback_enricher::WaybackExtractorEnricher",
"configs": {
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
"key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "required": True, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
"""
}


@ -0,0 +1 @@
from .wayback_extractor_enricher import WaybackExtractorEnricher


@ -0,0 +1,56 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"timeout": {
"default": 15,
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
},
"if_not_archived_within": {
"default": None,
"help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA",
},
"key": {
"required": True,
"help": "wayback API key. to get credentials visit https://archive.org/account/s3.php",
},
"secret": {
"required": True,
"help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php",
},
"proxy_http": {
"default": None,
"help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port",
},
"proxy_https": {
"default": None,
"help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port",
},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
### Steps to Get a Wayback API Key:
- Sign up for an account at [Internet Archive](https://archive.org/account/signup).
- Log in to your account.
- Navigate to your [account settings](https://archive.org/account), or follow https://archive.org/developers/tutorial-get-ia-credentials.html.
- Under Wayback Machine API Keys, generate a new key.
- Note down your API key and secret, as they will be required for authentication.
""",
}
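A sketch of how the key/secret configured above are typically used against the Wayback Machine "Save Page Now" API (not part of this diff; the endpoint, header format, and field names are assumptions based on the SPN2 documentation linked from `if_not_archived_within`):
```python
import requests

def save_to_wayback(url: str, key: str, secret: str, if_not_archived_within: str | None = None) -> dict:
    headers = {
        "Accept": "application/json",
        "Authorization": f"LOW {key}:{secret}",  # assumption: S3-style Internet Archive credentials
    }
    data = {"url": url}
    if if_not_archived_within:
        data["if_not_archived_within"] = if_not_archived_within
    r = requests.post("https://web.archive.org/save/", headers=headers, data=data)
    r.raise_for_status()
    return r.json()  # typically contains a job_id that can be polled for the final snapshot URL
```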


@ -3,7 +3,7 @@ from loguru import logger
import time, requests
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core import Metadata
class WaybackExtractorEnricher(Enricher, Extractor):


@ -2,15 +2,19 @@
"name": "Whisper Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
"dependencies": {
"python": ["s3_storage", "loguru", "requests"],
},
"configs": {
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"api_endpoint": {"required": True,
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"required": True,
"help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
"action": {"default": "translate",
"help": "which Whisper operation to execute",
"choices": ["transcribe", "translate", "language_detection"]},
},
"description": """
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
@ -25,6 +29,7 @@
### Notes
- Requires a Whisper API endpoint and API key for authentication.
- Only compatible with S3-compatible storage systems for media file accessibility.
- **Note**: this stores the media files in S3 prior to enriching them, as Whisper requires public URLs to access the media files.
- Handles multiple jobs and retries for failed or incomplete processing.
"""
}


@ -3,9 +3,8 @@ import requests, time
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@ -14,18 +13,25 @@ class WhisperEnricher(Enricher):
Only works if an S3 compatible storage is used
"""
def enrich(self, to_enrich: Metadata) -> None:
if not self._get_s3_storage():
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store(url=url, metadata=to_enrich)
# TODO: this used to pass all storage items to store;
# now only S3 is passed and the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)
job_results[job_id] = False
@ -53,8 +59,8 @@ class WhisperEnricher(Enricher):
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
def submit_job(self, media: Media):
s3 = self._get_s3_storage()
s3_url = s3.get_cdn_url(media)
s3_url = self.s3.get_cdn_url(media)
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
payload = {
"url": s3_url,
@ -107,10 +113,3 @@ class WhisperEnricher(Enricher):
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
return result
return False
def _get_s3_storage(self) -> S3Storage:
try:
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
except:
logger.warning("No S3Storage instance found in storages")
return


@ -2,7 +2,6 @@
# we need to explicitly expose the available imports here
from .misc import *
from .webdriver import Webdriver
from .url import UrlUtil
from .atlos import get_atlos_config_options
# handy utils from ytdlp


@ -1,53 +0,0 @@
import json, gspread
from ..core import BaseModule
class Gsheets(BaseModule):
name = "gsheets"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO: config should be responsible for conversions
try: self.header = int(self.header)
except: pass
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
# TODO merge this into gsheets processors manifest
@staticmethod
def configs() -> dict:
return {
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
"columns": {
"default": {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'timestamp': 'upload timestamp',
'title': 'upload title',
'text': 'text content',
'screenshot': 'screenshot',
'hash': 'hash',
'pdq_hash': 'perceptual hashes',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
},
"help": "names of columns in the google sheet (stringified JSON object)",
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
},
}
def open_sheet(self):
if self.sheet:
return self.gsheets_client.open(self.sheet)
else: # self.sheet_id
return self.gsheets_client.open_by_key(self.sheet_id)


@ -1,7 +1,9 @@
import os, json, requests
import os
import json
import uuid
from datetime import datetime
from datetime import datetime, timezone
import requests
import hashlib
from loguru import logger
@ -51,9 +53,52 @@ def update_nested_dict(dictionary, update_dict):
else:
dictionary[key] = value
def random_str(length: int = 32) -> str:
assert length <= 32, "length must be at most 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
return json.loads(cli_val)
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
hash = hash_algo()
with open(filename, "rb") as f:
while True:
buf = f.read(chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
def get_current_datetime_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
# parse a datetime string with option of passing a specific format
try:
return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
except ValueError as e:
logger.error(f"Unable to parse datestring {dt_str}: {e}")
return None
def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
# Consistent parsing of timestamps
# If utc=True, the timezone is set to UTC,
# if iso=True, the output is an iso string
if not ts: return
try:
if isinstance(ts, str): ts = datetime.fromisoformat(ts)
if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
if utc: ts = ts.replace(tzinfo=timezone.utc)
if iso: return ts.isoformat()
return ts
except Exception as e:
logger.error(f"Unable to parse timestamp {ts}: {e}")
return None
def get_current_timestamp() -> str:
return get_timestamp(datetime.now())
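A brief usage sketch of the helpers added above (not part of this diff; the file name and timestamps are illustrative):
```python
from auto_archiver.utils.misc import (
    calculate_file_hash, get_timestamp, get_datetime_from_str, get_current_timestamp,
)

calculate_file_hash("secrets/example.zip")        # SHA-256 hex digest, read in ~16MB chunks
get_timestamp("2025-01-30T00:43:49")              # '2025-01-30T00:43:49+00:00'
get_timestamp(1738197829, iso=False)              # timezone-aware datetime with tzinfo set to UTC
get_datetime_from_str("30-01-2025", "%d-%m-%Y")   # datetime(2025, 1, 30, 0, 0)
get_current_timestamp()                           # current UTC time as an ISO-8601 string
```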


@ -1,79 +1,84 @@
import re
from urllib.parse import urlparse, urlunparse
class UrlUtil:
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
@staticmethod
def clean(url: str) -> str: return url
AUTHWALL_URLS = [
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
]
@staticmethod
def is_auth_wall(url: str) -> bool:
"""
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
"""
if UrlUtil.telegram_private.match(url): return True
if UrlUtil.is_istagram.match(url): return True
def domain_for_url(url: str) -> str:
"""
SECURITY: parse the domain using urllib to avoid any potential security issues
"""
return urlparse(url).netloc
return False
def clean(url: str) -> str:
return url
@staticmethod
def remove_get_parameters(url: str) -> str:
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
# useful for mimetypes to work
parsed_url = urlparse(url)
new_url = urlunparse(parsed_url._replace(query=''))
return new_url
def is_auth_wall(url: str) -> bool:
"""
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
"""
for regex in AUTHWALL_URLS:
if regex.match(url):
return True
@staticmethod
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
"""
clean_url = UrlUtil.remove_get_parameters(url)
return False
# favicons
if "favicon" in url: return False
# ignore icons
if clean_url.endswith(".ico"): return False
# ignore SVGs
if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
def remove_get_parameters(url: str) -> str:
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
# useful for mimetypes to work
parsed_url = urlparse(url)
new_url = urlunparse(parsed_url._replace(query=''))
return new_url
# twitter profile pictures
if "twimg.com/profile_images" in url: return False
if "twimg.com" in url and "/default_profile_images" in url: return False
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
"""
clean_url = remove_get_parameters(url)
# instagram profile pictures
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
# instagram recurring images
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
# favicons
if "favicon" in url: return False
# ignore icons
if clean_url.endswith(".ico"): return False
# ignore SVGs
if remove_get_parameters(url).endswith(".svg"): return False
# telegram
if "https://telegram.org/img/emoji/" in url: return False
# twitter profile pictures
if "twimg.com/profile_images" in url: return False
if "twimg.com" in url and "/default_profile_images" in url: return False
# youtube
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
if "https://www.youtube.com/s/search/audio/" in url: return False
# instagram profile pictures
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
# instagram recurring images
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
# ok
if " https://ok.ru/res/i/" in url: return False
# telegram
if "https://telegram.org/img/emoji/" in url: return False
# vk
if "https://vk.com/emoji/" in url: return False
if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False
# youtube
if "https://www.youtube.com/s/gaming/emoji/" in url: return False
if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
if "https://www.youtube.com/s/search/audio/" in url: return False
# wikipedia
if "wikipedia.org/static" in url: return False
# ok
if " https://ok.ru/res/i/" in url: return False
return True
# vk
if "https://vk.com/emoji/" in url: return False
if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False
@staticmethod
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)
# wikipedia
if "wikipedia.org/static" in url: return False
return True
def twitter_best_quality_url(url: str) -> str:
"""
some twitter image URLs point to a less-than best quality
this returns the URL pointing to the highest (original) quality
"""
return re.sub(r"name=(\w+)", "name=orig", url, 1)
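A short usage sketch of the refactored module-level helpers (not part of this diff; imported under the `UrlUtil` alias used elsewhere in this changeset):
```python
from auto_archiver.utils import url as UrlUtil

UrlUtil.is_auth_wall("https://www.instagram.com/p/abc/")          # True: behind a login wall
UrlUtil.domain_for_url("https://t.me/some_channel/42")            # 't.me'
UrlUtil.remove_get_parameters("http://example.com/file.mp4?t=1")  # 'http://example.com/file.mp4'
UrlUtil.is_relevant_url("https://example.com/favicon.ico")        # False: recurring asset
UrlUtil.twitter_best_quality_url("https://pbs.twimg.com/media/abc?format=jpg&name=small")
# 'https://pbs.twimg.com/media/abc?format=jpg&name=orig'
```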


@ -9,12 +9,79 @@ from loguru import logger
from selenium.webdriver.common.by import By
import time
#import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar
class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool
cookies: str
cookiejar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies
self.cookiejar = cookiejar
self.facebook_accept_cookies = facebook_accept_cookies
def get(self, url: str):
if self.cookies or self.cookiejar:
# set up the driver to make it not 'cookie averse' (needs a context/URL)
# get the 'robots.txt' file which should be quick and easy
robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
super(CookieSettingDriver, self).get(robots_url)
if self.cookies:
# an explicit cookie is set for this site, use that first
for cookie in self.cookies.split(";"):
name, value = cookie.split("=", 1)
self.add_cookie({'name': name.strip(), 'value': value})
elif self.cookiejar:
domain = urlparse(url).netloc.lstrip("www.")
for cookie in self.cookiejar:
if domain in cookie.domain:
try:
self.add_cookie({
'name': cookie.name,
'value': cookie.value,
'path': cookie.path,
'domain': cookie.domain,
'secure': bool(cookie.secure),
'expiry': cookie.expires
})
except Exception as e:
logger.warning(f"Failed to add cookie to webdriver: {e}")
if self.facebook_accept_cookies:
try:
logger.debug(f'Trying fb click accept cookie popup.')
super(CookieSettingDriver, self).get("http://www.facebook.com")
essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
essential_only.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except Exception as e:
logger.warning(f'Failed on fb accept cookies: {e}')
# now get the actual URL
super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
if close_button:
close_button.click()
class Webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int,
facebook_accept_cookies: bool = False, http_proxy: str = "",
print_options: dict = {}, auth: dict = {}) -> webdriver:
self.width = width
self.height = height
self.timeout_seconds = timeout_seconds
self.auth = auth
self.facebook_accept_cookies = facebook_accept_cookies
self.http_proxy = http_proxy
# create and set print options
@ -23,32 +90,26 @@ class Webdriver:
setattr(self.print_options, k, v)
def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument(f'--proxy-server={self.http_proxy}')
options.set_preference('network.protocol-handler.external.tg', False)
# if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
if self.facebook_accept_cookies:
options.add_argument('--lang=en')
try:
self.driver = webdriver.Firefox(options=options)
self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
facebook_accept_cookies=self.facebook_accept_cookies, options=options)
self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
if self.facebook_accept_cookies:
try:
logger.debug(f'Trying fb click accept cookie popup.')
self.driver.get("http://www.facebook.com")
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies.')
return self.driver
def __exit__(self, exc_type, exc_val, exc_tb):
self.driver.close()
self.driver.quit()
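A hypothetical usage sketch of the updated `Webdriver` context manager with the new `auth` parameter (not part of this diff; the cookie file path, window size, and URL are illustrative):
```python
from http.cookiejar import MozillaCookieJar

from auto_archiver.utils import Webdriver

cookiejar = MozillaCookieJar("secrets/cookies.txt")
cookiejar.load(ignore_discard=True, ignore_expires=True)

# cookies from the jar whose domain matches the target URL are set before the page is loaded
with Webdriver(1280, 2300, timeout_seconds=60, auth={"cookies_jar": cookiejar}) as driver:
    driver.get("https://example.com")
    driver.save_screenshot("example.png")
```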


@ -1,6 +0,0 @@
import tempfile
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())


@ -1,7 +1,9 @@
"""
pytest conftest file, for shared fixtures and configuration
"""
import os
import pickle
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest
@ -23,13 +25,15 @@ def setup_module(request):
# if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2]
m = get_module(module_name).load()
m.name = module_name
m.setup({module_name : config})
m = get_module(module_name, {module_name: config})
# add the tmp_dir to the module
tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name
def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup()
request.addfinalizer(cleanup)
return m
@ -110,4 +114,18 @@ def pytest_runtest_setup(item):
test_name = _test_failed_incremental[cls_name].get((), None)
# if name found, test has failed for the combination of class name & test name
if test_name is not None:
pytest.xfail(f"previous test failed ({test_name})")
pytest.xfail(f"previous test failed ({test_name})")
@pytest.fixture()
def unpickle():
"""
Returns a helper function that unpickles a file
** gets the file from the test_files directory: tests/data/test_files **
"""
def _unpickle(path):
test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
with open(os.path.join(test_data_dir, path), "rb") as f:
return pickle.load(f)
return _unpickle
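A hypothetical test using the `unpickle` fixture above (the pickle file name is made up):
```python
def test_loads_pickled_metadata(unpickle):
    item = unpickle("example_metadata.pickle")  # resolved against tests/data/test_files
    assert item is not None
```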


@ -0,0 +1,2 @@
https://example.com/1/,data 1
https://example.com/2/,data 2

Some files were not shown because too many files have changed in this diff.