Merge pull request #247 from bellingcat/opentimestamps

Opentimestamps Module
pull/223/head
Patrick Robertson 2025-03-14 13:41:46 +00:00 committed by GitHub
commit 17463de937
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 879 additions and 342 deletions

66
poetry.lock (generated)
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
[[package]]
name = "accessible-pygments"
@ -51,7 +51,7 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
[package.extras]
doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"]
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"]
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\""]
trio = ["trio (>=0.26.1)"]
[[package]]
@ -94,12 +94,12 @@ files = [
]
[package.extras]
benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""]
[[package]]
name = "authlib"
@ -145,7 +145,7 @@ files = [
]
[package.extras]
dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"]
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
[[package]]
name = "beautifulsoup4"
@ -781,7 +781,7 @@ files = [
[package.extras]
docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
typing = ["typing-extensions (>=4.12.2)"]
typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""]
[[package]]
name = "future"
@ -816,7 +816,7 @@ requests = ">=2.18.0,<3.0.0.dev0"
[package.extras]
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"]
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
@ -1115,7 +1115,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
[package.extras]
dev = ["Sphinx (==8.1.3)", "build (==1.2.2)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.5.0)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.13.0)", "mypy (==v1.4.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pytest (==6.1.2)", "pytest (==8.3.2)", "pytest-cov (==2.12.1)", "pytest-cov (==5.0.0)", "pytest-cov (==6.0.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.1.0)", "sphinx-rtd-theme (==3.0.2)", "tox (==3.27.1)", "tox (==4.23.2)", "twine (==6.0.1)"]
dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.13.0) ; python_version >= \"3.8\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""]
[[package]]
name = "markdown-it-py"
@ -1429,6 +1429,22 @@ rsa = ["cryptography (>=3.0.0)"]
signals = ["blinker (>=1.4.0)"]
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]]
name = "opentimestamps"
version = "0.4.5"
description = "Create and verify OpenTimestamps proofs"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "opentimestamps-0.4.5-py3-none-any.whl", hash = "sha256:a4912b3bd1b612a3ef5fac925b9137889e6c5cb91cc9e76c8202a2bf8abe26b5"},
{file = "opentimestamps-0.4.5.tar.gz", hash = "sha256:56726ccde97fb67f336a7f237ce36808e5593c3089d68d900b1c83d0ebf9dcfa"},
]
[package.dependencies]
pycryptodomex = ">=3.3.1"
python-bitcoinlib = ">=0.9.0,<0.13.0"
[[package]]
name = "oscrypto"
version = "1.3.0"
@ -1578,7 +1594,7 @@ docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline
fpx = ["olefile"]
mic = ["olefile"]
tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"]
typing = ["typing-extensions"]
typing = ["typing-extensions ; python_version < \"3.10\""]
xmp = ["defusedxml"]
[[package]]
@ -1938,6 +1954,18 @@ pytest = ">=6.2.5"
[package.extras]
dev = ["pre-commit", "pytest-asyncio", "tox"]
[[package]]
name = "python-bitcoinlib"
version = "0.12.2"
description = "The Swiss Army Knife of the Bitcoin protocol."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "python-bitcoinlib-0.12.2.tar.gz", hash = "sha256:c65ab61427c77c38d397bfc431f71d86fd355b453a536496ec3fcb41bd10087d"},
{file = "python_bitcoinlib-0.12.2-py3-none-any.whl", hash = "sha256:2f29a9f475f21c12169b3a6cc8820f34f11362d7ff1200a5703dce3e4e903a44"},
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -2990,7 +3018,7 @@ files = [
pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""}
[package.extras]
brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
h2 = ["h2 (>=4,<5)"]
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["zstandard (>=0.18.0)"]
@ -3013,7 +3041,7 @@ h11 = ">=0.8"
typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
[package.extras]
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
standard = ["colorama (>=0.4) ; sys_platform == \"win32\"", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1) ; sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"", "watchfiles (>=0.13)", "websockets (>=10.4)"]
[[package]]
name = "virtualenv"
@ -3034,7 +3062,7 @@ platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
[[package]]
name = "vk-api"
@ -3296,7 +3324,7 @@ files = [
]
[package.extras]
dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
dev = ["black (>=19.3b0) ; python_version >= \"3.6\"", "pytest (>=4.6.2)"]
[[package]]
name = "wsproto"
@ -3327,8 +3355,8 @@ files = [
[package.extras]
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"]
curl-cffi = ["curl-cffi (==0.5.10)", "curl-cffi (>=0.5.10,!=0.6.*,<0.7.2)"]
default = ["brotli", "brotlicffi", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
curl-cffi = ["curl-cffi (==0.5.10) ; os_name == \"nt\" and implementation_name == \"cpython\"", "curl-cffi (>=0.5.10,!=0.6.*,<0.7.2) ; os_name != \"nt\" and implementation_name == \"cpython\""]
default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.9.0,<0.10.0)"]
pyinstaller = ["pyinstaller (>=6.11.1)"]
secretstorage = ["cffi", "secretstorage"]
@ -3338,4 +3366,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901"
content-hash = "beb354960b8d8af491a13e09cb565c7e3099a2b150167c16147aa0438e970018"

View file

@ -57,6 +57,7 @@ dependencies = [
"certvalidator (>=0.0.0)",
"rich-argparse (>=1.6.0,<2.0.0)",
"ruamel-yaml (>=0.18.10,<0.19.0)",
"opentimestamps (>=0.4.5,<0.5.0)",
]
[tool.poetry.group.dev.dependencies]
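The new runtime dependency above is what the OpenTimestamps enricher imports. A quick sanity-check sketch (hypothetical snippet; run it inside the project's Poetry environment, e.g. via `poetry run python`):

```python
# Sketch: confirm the newly pinned dependency resolves and exposes the pieces the enricher uses.
import opentimestamps
from opentimestamps.calendar import RemoteCalendar               # calendar client used for submissions
from opentimestamps.core.op import OpSHA256                      # hash op used to fingerprint files
from opentimestamps.core.timestamp import DetachedTimestampFile  # .ots container format

print("opentimestamps", opentimestamps.__version__)  # should satisfy >=0.4.5,<0.5.0
```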

View file

@ -1,151 +1,25 @@
{
"modules": {
"gsheet_feeder": {
"name": "gsheet_feeder",
"display_name": "Google Sheets Feeder",
"atlos_feeder_db_storage": {
"name": "atlos_feeder_db_storage",
"display_name": "Atlos Feeder Database Storage",
"manifest": {
"name": "Google Sheets Feeder",
"name": "Atlos Feeder Database Storage",
"author": "Bellingcat",
"type": [
"feeder"
"feeder",
"database",
"storage"
],
"requires_setup": true,
"description": "\n GsheetsFeeder \n A Google Sheets-based feeder for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - Customize the column names in your Google sheet using the `columns` configuration.\n ",
"dependencies": {
"python": [
"loguru",
"gspread",
"slugify"
]
},
"entry_point": "gsheet_feeder::GsheetsFeeder",
"version": "1.0",
"configs": {
"sheet": {
"default": null,
"help": "name of the sheet to archive"
},
"sheet_id": {
"default": null,
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
},
"header": {
"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)"
},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": true
},
"columns": {
"default": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage"
},
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader"
},
"allow_worksheets": {
"default": [],
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
"type": "bool"
}
}
},
"configs": {
"sheet": {
"default": null,
"help": "name of the sheet to archive"
},
"sheet_id": {
"default": null,
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
},
"header": {
"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)"
},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": true
},
"columns": {
"default": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage"
},
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader"
},
"allow_worksheets": {
"default": [],
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
"type": "bool"
}
}
},
"atlos_feeder": {
"name": "atlos_feeder",
"display_name": "Atlos Feeder",
"manifest": {
"name": "Atlos Feeder",
"author": "Bellingcat",
"type": [
"feeder"
],
"requires_setup": true,
"description": "\n AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.\n\n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Filters source materials based on visibility, processing status, and metadata.\n - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.\n - Iterates through paginated results using a cursor for efficient API interaction.\n\n ### Notes\n - Requires an Atlos API endpoint and a valid API token for authentication.\n - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.\n - Handles pagination transparently when retrieving data from the Atlos API.\n ",
"description": "\n A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media,\n \n [Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT). \n It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence.\n \n To get started create a new project and obtain an API token from the settings page. You can group event's into Atlos's 'incidents'.\n Here you can add 'source material' by URLn and the Atlos feeder will fetch these URLs for archival.\n \n You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended.\n The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uplaoded back into the relevant media item.\n \n \n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive.\n - If the storage option is selected, it will store the media files alongside the original source material item in Atlos.\n - Is the database option is selected it will output the results to the media item, as well as updating failure status with error details when archiving fails.\n - Skips Storege/ database upload for items without an Atlos ID - restricting that you must use the Atlos feeder so that it has the Atlos ID to store the results with.\n\n ### Notes\n - Requires an Atlos account with a project and a valid API token for authentication.\n - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.\n - Feches any media items within an Atlos project, regardless of separation into incidents.\n ",
"dependencies": {
"python": [
"loguru",
"requests"
]
},
"entry_point": "",
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
"version": "1.0",
"configs": {
"api_token": {
@ -222,6 +96,135 @@
}
}
},
"gsheet_feeder_db": {
"name": "gsheet_feeder_db",
"display_name": "Google Sheets Feeder Database",
"manifest": {
"name": "Google Sheets Feeder Database",
"author": "Bellingcat",
"type": [
"feeder",
"database"
],
"requires_setup": true,
"description": "\n GsheetsFeederDatabase\n A Google Sheets-based feeder and optional database for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n - Skips redundant updates for empty or invalid data fields.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - Customize the column names in your Google sheet using the `columns` configuration.\n - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.\n ",
"dependencies": {
"python": [
"loguru",
"gspread",
"slugify"
]
},
"entry_point": "gsheet_feeder_db::GsheetsFeederDB",
"version": "1.0",
"configs": {
"sheet": {
"default": null,
"help": "name of the sheet to archive"
},
"sheet_id": {
"default": null,
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
},
"header": {
"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)"
},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": true
},
"columns": {
"default": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage"
},
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader"
},
"allow_worksheets": {
"default": [],
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "(CSV) explicitly block some worksheets from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
}
}
},
"configs": {
"sheet": {
"default": null,
"help": "name of the sheet to archive"
},
"sheet_id": {
"default": null,
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
},
"header": {
"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)"
},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": true
},
"columns": {
"default": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage"
},
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader"
},
"allow_worksheets": {
"default": [],
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "(CSV) explicitly block some worksheets from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
}
}
},
"cli_feeder": {
"name": "cli_feeder",
"display_name": "Command Line Feeder",
@ -470,7 +473,7 @@
"extractor"
],
"requires_setup": true,
"description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts\n and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ",
"description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. \n \n > \u26a0\ufe0f **Warning** \n > This module is not actively maintained due to known issues with blocking. \n > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)\n \n This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ",
"dependencies": {
"python": [
"instaloader",
@ -482,38 +485,38 @@
"configs": {
"username": {
"required": true,
"help": "a valid Instagram username"
"help": "A valid Instagram username."
},
"password": {
"required": true,
"help": "the corresponding Instagram account password"
"help": "The corresponding Instagram account password."
},
"download_folder": {
"default": "instaloader",
"help": "name of a folder to temporarily download content to"
"help": "Name of a folder to temporarily download content to."
},
"session_file": {
"default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials"
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one."
}
}
},
"configs": {
"username": {
"required": true,
"help": "a valid Instagram username"
"help": "A valid Instagram username."
},
"password": {
"required": true,
"help": "the corresponding Instagram account password"
"help": "The corresponding Instagram account password."
},
"download_folder": {
"default": "instaloader",
"help": "name of a folder to temporarily download content to"
"help": "Name of a folder to temporarily download content to."
},
"session_file": {
"default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials"
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one."
}
}
},
@ -661,7 +664,7 @@
"extractor"
],
"requires_setup": false,
"description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n",
"description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n\n### Auto-Updates\n\nThe Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).\nThis can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).\nIf you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.\n\n",
"dependencies": {
"python": [
"yt_dlp",
@ -710,6 +713,11 @@
"max_downloads": {
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."
},
"ytdlp_update_interval": {
"default": 5,
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
"type": "int"
}
}
},
@ -751,9 +759,38 @@
"max_downloads": {
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."
},
"ytdlp_update_interval": {
"default": 5,
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
"type": "int"
}
}
},
"tiktok_tikwm_extractor": {
"name": "tiktok_tikwm_extractor",
"display_name": "Tiktok Tikwm Extractor",
"manifest": {
"name": "Tiktok Tikwm Extractor",
"author": "Bellingcat",
"type": [
"extractor"
],
"requires_setup": false,
"description": "\n Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/\n\t\n\tThis extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.\n\n ### Features\n - Downloads the video and, if possible, also the video cover.\n\t- Stores extra metadata about the post like author information, and more as returned by tikwm.com. \n\n ### Notes\n - If tikwm.com is down, this extractor will not work.\n\t- If tikwm.com changes their API, this extractor may break.\n\t- If no video is found, this extractor will consider the extraction failed.\n ",
"dependencies": {
"python": [
"loguru",
"requests"
],
"bin": []
},
"entry_point": "",
"version": "1.0",
"configs": {}
},
"configs": null
},
"telegram_extractor": {
"name": "telegram_extractor",
"display_name": "Telegram Extractor",
@ -1054,7 +1091,7 @@
"help": "width of the screenshots"
},
"height": {
"default": 720,
"default": 1024,
"type": "int",
"help": "height of the screenshots"
},
@ -1091,7 +1128,7 @@
"help": "width of the screenshots"
},
"height": {
"default": 720,
"default": 1024,
"type": "int",
"help": "height of the screenshots"
},
@ -1201,6 +1238,79 @@
}
}
},
"opentimestamps_enricher": {
"name": "opentimestamps_enricher",
"display_name": "OpenTimestamps Enricher",
"manifest": {
"name": "OpenTimestamps Enricher",
"author": "Bellingcat",
"type": [
"enricher"
],
"requires_setup": true,
"description": "\n Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time.\n\n Uses OpenTimestamps \u2013 a service that timestamps data using the Bitcoin blockchain, providing a decentralized \n and secure way to prove that data existed at a certain point in time.\n\n ### Features\n - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain\n - Verifies existing timestamp proofs to confirm the time a file existed\n - Uses multiple calendar servers to ensure reliability and redundancy\n - Stores timestamp proofs alongside original files for future verification\n\n ### Notes\n - Can work offline to create timestamp proofs that can be upgraded later\n - Verification checks if timestamps have been confirmed in the Bitcoin blockchain\n - Should run after files have been archived and hashed\n\n ### Verifying Timestamps Later\n If you wish to verify a timestamp (ots) file later, you can install the opentimestamps-client command line tool and use the `ots verify` command.\n Example: `ots verify my_file.ots`\n\n Note: if you're using local storage with a filename_generator set to 'static' (a hash) or random, the files will be renamed when they are saved to the\n final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`.\n ",
"dependencies": {
"python": [
"loguru",
"opentimestamps"
]
},
"entry_point": "",
"version": "1.0",
"configs": {
"use_calendars": {
"default": true,
"help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.",
"type": "bool"
},
"calendar_urls": {
"default": [
"https://alice.btc.calendar.opentimestamps.org",
"https://bob.btc.calendar.opentimestamps.org",
"https://finney.calendar.eternitywall.com"
],
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars",
"type": "list"
},
"calendar_whitelist": {
"default": [],
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
"type": "list"
},
"verify_timestamps": {
"default": true,
"help": "Whether to verify timestamps after creating them.",
"type": "bool"
}
}
},
"configs": {
"use_calendars": {
"default": true,
"help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.",
"type": "bool"
},
"calendar_urls": {
"default": [
"https://alice.btc.calendar.opentimestamps.org",
"https://bob.btc.calendar.opentimestamps.org",
"https://finney.calendar.eternitywall.com"
],
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars",
"type": "list"
},
"calendar_whitelist": {
"default": [],
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
"type": "list"
},
"verify_timestamps": {
"default": true,
"help": "Whether to verify timestamps after creating them.",
"type": "bool"
}
}
},
"thumbnail_enricher": {
"name": "thumbnail_enricher",
"display_name": "Thumbnail Enricher",
@ -1381,56 +1491,6 @@
}
}
},
"atlos_db": {
"name": "atlos_db",
"display_name": "Atlos Database",
"manifest": {
"name": "Atlos Database",
"author": "Bellingcat",
"type": [
"database"
],
"requires_setup": true,
"description": "\nHandles integration with the Atlos platform for managing archival results.\n\n### Features\n- Outputs archival results to the Atlos API for storage and tracking.\n- Updates failure status with error details when archiving fails.\n- Processes and formats metadata, including ISO formatting for datetime fields.\n- Skips processing for items without an Atlos ID.\n\n### Setup\nRequired configs:\n- atlos_url: Base URL for the Atlos API.\n- api_token: Authentication token for API access.\n",
"dependencies": {
"python": [
"loguru",
""
],
"bin": [
""
]
},
"entry_point": "atlos_db::AtlosDb",
"version": "1.0",
"configs": {
"api_token": {
"default": null,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": true,
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str"
}
}
},
"configs": {
"api_token": {
"default": null,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": true,
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str"
}
}
},
"api_db": {
"name": "api_db",
"display_name": "Auto Archiver API Database",
@ -1473,9 +1533,9 @@
"help": "which group of users have access to the archive in case public=false as author"
},
"use_api_cache": {
"default": true,
"default": false,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
},
"store_results": {
"default": true,
@ -1511,9 +1571,9 @@
"help": "which group of users have access to the archive in case public=false as author"
},
"use_api_cache": {
"default": true,
"default": false,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
},
"store_results": {
"default": true,
@ -1526,58 +1586,6 @@
}
}
},
"gsheet_db": {
"name": "gsheet_db",
"display_name": "Google Sheets Database",
"manifest": {
"name": "Google Sheets Database",
"author": "Bellingcat",
"type": [
"database"
],
"requires_setup": true,
"description": "\n GsheetsDatabase:\n Handles integration with Google Sheets for tracking archival tasks.\n\n### Features\n- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n- Skips redundant updates for empty or invalid data fields.\n\n### Notes\n- Currently works only with metadata provided by GsheetFeeder. \n- Requires configuration of a linked Google Sheet and appropriate API credentials.\n ",
"dependencies": {
"python": [
"loguru",
"gspread",
"slugify"
]
},
"entry_point": "gsheet_db::GsheetsDb",
"version": "1.0",
"configs": {
"allow_worksheets": {
"default": [],
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "(CSV) explicitly block some worksheets from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
}
}
},
"configs": {
"allow_worksheets": {
"default": [],
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
},
"block_worksheets": {
"default": [],
"help": "(CSV) explicitly block some worksheets from being processed"
},
"use_sheet_names_in_stored_paths": {
"default": true,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
}
}
},
"console_db": {
"name": "console_db",
"display_name": "Console Database",
@ -1664,7 +1672,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": [
"random",
"static"
@ -1696,7 +1704,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": [
"random",
"static"
@ -1716,54 +1724,6 @@
}
}
},
"atlos_storage": {
"name": "atlos_storage",
"display_name": "Atlos Storage",
"manifest": {
"name": "Atlos Storage",
"author": "Bellingcat",
"type": [
"storage"
],
"requires_setup": true,
"description": "\n Stores media files in a [Atlos](https://www.atlos.org/).\n\n ### Features\n - Saves media files to Atlos, organizing them into folders based on the provided path structure.\n\n ### Notes\n - Requires setup with Atlos credentials.\n - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.\n ",
"dependencies": {
"python": [
"loguru",
"boto3"
],
"bin": []
},
"entry_point": "",
"version": "1.0",
"configs": {
"api_token": {
"default": null,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": true,
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str"
}
}
},
"configs": {
"api_token": {
"default": null,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": true,
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str"
}
}
},
"s3_storage": {
"name": "s3_storage",
"display_name": "S3 Storage",
@ -1796,7 +1756,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": [
"random",
"static"
@ -1850,7 +1810,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": [
"random",
"static"
@ -1922,7 +1882,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
"choices": [
"random",
"static"
@ -1951,7 +1911,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
"choices": [
"random",
"static"
@ -2029,9 +1989,9 @@
"steps": {
"feeders": [
"cli_feeder",
"gsheet_feeder",
"atlos_feeder",
"csv_feeder"
"atlos_feeder_db_storage",
"csv_feeder",
"gsheet_feeder_db"
],
"extractors": [
"wayback_extractor_enricher",
@ -2039,6 +1999,7 @@
"instagram_api_extractor",
"instagram_tbot_extractor",
"generic_extractor",
"tiktok_tikwm_extractor",
"twitter_api_extractor",
"instagram_extractor",
"telethon_extractor",
@ -2055,20 +2016,21 @@
"meta_enricher",
"pdq_hash_enricher",
"whisper_enricher",
"opentimestamps_enricher",
"ssl_enricher",
"hash_enricher"
],
"databases": [
"console_db",
"atlos_db",
"api_db",
"csv_db",
"gsheet_db"
"atlos_feeder_db_storage",
"gsheet_feeder_db"
],
"storages": [
"local_storage",
"gdrive_storage",
"atlos_storage",
"atlos_feeder_db_storage",
"s3_storage"
],
"formatters": [
@ -2077,9 +2039,9 @@
]
},
"configs": [
"gsheet_feeder",
"atlos_feeder",
"atlos_feeder_db_storage",
"csv_feeder",
"gsheet_feeder_db",
"cli_feeder",
"instagram_api_extractor",
"instagram_tbot_extractor",
@ -2093,15 +2055,13 @@
"timestamping_enricher",
"screenshot_enricher",
"whisper_enricher",
"opentimestamps_enricher",
"thumbnail_enricher",
"ssl_enricher",
"hash_enricher",
"atlos_db",
"api_db",
"gsheet_db",
"csv_db",
"gdrive_storage",
"atlos_storage",
"s3_storage",
"local_storage",
"html_formatter"

View file

@ -7,7 +7,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations
from dataclasses import dataclass
from typing import List, TYPE_CHECKING
from typing import List, TYPE_CHECKING, Type
import shutil
import ast
import copy
@ -60,7 +60,7 @@ class ModuleFactory:
HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
def get_module(self, module_name: str, config: dict) -> Type[BaseModule]:
"""
Gets and sets up a module using the provided config

View file

@ -19,7 +19,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": ["random", "static"],
},
"root_folder_id": {

View file

@ -13,7 +13,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
"choices": ["random", "static"],
},
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},

View file

@ -0,0 +1,100 @@
{
"name": "OpenTimestamps Enricher",
"type": ["enricher"],
"requires_setup": True,
"dependencies": {
"python": [
"loguru",
"opentimestamps",
],
},
"configs": {
"calendar_urls": {
"default": [
"https://alice.btc.calendar.opentimestamps.org",
"https://bob.btc.calendar.opentimestamps.org",
"https://finney.calendar.eternitywall.com",
# "https://ots.btc.catallaxy.com/", # ipv4 only
],
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:\
https://opentimestamps.org/#calendars",
"type": "list",
},
"calendar_whitelist": {
"default": [],
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
"type": "list",
},
},
"description": """
Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time.
Uses OpenTimestamps, a service that timestamps data using the Bitcoin blockchain, providing a decentralized
and secure way to prove that data existed at a certain point in time. A SHA256 hash of the file to be timestamped is used as the token
and sent to each of the 'timestamp calendars' for inclusion in the blockchain. The proof is then saved alongside the original file in a file with
the '.ots' extension.
### Features
- Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain
- Verifies timestamp proofs have been submitted to the blockchain (note: does not confirm they have been *added*)
- Can use multiple calendar servers to ensure reliability and redundancy
- Stores timestamp proofs alongside original files for future verification
### Timestamp status
An OpenTimestamps proof, when submitted to a timestamp server, will have a 'pending' status (Pending Attestation) as it waits to be added
to the blockchain. Once it has been added to the blockchain, it will have a 'confirmed' status (Bitcoin Block Timestamp).
This process typically takes several hours, depending on the calendar server and the current state of the Bitcoin network. As such,
the status of all timestamps added will be 'pending' until they are subsequently confirmed (see 'Upgrading Timestamps' below).
There are two possible statuses for a timestamp:
- `Pending`: The timestamp has been submitted to the calendar server but has not yet been confirmed in the Bitcoin blockchain.
- `Confirmed`: The timestamp has been confirmed in the Bitcoin blockchain.
### Upgrading Timestamps
To upgrade a timestamp from 'pending' to 'confirmed', you can use the `ots upgrade` command from the opentimestamps-client package
(install it with `pip install opentimestamps-client`).
Example: `ots upgrade my_file.ots`
Here is a useful script that could be used to upgrade all timestamps in a directory, which could be run on a cron job:
```{code} bash
find . -name "*.ots" -type f | while read file; do
echo "Upgrading OTS $file"
ots upgrade $file
done
# The result might look like:
# Upgrading OTS ./my_file.ots
# Got 1 attestation(s) from https://alice.btc.calendar.opentimestamps.org
# Success! Timestamp complete
```
```{note} Note: this will only upgrade the .ots files, and will not change the status text in any output .html files or any databases where the
metadata is stored (e.g. Google Sheets, CSV database, API database etc.).
```
### Verifying Timestamps
The easiest way to verify a timestamp (ots) file is to install the opentimestamps-client command line tool and use the `ots verify` command.
Example: `ots verify my_file.ots`
```{code} bash
$ ots verify my_file.ots
Calendar https://bob.btc.calendar.opentimestamps.org: Pending confirmation in Bitcoin blockchain
Calendar https://finney.calendar.eternitywall.com: Pending confirmation in Bitcoin blockchain
Calendar https://alice.btc.calendar.opentimestamps.org: Timestamped by transaction 12345; waiting for 6 confirmations
```
Note: if you're using a storage with `filename_generator` set to `static` or `random`, the files will be renamed when they are saved to the
final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`.
### Choosing Calendar Servers
By default, the OpenTimestamps enricher uses a set of public calendar servers provided by the 'opentimestamps' project.
You can customize the list of calendar servers by providing URLs in the `calendar_urls` configuration option.
### Calendar Whitelist
By default, the opentimestamps package only allows its own calendars to be used (see `DEFAULT_CALENDAR_WHITELIST` in `opentimestamps.calendar`).
If you want to use your own calendars, you can override this via the `calendar_whitelist` configuration option.
""",
}
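The manifest above walks through the stamping flow in prose. As a rough standalone sketch of the same flow, using only the python-opentimestamps calls the enricher itself relies on (the file path and the single calendar URL are placeholders; error handling and the whitelist check are omitted):

```python
from opentimestamps.calendar import RemoteCalendar
from opentimestamps.core.op import OpSHA256
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
from opentimestamps.core import serialize

path = "my_file.pdf"  # placeholder: any archived file

# 1. Hash the file with SHA256 (the hash the enricher hard-codes).
with open(path, "rb") as f:
    file_hash = OpSHA256().hash_fd(f)

# 2. Wrap the digest in a timestamp and a detached .ots container.
timestamp = Timestamp(file_hash)
detached = DetachedTimestampFile(OpSHA256(), timestamp)

# 3. Submit the digest to a calendar server and merge the returned (pending) proof.
calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
timestamp.merge(calendar.submit(file_hash))

# 4. Serialize the proof next to the original file, as the enricher does.
ctx = serialize.BytesSerializationContext()
detached.serialize(ctx)
with open(path + ".ots", "wb") as f:
    f.write(ctx.getbytes())
```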

View file

@ -0,0 +1,172 @@
import os
from loguru import logger
import opentimestamps
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
from opentimestamps.core.op import OpSHA256
from opentimestamps.core import serialize
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
from auto_archiver.utils.misc import get_current_timestamp
class OpentimestampsEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"OpenTimestamps timestamping files for {url=}")
# Get the media files to timestamp
media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")]
if not media_files:
logger.warning(f"No files found to timestamp in {url=}")
return
timestamp_files = []
for media in media_files:
try:
# Get the file path from the media
file_path = media.filename
if not os.path.exists(file_path):
logger.warning(f"File not found: {file_path}")
continue
# Create timestamp for the file - hash is SHA256
# Note: hash is hard-coded to SHA256 and does not use hash_enricher to set it.
# SHA256 is the recommended hash, ref: https://github.com/bellingcat/auto-archiver/pull/247#discussion_r1992433181
logger.debug(f"Creating timestamp for {file_path}")
file_hash = None
with open(file_path, "rb") as f:
file_hash = OpSHA256().hash_fd(f)
if not file_hash:
logger.warning(f"Failed to hash file for timestamping, skipping: {file_path}")
continue
# Create a timestamp with the file hash
timestamp = Timestamp(file_hash)
# Create a detached timestamp file with the hash operation and timestamp
detached_timestamp = DetachedTimestampFile(OpSHA256(), timestamp)
# Submit to calendar servers
submitted_to_calendar = False
logger.debug(f"Submitting timestamp to calendar servers for {file_path}")
calendars = []
whitelist = DEFAULT_CALENDAR_WHITELIST
if self.calendar_whitelist:
whitelist = set(self.calendar_whitelist)
# Create calendar instances
calendar_urls = []
for url in self.calendar_urls:
if url in whitelist:
calendars.append(RemoteCalendar(url))
calendar_urls.append(url)
# Submit the hash to each calendar
for calendar in calendars:
try:
calendar_timestamp = calendar.submit(file_hash)
timestamp.merge(calendar_timestamp)
logger.debug(f"Successfully submitted to calendar: {calendar.url}")
submitted_to_calendar = True
except Exception as e:
logger.warning(f"Failed to submit to calendar {calendar.url}: {e}")
# If all calendar submissions failed, add pending attestations
if not submitted_to_calendar and not timestamp.attestations:
logger.error(
f"Failed to submit to any calendar for {file_path}. **This file will not be timestamped.**"
)
media.set("opentimestamps", False)
continue
# Save the timestamp proof to a file
timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots")
try:
with open(timestamp_path, "wb") as f:
# Create a serialization context and write to the file
ctx = serialize.BytesSerializationContext()
detached_timestamp.serialize(ctx)
f.write(ctx.getbytes())
except Exception as e:
logger.warning(f"Failed to serialize timestamp file: {e}")
continue
# Create media for the timestamp file
timestamp_media = Media(filename=timestamp_path)
# explicitly set the mimetype, normally .ots files are 'application/vnd.oasis.opendocument.spreadsheet-template'
timestamp_media.mimetype = "application/vnd.opentimestamps"
timestamp_media.set("opentimestamps_version", opentimestamps.__version__)
verification_info = self.verify_timestamp(detached_timestamp)
for key, value in verification_info.items():
timestamp_media.set(key, value)
media.set("opentimestamp_files", [timestamp_media])
timestamp_files.append(timestamp_media.filename)
# Update the original media to indicate it's been timestamped
media.set("opentimestamps", True)
except Exception as e:
logger.warning(f"Error while timestamping {media.filename}: {e}")
# Add timestamp files to the metadata
if timestamp_files:
to_enrich.set("opentimestamped", True)
to_enrich.set("opentimestamps_count", len(timestamp_files))
logger.success(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}")
else:
to_enrich.set("opentimestamped", False)
logger.warning(f"No successful timestamps created for {url=}")
def verify_timestamp(self, detached_timestamp):
"""
Verify a timestamp and extract verification information.
Note: a Bitcoin block header attestation is reported as 'confirmed' based on its presence alone; the block header is not re-checked against a Bitcoin node.
Args:
detached_timestamp: The detached timestamp to verify.
Returns:
dict: Information about the verification result.
"""
result = {}
# Check if we have attestations
attestations = list(detached_timestamp.timestamp.all_attestations())
result["attestation_count"] = len(attestations)
if attestations:
attestation_info = []
for msg, attestation in attestations:
info = {}
# Process different types of attestations
if isinstance(attestation, PendingAttestation):
info["status"] = "pending"
info["uri"] = attestation.uri
elif isinstance(attestation, BitcoinBlockHeaderAttestation):
info["status"] = "confirmed"
info["block_height"] = attestation.height
info["last_check"] = get_current_timestamp()
attestation_info.append(info)
result["attestations"] = attestation_info
# verified if at least one attestation has been confirmed on Bitcoin
if any(a.get("status") == "confirmed" for a in attestation_info):
result["verified"] = True
else:
result["verified"] = False
else:
result["verified"] = False
result["last_updated"] = get_current_timestamp()
return result
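As a usage aside (not part of the module), a sketch of how a proof written by this enricher could be read back and inspected with python-opentimestamps; the `.ots` path is hypothetical:

from opentimestamps.core import serialize
from opentimestamps.core.timestamp import DetachedTimestampFile

# hypothetical path to a proof produced by the enricher above
with open("image.jpg.ots", "rb") as f:
    ctx = serialize.BytesDeserializationContext(f.read())
detached = DetachedTimestampFile.deserialize(ctx)

# list the attestations embedded in the proof
for digest, attestation in detached.timestamp.all_attestations():
    print(type(attestation).__name__, attestation)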


@ -13,7 +13,7 @@
},
"filename_generator": {
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
"choices": ["random", "static"],
},
"bucket": {"default": None, "help": "S3 bucket name"},


@ -0,0 +1,276 @@
import pytest
import hashlib
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
from opentimestamps.calendar import RemoteCalendar
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
from opentimestamps.core.op import OpSHA256
from auto_archiver.core import Metadata, Media
# TODO: Remove once timestamping overhaul is merged
@pytest.fixture
def sample_media(tmp_path) -> Media:
"""Fixture creating a Media object with temporary source file"""
src_file = tmp_path / "source.txt"
src_file.write_text("test content")
return Media(_key="subdir/test.txt", filename=str(src_file))
@pytest.fixture
def sample_file_path(tmp_path):
tmp_file = tmp_path / "test.txt"
tmp_file.write_text("This is a test file content for OpenTimestamps")
return str(tmp_file)
@pytest.fixture
def detached_timestamp_file():
"""Create a simple detached timestamp file for testing"""
file_hash = hashlib.sha256(b"Test content").digest()
file_hash_op = OpSHA256()
timestamp = Timestamp(file_hash)
# Add a pending attestation
pending = PendingAttestation("https://example.calendar.com")
timestamp.attestations.add(pending)
# Add a bitcoin attestation
bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height
timestamp.attestations.add(bitcoin)
return DetachedTimestampFile(file_hash_op, timestamp)
@pytest.fixture
def verified_timestamp_file():
"""Create a timestamp file with a Bitcoin attestation"""
file_hash = hashlib.sha256(b"Verified content").digest()
file_hash_op = OpSHA256()
timestamp = Timestamp(file_hash)
# Add only a Bitcoin attestation
bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height
timestamp.attestations.add(bitcoin)
return DetachedTimestampFile(file_hash_op, timestamp)
@pytest.fixture
def pending_timestamp_file():
"""Create a timestamp file with only pending attestations"""
file_hash = hashlib.sha256(b"Pending content").digest()
file_hash_op = OpSHA256()
timestamp = Timestamp(file_hash)
# Add only pending attestations
pending1 = PendingAttestation("https://example1.calendar.com")
pending2 = PendingAttestation("https://example2.calendar.com")
timestamp.attestations.add(pending1)
timestamp.attestations.add(pending2)
return DetachedTimestampFile(file_hash_op, timestamp)
@pytest.mark.download
def test_download_tsr(setup_module, mocker):
"""Test submitting a hash to calendar servers"""
# Mock the RemoteCalendar submit method
mock_submit = mocker.patch.object(RemoteCalendar, "submit")
test_timestamp = Timestamp(hashlib.sha256(b"test").digest())
mock_submit.return_value = test_timestamp
# Create a calendar
calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
# Test submission
file_hash = hashlib.sha256(b"Test file content").digest()
result = calendar.submit(file_hash)
assert mock_submit.called
assert isinstance(result, Timestamp)
assert result == test_timestamp
def test_verify_timestamp(setup_module, detached_timestamp_file):
"""Test the verification of timestamp attestations"""
ots = setup_module("opentimestamps_enricher")
# Test verification
verification_info = ots.verify_timestamp(detached_timestamp_file)
# Check verification results
assert verification_info["attestation_count"] == 2
assert verification_info["verified"] is True
assert len(verification_info["attestations"]) == 2
# Check attestation statuses
attestation_statuses = [a["status"] for a in verification_info["attestations"]]
assert "pending" in attestation_statuses
assert "confirmed" in attestation_statuses
# Check Bitcoin attestation details
bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed")
assert bitcoin_attestation["block_height"] == 783000
def test_verify_pending_only(setup_module, pending_timestamp_file):
"""Test verification of timestamps with only pending attestations"""
ots = setup_module("opentimestamps_enricher")
verification_info = ots.verify_timestamp(pending_timestamp_file)
assert verification_info["attestation_count"] == 2
assert verification_info["verified"] is False
# All attestations should be of type "pending"
assert all(a["status"] == "pending" for a in verification_info["attestations"])
# Check URIs of pending attestations
uris = [a["uri"] for a in verification_info["attestations"]]
assert "https://example1.calendar.com" in uris
assert "https://example2.calendar.com" in uris
def test_verify_bitcoin_completed(setup_module, verified_timestamp_file):
"""Test verification of timestamps with completed Bitcoin attestations"""
ots = setup_module("opentimestamps_enricher")
verification_info = ots.verify_timestamp(verified_timestamp_file)
assert verification_info["attestation_count"] == 1
assert verification_info["verified"] is True
assert "pending" not in verification_info
# Check that the attestation is a Bitcoin attestation
attestation = verification_info["attestations"][0]
assert attestation["status"] == "confirmed"
assert attestation["block_height"] == 783000
def test_full_enriching(setup_module, sample_file_path, sample_media, mocker):
"""Test the complete enrichment process"""
# Mock the calendar submission to avoid network requests
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
# Create a function that returns a new timestamp for each call
def side_effect(digest):
test_timestamp = Timestamp(digest)
# Add a bitcoin attestation to the test timestamp
bitcoin = BitcoinBlockHeaderAttestation(783000)
test_timestamp.attestations.add(bitcoin)
return test_timestamp
mock_calendar.side_effect = side_effect
ots = setup_module("opentimestamps_enricher")
# Create test metadata with sample file
metadata = Metadata().set_url("https://example.com")
sample_media.filename = sample_file_path
metadata.add_media(sample_media)
# Run enrichment
ots.enrich(metadata)
# Verify results
assert metadata.get("opentimestamped") is True
assert metadata.get("opentimestamps_count") == 1
# Check that we have one parent media item: the original
assert len(metadata.media) == 1
# Check that the original media was updated
assert metadata.media[0].get("opentimestamps") is True
# Check the timestamp file media is a child of the original
assert len(metadata.media[0].get("opentimestamp_files")) == 1
timestamp_media = metadata.media[0].get("opentimestamp_files")[0]
assert timestamp_media.get("opentimestamps_version") is not None
# Check verification results on the timestamp media
assert timestamp_media.get("verified") is True
assert timestamp_media.get("attestation_count") == 1
def test_full_enriching_one_calendar_error(
setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file
):
"""Test enrichment when one calendar server returns an error"""
# Mock the calendar submission to raise an exception
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
test_timestamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935"))
# Add a bitcoin attestation to the test timestamp
bitcoin = BitcoinBlockHeaderAttestation(783000)
test_timestamp.attestations.add(bitcoin)
mock_calendar.side_effect = [test_timestamp, Exception("Calendar server error")]
ots = setup_module(
"opentimestamps_enricher",
{
"calendar_urls": [
"https://alice.btc.calendar.opentimestamps.org",
"https://bob.btc.calendar.opentimestamps.org",
]
},
)
# Create test metadata with sample file
metadata = Metadata().set_url("https://example.com")
sample_media.filename = sample_file_path
metadata.add_media(sample_media)
# Run enrichment (should complete despite calendar errors)
ots.enrich(metadata)
# Verify results
assert metadata.get("opentimestamped") is True
assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob
def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker):
"""Test enrichment when calendar servers return errors"""
# Mock the calendar submission to raise an exception
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
mock_calendar.side_effect = Exception("Calendar server error")
ots = setup_module("opentimestamps_enricher")
# Create test metadata with sample file
metadata = Metadata().set_url("https://example.com")
sample_media.filename = sample_file_path
metadata.add_media(sample_media)
# Run enrichment (should complete despite calendar errors)
ots.enrich(metadata)
# Verify results
assert metadata.get("opentimestamped") is False
assert metadata.get("opentimestamps_count") is None
def test_no_files_to_stamp(setup_module):
"""Test enrichment with no files to timestamp"""
ots = setup_module("opentimestamps_enricher")
# Create empty metadata
metadata = Metadata().set_url("https://example.com")
# Run enrichment
ots.enrich(metadata)
# Verify no timestamping occurred
assert metadata.get("opentimestamped") is None
assert len(metadata.media) == 0