Mirror of https://github.com/bellingcat/auto-archiver
Commit 17463de937
|
@ -1,4 +1,4 @@
|
|||
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "accessible-pygments"
|
||||
|
@ -51,7 +51,7 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
|
|||
|
||||
[package.extras]
|
||||
doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"]
|
||||
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"]
|
||||
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\""]
|
||||
trio = ["trio (>=0.26.1)"]
|
||||
|
||||
[[package]]
|
||||
|
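Note: most of the lockfile churn below comes from regenerating with Poetry 2.1, which keeps PEP 508 environment markers inline after a `;` in each extra's requirement string instead of dropping them. A minimal before/after sketch using the entries from the hunk above:

```toml
# written by Poetry 2.0.x — markers omitted
test = ["truststore (>=0.9.1)", "uvloop (>=0.21)"]

# written by Poetry 2.1.x — PEP 508 markers preserved after ";"
test = [
    "truststore (>=0.9.1) ; python_version >= \"3.10\"",
    "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\"",
]
```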
@ -94,12 +94,12 @@ files = [
|
|||
]
|
||||
|
||||
[package.extras]
|
||||
benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
|
||||
cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
|
||||
dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
|
||||
benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
|
||||
cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
|
||||
dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
|
||||
docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
|
||||
tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
|
||||
tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
|
||||
tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"]
|
||||
tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""]
|
||||
|
||||
[[package]]
|
||||
name = "authlib"
|
||||
|
@ -145,7 +145,7 @@ files = [
|
|||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"]
|
||||
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
|
@ -781,7 +781,7 @@ files = [
|
|||
[package.extras]
|
||||
docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
|
||||
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
|
||||
typing = ["typing-extensions (>=4.12.2)"]
|
||||
typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""]
|
||||
|
||||
[[package]]
|
||||
name = "future"
|
||||
|
@ -816,7 +816,7 @@ requests = ">=2.18.0,<3.0.0.dev0"
|
|||
|
||||
[package.extras]
|
||||
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
|
||||
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"]
|
||||
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
|
||||
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
|
||||
|
@ -1115,7 +1115,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
|
|||
win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["Sphinx (==8.1.3)", "build (==1.2.2)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.5.0)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.13.0)", "mypy (==v1.4.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pytest (==6.1.2)", "pytest (==8.3.2)", "pytest-cov (==2.12.1)", "pytest-cov (==5.0.0)", "pytest-cov (==6.0.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.1.0)", "sphinx-rtd-theme (==3.0.2)", "tox (==3.27.1)", "tox (==4.23.2)", "twine (==6.0.1)"]
|
||||
dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.13.0) ; python_version >= \"3.8\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""]
|
||||
|
||||
[[package]]
|
||||
name = "markdown-it-py"
|
||||
|
@ -1429,6 +1429,22 @@ rsa = ["cryptography (>=3.0.0)"]
|
|||
signals = ["blinker (>=1.4.0)"]
|
||||
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
|
||||
|
||||
[[package]]
|
||||
name = "opentimestamps"
|
||||
version = "0.4.5"
|
||||
description = "Create and verify OpenTimestamps proofs"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "opentimestamps-0.4.5-py3-none-any.whl", hash = "sha256:a4912b3bd1b612a3ef5fac925b9137889e6c5cb91cc9e76c8202a2bf8abe26b5"},
|
||||
{file = "opentimestamps-0.4.5.tar.gz", hash = "sha256:56726ccde97fb67f336a7f237ce36808e5593c3089d68d900b1c83d0ebf9dcfa"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pycryptodomex = ">=3.3.1"
|
||||
python-bitcoinlib = ">=0.9.0,<0.13.0"
|
||||
|
||||
[[package]]
|
||||
name = "oscrypto"
|
||||
version = "1.3.0"
|
||||
|
@ -1578,7 +1594,7 @@ docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline
|
|||
fpx = ["olefile"]
|
||||
mic = ["olefile"]
|
||||
tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"]
|
||||
typing = ["typing-extensions"]
|
||||
typing = ["typing-extensions ; python_version < \"3.10\""]
|
||||
xmp = ["defusedxml"]
|
||||
|
||||
[[package]]
|
||||
|
@ -1938,6 +1954,18 @@ pytest = ">=6.2.5"
|
|||
[package.extras]
|
||||
dev = ["pre-commit", "pytest-asyncio", "tox"]
|
||||
|
||||
[[package]]
|
||||
name = "python-bitcoinlib"
|
||||
version = "0.12.2"
|
||||
description = "The Swiss Army Knife of the Bitcoin protocol."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "python-bitcoinlib-0.12.2.tar.gz", hash = "sha256:c65ab61427c77c38d397bfc431f71d86fd355b453a536496ec3fcb41bd10087d"},
|
||||
{file = "python_bitcoinlib-0.12.2-py3-none-any.whl", hash = "sha256:2f29a9f475f21c12169b3a6cc8820f34f11362d7ff1200a5703dce3e4e903a44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.9.0.post0"
|
||||
|
@ -2990,7 +3018,7 @@ files = [
|
|||
pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""}
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
|
||||
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
|
||||
h2 = ["h2 (>=4,<5)"]
|
||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
zstd = ["zstandard (>=0.18.0)"]
|
||||
|
@ -3013,7 +3041,7 @@ h11 = ">=0.8"
|
|||
typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
|
||||
standard = ["colorama (>=0.4) ; sys_platform == \"win32\"", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1) ; sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"", "watchfiles (>=0.13)", "websockets (>=10.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "virtualenv"
|
||||
|
@ -3034,7 +3062,7 @@ platformdirs = ">=3.9.1,<5"
|
|||
|
||||
[package.extras]
|
||||
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
|
||||
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
|
||||
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
|
||||
|
||||
[[package]]
|
||||
name = "vk-api"
|
||||
|
@ -3296,7 +3324,7 @@ files = [
|
|||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
|
||||
dev = ["black (>=19.3b0) ; python_version >= \"3.6\"", "pytest (>=4.6.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "wsproto"
|
||||
|
@ -3327,8 +3355,8 @@ files = [
|
|||
|
||||
[package.extras]
|
||||
build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"]
|
||||
curl-cffi = ["curl-cffi (==0.5.10)", "curl-cffi (>=0.5.10,!=0.6.*,<0.7.2)"]
|
||||
default = ["brotli", "brotlicffi", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
|
||||
curl-cffi = ["curl-cffi (==0.5.10) ; os_name == \"nt\" and implementation_name == \"cpython\"", "curl-cffi (>=0.5.10,!=0.6.*,<0.7.2) ; os_name != \"nt\" and implementation_name == \"cpython\""]
|
||||
default = ["brotli ; implementation_name == \"cpython\"", "brotlicffi ; implementation_name != \"cpython\"", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
|
||||
dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.9.0,<0.10.0)"]
|
||||
pyinstaller = ["pyinstaller (>=6.11.1)"]
|
||||
secretstorage = ["cffi", "secretstorage"]
|
||||
|
@ -3338,4 +3366,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901"
|
||||
content-hash = "beb354960b8d8af491a13e09cb565c7e3099a2b150167c16147aa0438e970018"
|
||||
|
|
|
@ -57,6 +57,7 @@ dependencies = [
|
|||
"certvalidator (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
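The `opentimestamps (>=0.4.5,<0.5.0)` dependency added above backs the new OpenTimestamps enricher introduced later in this commit. The `.ots` proofs it writes are ordinary OpenTimestamps files, so they can also be inspected with the separate `opentimestamps-client` CLI; a rough manual equivalent of what the enricher automates (the CLI is an extra install, not part of this dependency list):

```bash
pip install opentimestamps-client    # CLI tool; distinct from the library dependency above

ots stamp archived_file.jpg          # create archived_file.jpg.ots via the public calendars
ots info archived_file.jpg.ots       # show attestations (initially pending)
ots upgrade archived_file.jpg.ots    # later: pull in the Bitcoin block attestation
ots verify archived_file.jpg.ots     # verify the proof against the original file
```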
|
|
|
@ -1,151 +1,25 @@
|
|||
{
|
||||
"modules": {
|
||||
"gsheet_feeder": {
|
||||
"name": "gsheet_feeder",
|
||||
"display_name": "Google Sheets Feeder",
|
||||
"atlos_feeder_db_storage": {
|
||||
"name": "atlos_feeder_db_storage",
|
||||
"display_name": "Atlos Feeder Database Storage",
|
||||
"manifest": {
|
||||
"name": "Google Sheets Feeder",
|
||||
"name": "Atlos Feeder Database Storage",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"feeder"
|
||||
"feeder",
|
||||
"database",
|
||||
"storage"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n GsheetsFeeder \n A Google Sheets-based feeder for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - Customize the column names in your Google sheet using the `columns` configuration.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"gspread",
|
||||
"slugify"
|
||||
]
|
||||
},
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"sheet": {
|
||||
"default": null,
|
||||
"help": "name of the sheet to archive"
|
||||
},
|
||||
"sheet_id": {
|
||||
"default": null,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
|
||||
},
|
||||
"header": {
|
||||
"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)"
|
||||
},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": true
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage"
|
||||
},
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader"
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {
|
||||
"default": null,
|
||||
"help": "name of the sheet to archive"
|
||||
},
|
||||
"sheet_id": {
|
||||
"default": null,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
|
||||
},
|
||||
"header": {
|
||||
"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)"
|
||||
},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": true
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage"
|
||||
},
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader"
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool"
|
||||
}
|
||||
}
|
||||
},
|
||||
"atlos_feeder": {
|
||||
"name": "atlos_feeder",
|
||||
"display_name": "Atlos Feeder",
|
||||
"manifest": {
|
||||
"name": "Atlos Feeder",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"feeder"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.\n\n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Filters source materials based on visibility, processing status, and metadata.\n - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.\n - Iterates through paginated results using a cursor for efficient API interaction.\n\n ### Notes\n - Requires an Atlos API endpoint and a valid API token for authentication.\n - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.\n - Handles pagination transparently when retrieving data from the Atlos API.\n ",
|
||||
"description": "\n A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media,\n \n [Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT). \n It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence.\n \n To get started create a new project and obtain an API token from the settings page. You can group event's into Atlos's 'incidents'.\n Here you can add 'source material' by URLn and the Atlos feeder will fetch these URLs for archival.\n \n You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended.\n The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uplaoded back into the relevant media item.\n \n \n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive.\n - If the storage option is selected, it will store the media files alongside the original source material item in Atlos.\n - Is the database option is selected it will output the results to the media item, as well as updating failure status with error details when archiving fails.\n - Skips Storege/ database upload for items without an Atlos ID - restricting that you must use the Atlos feeder so that it has the Atlos ID to store the results with.\n\n ### Notes\n - Requires an Atlos account with a project and a valid API token for authentication.\n - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.\n - Feches any media items within an Atlos project, regardless of separation into incidents.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"requests"
|
||||
]
|
||||
},
|
||||
"entry_point": "",
|
||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"api_token": {
|
||||
|
@ -222,6 +96,135 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"gsheet_feeder_db": {
|
||||
"name": "gsheet_feeder_db",
|
||||
"display_name": "Google Sheets Feeder Database",
|
||||
"manifest": {
|
||||
"name": "Google Sheets Feeder Database",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"feeder",
|
||||
"database"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n GsheetsFeederDatabase\n A Google Sheets-based feeder and optional database for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n - Skips redundant updates for empty or invalid data fields.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - Customize the column names in your Google sheet using the `columns` configuration.\n - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"gspread",
|
||||
"slugify"
|
||||
]
|
||||
},
|
||||
"entry_point": "gsheet_feeder_db::GsheetsFeederDB",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"sheet": {
|
||||
"default": null,
|
||||
"help": "name of the sheet to archive"
|
||||
},
|
||||
"sheet_id": {
|
||||
"default": null,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
|
||||
},
|
||||
"header": {
|
||||
"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)"
|
||||
},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": true
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage"
|
||||
},
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader"
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) explicitly block some worksheets from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {
|
||||
"default": null,
|
||||
"help": "name of the sheet to archive"
|
||||
},
|
||||
"sheet_id": {
|
||||
"default": null,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)"
|
||||
},
|
||||
"header": {
|
||||
"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)"
|
||||
},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": true
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage"
|
||||
},
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader"
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) explicitly block some worksheets from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
|
||||
}
|
||||
}
|
||||
},
|
||||
"cli_feeder": {
|
||||
"name": "cli_feeder",
|
||||
"display_name": "Command Line Feeder",
|
||||
|
@ -470,7 +473,7 @@
|
|||
"extractor"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts\n and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ",
|
||||
"description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. \n \n > \u26a0\ufe0f **Warning** \n > This module is not actively maintained due to known issues with blocking. \n > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)\n \n This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"instaloader",
|
||||
|
@ -482,38 +485,38 @@
|
|||
"configs": {
|
||||
"username": {
|
||||
"required": true,
|
||||
"help": "a valid Instagram username"
|
||||
"help": "A valid Instagram username."
|
||||
},
|
||||
"password": {
|
||||
"required": true,
|
||||
"help": "the corresponding Instagram account password"
|
||||
"help": "The corresponding Instagram account password."
|
||||
},
|
||||
"download_folder": {
|
||||
"default": "instaloader",
|
||||
"help": "name of a folder to temporarily download content to"
|
||||
"help": "Name of a folder to temporarily download content to."
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/instaloader.session",
|
||||
"help": "path to the instagram session which saves session credentials"
|
||||
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one."
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"username": {
|
||||
"required": true,
|
||||
"help": "a valid Instagram username"
|
||||
"help": "A valid Instagram username."
|
||||
},
|
||||
"password": {
|
||||
"required": true,
|
||||
"help": "the corresponding Instagram account password"
|
||||
"help": "The corresponding Instagram account password."
|
||||
},
|
||||
"download_folder": {
|
||||
"default": "instaloader",
|
||||
"help": "name of a folder to temporarily download content to"
|
||||
"help": "Name of a folder to temporarily download content to."
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/instaloader.session",
|
||||
"help": "path to the instagram session which saves session credentials"
|
||||
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one."
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -661,7 +664,7 @@
|
|||
"extractor"
|
||||
],
|
||||
"requires_setup": false,
|
||||
"description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n",
|
||||
"description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n\n### Auto-Updates\n\nThe Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).\nThis can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).\nIf you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.\n\n",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"yt_dlp",
|
||||
|
@ -710,6 +713,11 @@
|
|||
"max_downloads": {
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."
|
||||
},
|
||||
"ytdlp_update_interval": {
|
||||
"default": 5,
|
||||
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
|
||||
"type": "int"
|
||||
}
|
||||
}
|
||||
},
|
||||
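A hedged sketch of pinning the new `ytdlp_update_interval` option in an orchestration file — the nesting under the module name follows the project's usual per-module config layout, so treat the exact placement as an assumption:

```yaml
# orchestration.yaml (sketch) — control the periodic yt-dlp self-update
generic_extractor:
  ytdlp_update_interval: -1   # -1 = never check, 0 = update on every run, N = check every N days
```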
|
@ -751,9 +759,38 @@
|
|||
"max_downloads": {
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."
|
||||
},
|
||||
"ytdlp_update_interval": {
|
||||
"default": 5,
|
||||
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
|
||||
"type": "int"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tiktok_tikwm_extractor": {
|
||||
"name": "tiktok_tikwm_extractor",
|
||||
"display_name": "Tiktok Tikwm Extractor",
|
||||
"manifest": {
|
||||
"name": "Tiktok Tikwm Extractor",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"extractor"
|
||||
],
|
||||
"requires_setup": false,
|
||||
"description": "\n Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/\n\t\n\tThis extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.\n\n ### Features\n - Downloads the video and, if possible, also the video cover.\n\t- Stores extra metadata about the post like author information, and more as returned by tikwm.com. \n\n ### Notes\n - If tikwm.com is down, this extractor will not work.\n\t- If tikwm.com changes their API, this extractor may break.\n\t- If no video is found, this extractor will consider the extraction failed.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"requests"
|
||||
],
|
||||
"bin": []
|
||||
},
|
||||
"entry_point": "",
|
||||
"version": "1.0",
|
||||
"configs": {}
|
||||
},
|
||||
"configs": null
|
||||
},
|
||||
"telegram_extractor": {
|
||||
"name": "telegram_extractor",
|
||||
"display_name": "Telegram Extractor",
|
||||
|
@ -1054,7 +1091,7 @@
|
|||
"help": "width of the screenshots"
|
||||
},
|
||||
"height": {
|
||||
"default": 720,
|
||||
"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"
|
||||
},
|
||||
|
@ -1091,7 +1128,7 @@
|
|||
"help": "width of the screenshots"
|
||||
},
|
||||
"height": {
|
||||
"default": 720,
|
||||
"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"
|
||||
},
|
||||
|
@ -1201,6 +1238,79 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"opentimestamps_enricher": {
|
||||
"name": "opentimestamps_enricher",
|
||||
"display_name": "OpenTimestamps Enricher",
|
||||
"manifest": {
|
||||
"name": "OpenTimestamps Enricher",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"enricher"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time.\n\n Uses OpenTimestamps \u2013 a service that timestamps data using the Bitcoin blockchain, providing a decentralized \n and secure way to prove that data existed at a certain point in time.\n\n ### Features\n - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain\n - Verifies existing timestamp proofs to confirm the time a file existed\n - Uses multiple calendar servers to ensure reliability and redundancy\n - Stores timestamp proofs alongside original files for future verification\n\n ### Notes\n - Can work offline to create timestamp proofs that can be upgraded later\n - Verification checks if timestamps have been confirmed in the Bitcoin blockchain\n - Should run after files have been archived and hashed\n\n ### Verifying Timestamps Later\n If you wish to verify a timestamp (ots) file later, you can install the opentimestamps-client command line tool and use the `ots verify` command.\n Example: `ots verify my_file.ots`\n\n Note: if you're using local storage with a filename_generator set to 'static' (a hash) or random, the files will be renamed when they are saved to the\n final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"opentimestamps"
|
||||
]
|
||||
},
|
||||
"entry_point": "",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"use_calendars": {
|
||||
"default": true,
|
||||
"help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.",
|
||||
"type": "bool"
|
||||
},
|
||||
"calendar_urls": {
|
||||
"default": [
|
||||
"https://alice.btc.calendar.opentimestamps.org",
|
||||
"https://bob.btc.calendar.opentimestamps.org",
|
||||
"https://finney.calendar.eternitywall.com"
|
||||
],
|
||||
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars",
|
||||
"type": "list"
|
||||
},
|
||||
"calendar_whitelist": {
|
||||
"default": [],
|
||||
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
|
||||
"type": "list"
|
||||
},
|
||||
"verify_timestamps": {
|
||||
"default": true,
|
||||
"help": "Whether to verify timestamps after creating them.",
|
||||
"type": "bool"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"use_calendars": {
|
||||
"default": true,
|
||||
"help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.",
|
||||
"type": "bool"
|
||||
},
|
||||
"calendar_urls": {
|
||||
"default": [
|
||||
"https://alice.btc.calendar.opentimestamps.org",
|
||||
"https://bob.btc.calendar.opentimestamps.org",
|
||||
"https://finney.calendar.eternitywall.com"
|
||||
],
|
||||
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars",
|
||||
"type": "list"
|
||||
},
|
||||
"calendar_whitelist": {
|
||||
"default": [],
|
||||
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
|
||||
"type": "list"
|
||||
},
|
||||
"verify_timestamps": {
|
||||
"default": true,
|
||||
"help": "Whether to verify timestamps after creating them.",
|
||||
"type": "bool"
|
||||
}
|
||||
}
|
||||
},
|
||||
"thumbnail_enricher": {
|
||||
"name": "thumbnail_enricher",
|
||||
"display_name": "Thumbnail Enricher",
|
||||
|
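A sketch of enabling the new enricher from an orchestration file, assuming the usual steps/per-module layout; the option names come from the manifest above:

```yaml
steps:
  enrichers: [hash_enricher, opentimestamps_enricher]

opentimestamps_enricher:
  use_calendars: true          # set false for offline proofs that can be upgraded later
  calendar_urls:
    - https://alice.btc.calendar.opentimestamps.org
    - https://bob.btc.calendar.opentimestamps.org
  verify_timestamps: true
```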
@ -1381,56 +1491,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"atlos_db": {
|
||||
"name": "atlos_db",
|
||||
"display_name": "Atlos Database",
|
||||
"manifest": {
|
||||
"name": "Atlos Database",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"database"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\nHandles integration with the Atlos platform for managing archival results.\n\n### Features\n- Outputs archival results to the Atlos API for storage and tracking.\n- Updates failure status with error details when archiving fails.\n- Processes and formats metadata, including ISO formatting for datetime fields.\n- Skips processing for items without an Atlos ID.\n\n### Setup\nRequired configs:\n- atlos_url: Base URL for the Atlos API.\n- api_token: Authentication token for API access.\n",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
""
|
||||
],
|
||||
"bin": [
|
||||
""
|
||||
]
|
||||
},
|
||||
"entry_point": "atlos_db::AtlosDb",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": null,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": true,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": null,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": true,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
}
|
||||
}
|
||||
},
|
||||
"api_db": {
|
||||
"name": "api_db",
|
||||
"display_name": "Auto Archiver API Database",
|
||||
|
@ -1473,9 +1533,9 @@
|
|||
"help": "which group of users have access to the archive in case public=false as author"
|
||||
},
|
||||
"use_api_cache": {
|
||||
"default": true,
|
||||
"default": false,
|
||||
"type": "bool",
|
||||
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
|
||||
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
|
||||
},
|
||||
"store_results": {
|
||||
"default": true,
|
||||
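Because the `use_api_cache` default flips to `false`, deployments that relied on the old skip-already-archived behaviour now need to opt in explicitly; a sketch under the same assumed per-module layout:

```yaml
api_db:
  use_api_cache: true    # query the API first and stop if the link was already archived
  store_results: true    # still push new results to the API database
```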
|
@ -1511,9 +1571,9 @@
|
|||
"help": "which group of users have access to the archive in case public=false as author"
|
||||
},
|
||||
"use_api_cache": {
|
||||
"default": true,
|
||||
"default": false,
|
||||
"type": "bool",
|
||||
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
|
||||
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived"
|
||||
},
|
||||
"store_results": {
|
||||
"default": true,
|
||||
|
@ -1526,58 +1586,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"gsheet_db": {
|
||||
"name": "gsheet_db",
|
||||
"display_name": "Google Sheets Database",
|
||||
"manifest": {
|
||||
"name": "Google Sheets Database",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"database"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n GsheetsDatabase:\n Handles integration with Google Sheets for tracking archival tasks.\n\n### Features\n- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n- Skips redundant updates for empty or invalid data fields.\n\n### Notes\n- Currently works only with metadata provided by GsheetFeeder. \n- Requires configuration of a linked Google Sheet and appropriate API credentials.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"gspread",
|
||||
"slugify"
|
||||
]
|
||||
},
|
||||
"entry_point": "gsheet_db::GsheetsDb",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) explicitly block some worksheets from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed"
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": [],
|
||||
"help": "(CSV) explicitly block some worksheets from being processed"
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": true,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'"
|
||||
}
|
||||
}
|
||||
},
|
||||
"console_db": {
|
||||
"name": "console_db",
|
||||
"display_name": "Console Database",
|
||||
|
@ -1664,7 +1672,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -1696,7 +1704,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -1716,54 +1724,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"atlos_storage": {
|
||||
"name": "atlos_storage",
|
||||
"display_name": "Atlos Storage",
|
||||
"manifest": {
|
||||
"name": "Atlos Storage",
|
||||
"author": "Bellingcat",
|
||||
"type": [
|
||||
"storage"
|
||||
],
|
||||
"requires_setup": true,
|
||||
"description": "\n Stores media files in a [Atlos](https://www.atlos.org/).\n\n ### Features\n - Saves media files to Atlos, organizing them into folders based on the provided path structure.\n\n ### Notes\n - Requires setup with Atlos credentials.\n - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.\n ",
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"boto3"
|
||||
],
|
||||
"bin": []
|
||||
},
|
||||
"entry_point": "",
|
||||
"version": "1.0",
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": null,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": true,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": null,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": true,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
}
|
||||
}
|
||||
},
|
||||
"s3_storage": {
|
||||
"name": "s3_storage",
|
||||
"display_name": "S3 Storage",
|
||||
|
@ -1796,7 +1756,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -1850,7 +1810,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -1922,7 +1882,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -1951,7 +1911,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
|
||||
"choices": [
|
||||
"random",
|
||||
"static"
|
||||
|
@ -2029,9 +1989,9 @@
|
|||
"steps": {
|
||||
"feeders": [
|
||||
"cli_feeder",
|
||||
"gsheet_feeder",
|
||||
"atlos_feeder",
|
||||
"csv_feeder"
|
||||
"atlos_feeder_db_storage",
|
||||
"csv_feeder",
|
||||
"gsheet_feeder_db"
|
||||
],
|
||||
"extractors": [
|
||||
"wayback_extractor_enricher",
|
||||
|
@ -2039,6 +1999,7 @@
|
|||
"instagram_api_extractor",
|
||||
"instagram_tbot_extractor",
|
||||
"generic_extractor",
|
||||
"tiktok_tikwm_extractor",
|
||||
"twitter_api_extractor",
|
||||
"instagram_extractor",
|
||||
"telethon_extractor",
|
||||
|
@ -2055,20 +2016,21 @@
|
|||
"meta_enricher",
|
||||
"pdq_hash_enricher",
|
||||
"whisper_enricher",
|
||||
"opentimestamps_enricher",
|
||||
"ssl_enricher",
|
||||
"hash_enricher"
|
||||
],
|
||||
"databases": [
|
||||
"console_db",
|
||||
"atlos_db",
|
||||
"api_db",
|
||||
"csv_db",
|
||||
"gsheet_db"
|
||||
"atlos_feeder_db_storage",
|
||||
"gsheet_feeder_db"
|
||||
],
|
||||
"storages": [
|
||||
"local_storage",
|
||||
"gdrive_storage",
|
||||
"atlos_storage",
|
||||
"atlos_feeder_db_storage",
|
||||
"s3_storage"
|
||||
],
|
||||
"formatters": [
|
||||
|
@ -2077,9 +2039,9 @@
|
|||
]
|
||||
},
|
||||
"configs": [
|
||||
"gsheet_feeder",
|
||||
"atlos_feeder",
|
||||
"atlos_feeder_db_storage",
|
||||
"csv_feeder",
|
||||
"gsheet_feeder_db",
|
||||
"cli_feeder",
|
||||
"instagram_api_extractor",
|
||||
"instagram_tbot_extractor",
|
||||
|
@ -2093,15 +2055,13 @@
|
|||
"timestamping_enricher",
|
||||
"screenshot_enricher",
|
||||
"whisper_enricher",
|
||||
"opentimestamps_enricher",
|
||||
"thumbnail_enricher",
|
||||
"ssl_enricher",
|
||||
"hash_enricher",
|
||||
"atlos_db",
|
||||
"api_db",
|
||||
"gsheet_db",
|
||||
"csv_db",
|
||||
"gdrive_storage",
|
||||
"atlos_storage",
|
||||
"s3_storage",
|
||||
"local_storage",
|
||||
"html_formatter"
|
||||
|
|
|
@ -7,7 +7,7 @@ by handling user configuration, validating the steps properties, and implementin
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, TYPE_CHECKING
|
||||
from typing import List, TYPE_CHECKING, Type
|
||||
import shutil
|
||||
import ast
|
||||
import copy
|
||||
|
@ -60,7 +60,7 @@ class ModuleFactory:
|
|||
|
||||
HAS_SETUP_PATHS = True
|
||||
|
||||
def get_module(self, module_name: str, config: dict) -> BaseModule:
|
||||
def get_module(self, module_name: str, config: dict) -> Type[BaseModule]:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
|
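The annotation change above matters to type checkers rather than at runtime: `Type[BaseModule]` says the function returns the class object (or a subclass of it), not an instance. A self-contained illustration of the distinction, with toy names rather than the project's real factory:

```python
from typing import Type


class BaseModule:
    """Stand-in for the project's BaseModule."""


class LocalStorage(BaseModule):
    """A hypothetical concrete module."""


_REGISTRY: dict[str, Type[BaseModule]] = {"local_storage": LocalStorage}


def get_module_class(name: str) -> Type[BaseModule]:
    # Returns the class itself; configuration and instantiation happen at the call site.
    return _REGISTRY[name]


cls = get_module_class("local_storage")
assert issubclass(cls, BaseModule)  # works because cls is a class, not an instance
instance = cls()
```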
|
|
@ -19,7 +19,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"root_folder_id": {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
|
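The reworded help above ties the 'static' naming strategy to the hash_enricher's configuration; a sketch of the corresponding settings (the `algorithm` option name for hash_enricher is assumed, not shown in this diff):

```yaml
local_storage:
  save_to: ./local_archive
  filename_generator: static   # file names become the content hash

hash_enricher:
  algorithm: SHA-256           # assumed option name; SHA-256 is the documented fallback
```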
|
|
@ -0,0 +1,100 @@
|
|||
{
|
||||
"name": "OpenTimestamps Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"opentimestamps",
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
"calendar_urls": {
|
||||
"default": [
|
||||
"https://alice.btc.calendar.opentimestamps.org",
|
||||
"https://bob.btc.calendar.opentimestamps.org",
|
||||
"https://finney.calendar.eternitywall.com",
|
||||
# "https://ots.btc.catallaxy.com/", # ipv4 only
|
||||
],
|
||||
"help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:\
|
||||
https://opentimestamps.org/#calendars",
|
||||
"type": "list",
|
||||
},
|
||||
"calendar_whitelist": {
|
||||
"default": [],
|
||||
"help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']",
|
||||
"type": "list",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time.
|
||||
|
||||
Uses OpenTimestamps – a service that timestamps data using the Bitcoin blockchain, providing a decentralized
|
||||
and secure way to prove that data existed at a certain point in time. A SHA256 hash of the file to be timestamped is used as the token
|
||||
and sent to each of the 'timestamp calendars' for inclusion in the blockchain. The proof is then saved alongside the original file in a file with
|
||||
the '.ots' extension.
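
For readers curious about what happens under the hood, here is a minimal sketch (not the enricher itself) of stamping a single file with the `opentimestamps` library; the file path is illustrative and the calendar URL is one of the defaults listed above:
```{code} python
from opentimestamps.calendar import RemoteCalendar
from opentimestamps.core import serialize
from opentimestamps.core.op import OpSHA256
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile

path = "my_file.pdf"  # any archived file (illustrative name)

# Hash the file with SHA256 and wrap the digest in a timestamp
with open(path, "rb") as f:
    digest = OpSHA256().hash_fd(f)
timestamp = Timestamp(digest)

# Submit the digest to a calendar server and merge its (pending) attestation
calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
timestamp.merge(calendar.submit(digest))

# Serialize the detached proof next to the original file as <name>.ots
detached = DetachedTimestampFile(OpSHA256(), timestamp)
ctx = serialize.BytesSerializationContext()
detached.serialize(ctx)
with open(path + ".ots", "wb") as f:
    f.write(ctx.getbytes())
```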
|
||||
|
||||
### Features
|
||||
- Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain
|
||||
- Verifies timestamp proofs have been submitted to the blockchain (note: does not confirm they have been *added*)
|
||||
- Can use multiple calendar servers to ensure reliability and redundancy
|
||||
- Stores timestamp proofs alongside original files for future verification
|
||||
|
||||
### Timestamp status
|
||||
An opentimestamp, when submitted to a timestamp server, will have a 'pending' status (Pending Attestation) as it waits to be added
|
||||
to the blockchain. Once it has been added to the blockchain, it will have a 'confirmed' status (Bitcoin Block Timestamp).
|
||||
This process typically takes several hours, depending on the calendar server and the current state of the Bitcoin network. As such,
|
||||
the status of all timestamps added will be 'pending' until they are subsequently confirmed (see 'Upgrading Timestamps' below).
|
||||
|
||||
There are two possible statuses for a timestamp:
|
||||
- `Pending`: The timestamp has been submitted to the calendar server but has not yet been confirmed in the Bitcoin blockchain.
|
||||
- `Confirmed`: The timestamp has been confirmed in the Bitcoin blockchain.
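
For reference, the verification details recorded on the generated `.ots` media roughly take the shape below; the keys come from the enricher's verification step, while the values here are purely illustrative:
```{code} python
{
    "attestation_count": 1,
    "attestations": [
        {
            "status": "pending",  # or "confirmed", which carries "block_height" instead of "uri"
            "uri": "https://alice.btc.calendar.opentimestamps.org",
            "last_check": "2025-01-01T00:00:00+00:00",  # illustrative timestamp
        }
    ],
    "verified": False,  # True once at least one attestation is confirmed
}
```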
|
||||
|
||||
### Upgrading Timestamps
|
||||
To upgrade a timestamp from 'pending' to 'confirmed', you can use the `ots upgrade` command from the opentimestamps-client package
|
||||
(install it with `pip install opentimestamps-client`).
|
||||
Example: `ots upgrade my_file.ots`
|
||||
|
||||
Here is a useful script that could be used to upgrade all timestamps in a directory, which could be run on a cron job:
|
||||
```{code} bash
|
||||
find . -name "*.ots" -type f | while read -r file; do
|
||||
echo "Upgrading OTS $file"
|
||||
ots upgrade "$file"
|
||||
done
|
||||
# The result might look like:
|
||||
# Upgrading OTS ./my_file.ots
|
||||
# Got 1 attestation(s) from https://alice.btc.calendar.opentimestamps.org
|
||||
# Success! Timestamp complete
|
||||
```
|
||||
|
||||
```{note} Note: this will only upgrade the .ots files, and will not change the status text in any output .html files or any databases where the
|
||||
metadata is stored (e.g. Google Sheets, CSV database, API database etc.).
|
||||
```
|
||||
|
||||
### Verifying Timestamps
|
||||
The easiest way to verify a timestamp (ots) file is to install the opentimestamps-client command line tool and use the `ots verify` command.
|
||||
Example: `ots verify my_file.ots`
|
||||
|
||||
```{code} bash
|
||||
$ ots verify my_file.ots
|
||||
Calendar https://bob.btc.calendar.opentimestamps.org: Pending confirmation in Bitcoin blockchain
|
||||
Calendar https://finney.calendar.eternitywall.com: Pending confirmation in Bitcoin blockchain
|
||||
Calendar https://alice.btc.calendar.opentimestamps.org: Timestamped by transaction 12345; waiting for 6 confirmations
|
||||
```
|
||||
|
||||
Note: if you're using a storage with `filename_generator` set to `static` or `random`, the files will be renamed when they are saved to the
|
||||
final location, meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`.
|
||||
|
||||
### Choosing Calendar Servers
|
||||
|
||||
By default, the OpenTimestamps enricher uses a set of public calendar servers provided by the 'opentimestamps' project.
|
||||
You can customize the list of calendar servers by providing URLs in the `calendar_urls` configuration option.
|
||||
|
||||
### Calendar Whitelist
|
||||
|
||||
By default, the opentimestamps package only allows its own calendars to be used (see `DEFAULT_CALENDAR_WHITELIST` in `opentimestamps.calendar`).
|
||||
If you want to use your own calendars, you can override this setting with the `calendar_whitelist` configuration option.
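
Illustrative only: the effective filtering, paraphrased from the enricher code, looks roughly like this — a configured calendar URL is used only if it appears in the active whitelist (the example URL is a placeholder):
```{code} python
from opentimestamps.calendar import DEFAULT_CALENDAR_WHITELIST, RemoteCalendar

calendar_urls = ["https://mycalendar.com"]       # your configured calendar_urls
calendar_whitelist = ["https://mycalendar.com"]  # your configured calendar_whitelist

# Fall back to the package's default whitelist when no override is given
whitelist = set(calendar_whitelist) if calendar_whitelist else DEFAULT_CALENDAR_WHITELIST
calendars = [RemoteCalendar(url) for url in calendar_urls if url in whitelist]
```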
|
||||
|
||||
|
||||
""",
|
||||
}
|
|
@ -0,0 +1,172 @@
|
|||
import os
|
||||
|
||||
from loguru import logger
|
||||
import opentimestamps
|
||||
from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST
|
||||
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
||||
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
|
||||
from opentimestamps.core.op import OpSHA256
|
||||
from opentimestamps.core import serialize
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class OpentimestampsEnricher(Enricher):
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"OpenTimestamps timestamping files for {url=}")
|
||||
|
||||
# Get the media files to timestamp
|
||||
media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")]
|
||||
if not media_files:
|
||||
logger.warning(f"No files found to timestamp in {url=}")
|
||||
return
|
||||
|
||||
timestamp_files = []
|
||||
for media in media_files:
|
||||
try:
|
||||
# Get the file path from the media
|
||||
file_path = media.filename
|
||||
if not os.path.exists(file_path):
|
||||
logger.warning(f"File not found: {file_path}")
|
||||
continue
|
||||
|
||||
# Create timestamp for the file - hash is SHA256
|
||||
# Note: hash is hard-coded to SHA256 and does not use hash_enricher to set it.
|
||||
# SHA256 is the recommended hash, ref: https://github.com/bellingcat/auto-archiver/pull/247#discussion_r1992433181
|
||||
logger.debug(f"Creating timestamp for {file_path}")
|
||||
file_hash = None
|
||||
with open(file_path, "rb") as f:
|
||||
file_hash = OpSHA256().hash_fd(f)
|
||||
|
||||
if not file_hash:
|
||||
logger.warning(f"Failed to hash file for timestamping, skipping: {file_path}")
|
||||
continue
|
||||
|
||||
# Create a timestamp with the file hash
|
||||
timestamp = Timestamp(file_hash)
|
||||
|
||||
# Create a detached timestamp file with the hash operation and timestamp
|
||||
detached_timestamp = DetachedTimestampFile(OpSHA256(), timestamp)
|
||||
|
||||
# Submit to calendar servers
|
||||
submitted_to_calendar = False
|
||||
|
||||
logger.debug(f"Submitting timestamp to calendar servers for {file_path}")
|
||||
calendars = []
|
||||
whitelist = DEFAULT_CALENDAR_WHITELIST
|
||||
|
||||
if self.calendar_whitelist:
|
||||
whitelist = set(self.calendar_whitelist)
|
||||
|
||||
# Create calendar instances
|
||||
calendar_urls = []
|
||||
for url in self.calendar_urls:
|
||||
if url in whitelist:
|
||||
calendars.append(RemoteCalendar(url))
|
||||
calendar_urls.append(url)
|
||||
|
||||
# Submit the hash to each calendar
|
||||
for calendar in calendars:
|
||||
try:
|
||||
calendar_timestamp = calendar.submit(file_hash)
|
||||
timestamp.merge(calendar_timestamp)
|
||||
logger.debug(f"Successfully submitted to calendar: {calendar.url}")
|
||||
submitted_to_calendar = True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to submit to calendar {calendar.url}: {e}")
|
||||
|
||||
# If every calendar submission failed and there are no attestations, skip timestamping this file
|
||||
if not submitted_to_calendar and not timestamp.attestations:
|
||||
logger.error(
|
||||
f"Failed to submit to any calendar for {file_path}. **This file will not be timestamped.**"
|
||||
)
|
||||
media.set("opentimestamps", False)
|
||||
continue
|
||||
|
||||
# Save the timestamp proof to a file
|
||||
timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots")
|
||||
try:
|
||||
with open(timestamp_path, "wb") as f:
|
||||
# Create a serialization context and write to the file
|
||||
ctx = serialize.BytesSerializationContext()
|
||||
detached_timestamp.serialize(ctx)
|
||||
f.write(ctx.getbytes())
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to serialize timestamp file: {e}")
|
||||
continue
|
||||
|
||||
# Create media for the timestamp file
|
||||
timestamp_media = Media(filename=timestamp_path)
|
||||
# explicitly set the mimetype, normally .ots files are 'application/vnd.oasis.opendocument.spreadsheet-template'
|
||||
timestamp_media.mimetype = "application/vnd.opentimestamps"
|
||||
timestamp_media.set("opentimestamps_version", opentimestamps.__version__)
|
||||
|
||||
verification_info = self.verify_timestamp(detached_timestamp)
|
||||
for key, value in verification_info.items():
|
||||
timestamp_media.set(key, value)
|
||||
|
||||
media.set("opentimestamp_files", [timestamp_media])
|
||||
timestamp_files.append(timestamp_media.filename)
|
||||
# Update the original media to indicate it's been timestamped
|
||||
media.set("opentimestamps", True)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error while timestamping {media.filename}: {e}")
|
||||
|
||||
# Add timestamp files to the metadata
|
||||
if timestamp_files:
|
||||
to_enrich.set("opentimestamped", True)
|
||||
to_enrich.set("opentimestamps_count", len(timestamp_files))
|
||||
logger.success(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}")
|
||||
else:
|
||||
to_enrich.set("opentimestamped", False)
|
||||
logger.warning(f"No successful timestamps created for {url=}")
|
||||
|
||||
def verify_timestamp(self, detached_timestamp):
|
||||
"""
|
||||
Verify a timestamp and extract verification information.
|
||||
|
||||
Args:
|
||||
detached_timestamp: The detached timestamp to verify.
|
||||
|
||||
Returns:
|
||||
dict: Information about the verification result.
|
||||
"""
|
||||
result = {}
|
||||
|
||||
# Check if we have attestations
|
||||
attestations = list(detached_timestamp.timestamp.all_attestations())
|
||||
result["attestation_count"] = len(attestations)
|
||||
|
||||
if attestations:
|
||||
attestation_info = []
|
||||
for msg, attestation in attestations:
|
||||
info = {}
|
||||
|
||||
# Process different types of attestations
|
||||
if isinstance(attestation, PendingAttestation):
|
||||
info["status"] = "pending"
|
||||
info["uri"] = attestation.uri
|
||||
|
||||
elif isinstance(attestation, BitcoinBlockHeaderAttestation):
|
||||
info["status"] = "confirmed"
|
||||
info["block_height"] = attestation.height
|
||||
|
||||
info["last_check"] = get_current_timestamp()
|
||||
|
||||
attestation_info.append(info)
|
||||
|
||||
result["attestations"] = attestation_info
|
||||
|
||||
# Verified if at least one attestation is confirmed
|
||||
if any("confirmed" in a.get("status") for a in attestation_info):
|
||||
result["verified"] = True
|
||||
else:
|
||||
result["verified"] = False
|
||||
else:
|
||||
result["verified"] = False
|
||||
result["last_updated"] = get_current_timestamp()
|
||||
|
||||
return result
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
"filename_generator": {
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"bucket": {"default": None, "help": "S3 bucket name"},
|
||||
|
|
|
@ -0,0 +1,276 @@
|
|||
import pytest
|
||||
import hashlib
|
||||
|
||||
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
||||
from opentimestamps.calendar import RemoteCalendar
|
||||
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
# TODO: Remove once timestamping overhaul is merged
|
||||
@pytest.fixture
|
||||
def sample_media(tmp_path) -> Media:
|
||||
"""Fixture creating a Media object with temporary source file"""
|
||||
src_file = tmp_path / "source.txt"
|
||||
src_file.write_text("test content")
|
||||
return Media(_key="subdir/test.txt", filename=str(src_file))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_file_path(tmp_path):
|
||||
tmp_file = tmp_path / "test.txt"
|
||||
tmp_file.write_text("This is a test file content for OpenTimestamps")
|
||||
return str(tmp_file)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def detached_timestamp_file():
|
||||
"""Create a simple detached timestamp file for testing"""
|
||||
file_hash = hashlib.sha256(b"Test content").digest()
|
||||
from opentimestamps.core.op import OpSHA256
|
||||
|
||||
file_hash_op = OpSHA256()
|
||||
timestamp = Timestamp(file_hash)
|
||||
|
||||
# Add a pending attestation
|
||||
pending = PendingAttestation("https://example.calendar.com")
|
||||
timestamp.attestations.add(pending)
|
||||
|
||||
# Add a bitcoin attestation
|
||||
bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height
|
||||
timestamp.attestations.add(bitcoin)
|
||||
|
||||
return DetachedTimestampFile(file_hash_op, timestamp)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def verified_timestamp_file():
|
||||
"""Create a timestamp file with a Bitcoin attestation"""
|
||||
file_hash = hashlib.sha256(b"Verified content").digest()
|
||||
from opentimestamps.core.op import OpSHA256
|
||||
|
||||
file_hash_op = OpSHA256()
|
||||
timestamp = Timestamp(file_hash)
|
||||
|
||||
# Add only a Bitcoin attestation
|
||||
bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height
|
||||
timestamp.attestations.add(bitcoin)
|
||||
|
||||
return DetachedTimestampFile(file_hash_op, timestamp)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pending_timestamp_file():
|
||||
"""Create a timestamp file with only pending attestations"""
|
||||
file_hash = hashlib.sha256(b"Pending content").digest()
|
||||
from opentimestamps.core.op import OpSHA256
|
||||
|
||||
file_hash_op = OpSHA256()
|
||||
timestamp = Timestamp(file_hash)
|
||||
|
||||
# Add only pending attestations
|
||||
pending1 = PendingAttestation("https://example1.calendar.com")
|
||||
pending2 = PendingAttestation("https://example2.calendar.com")
|
||||
timestamp.attestations.add(pending1)
|
||||
timestamp.attestations.add(pending2)
|
||||
|
||||
return DetachedTimestampFile(file_hash_op, timestamp)
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_tsr(setup_module, mocker):
|
||||
"""Test submitting a hash to calendar servers"""
|
||||
# Mock the RemoteCalendar submit method
|
||||
mock_submit = mocker.patch.object(RemoteCalendar, "submit")
|
||||
test_timestamp = Timestamp(hashlib.sha256(b"test").digest())
|
||||
mock_submit.return_value = test_timestamp
|
||||
|
||||
# Create a calendar
|
||||
calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
|
||||
|
||||
# Test submission
|
||||
file_hash = hashlib.sha256(b"Test file content").digest()
|
||||
result = calendar.submit(file_hash)
|
||||
|
||||
assert mock_submit.called
|
||||
assert isinstance(result, Timestamp)
|
||||
assert result == test_timestamp
|
||||
|
||||
|
||||
def test_verify_timestamp(setup_module, detached_timestamp_file):
|
||||
"""Test the verification of timestamp attestations"""
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
# Test verification
|
||||
verification_info = ots.verify_timestamp(detached_timestamp_file)
|
||||
|
||||
# Check verification results
|
||||
assert verification_info["attestation_count"] == 2
|
||||
assert verification_info["verified"] is True
|
||||
assert len(verification_info["attestations"]) == 2
|
||||
|
||||
# Check attestation types
|
||||
assertion_types = [a["status"] for a in verification_info["attestations"]]
|
||||
assert "pending" in assertion_types
|
||||
assert "confirmed" in assertion_types
|
||||
|
||||
# Check Bitcoin attestation details
|
||||
bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed")
|
||||
assert bitcoin_attestation["block_height"] == 783000
|
||||
|
||||
|
||||
def test_verify_pending_only(setup_module, pending_timestamp_file):
|
||||
"""Test verification of timestamps with only pending attestations"""
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
verification_info = ots.verify_timestamp(pending_timestamp_file)
|
||||
|
||||
assert verification_info["attestation_count"] == 2
|
||||
assert verification_info["verified"] is False
|
||||
|
||||
# All attestations should be of type "pending"
|
||||
assert all(a["status"] == "pending" for a in verification_info["attestations"])
|
||||
|
||||
# Check URIs of pending attestations
|
||||
uris = [a["uri"] for a in verification_info["attestations"]]
|
||||
assert "https://example1.calendar.com" in uris
|
||||
assert "https://example2.calendar.com" in uris
|
||||
|
||||
|
||||
def test_verify_bitcoin_completed(setup_module, verified_timestamp_file):
|
||||
"""Test verification of timestamps with completed Bitcoin attestations"""
|
||||
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
verification_info = ots.verify_timestamp(verified_timestamp_file)
|
||||
|
||||
assert verification_info["attestation_count"] == 1
|
||||
assert verification_info["verified"] is True
|
||||
assert "pending" not in verification_info
|
||||
|
||||
# Check that the attestation is a Bitcoin attestation
|
||||
attestation = verification_info["attestations"][0]
|
||||
assert attestation["status"] == "confirmed"
|
||||
assert attestation["block_height"] == 783000
|
||||
|
||||
|
||||
def test_full_enriching(setup_module, sample_file_path, sample_media, mocker):
|
||||
"""Test the complete enrichment process"""
|
||||
|
||||
# Mock the calendar submission to avoid network requests
|
||||
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
|
||||
|
||||
# Create a function that returns a new timestamp for each call
|
||||
def side_effect(digest):
|
||||
test_timestamp = Timestamp(digest)
|
||||
# Add a bitcoin attestation to the test timestamp
|
||||
bitcoin = BitcoinBlockHeaderAttestation(783000)
|
||||
test_timestamp.attestations.add(bitcoin)
|
||||
return test_timestamp
|
||||
|
||||
mock_calendar.side_effect = side_effect
|
||||
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
# Create test metadata with sample file
|
||||
metadata = Metadata().set_url("https://example.com")
|
||||
sample_media.filename = sample_file_path
|
||||
metadata.add_media(sample_media)
|
||||
|
||||
# Run enrichment
|
||||
ots.enrich(metadata)
|
||||
|
||||
# Verify results
|
||||
assert metadata.get("opentimestamped") is True
|
||||
assert metadata.get("opentimestamps_count") == 1
|
||||
|
||||
# Check that we have one parent media item: the original
|
||||
assert len(metadata.media) == 1
|
||||
|
||||
# Check that the original media was updated
|
||||
assert metadata.media[0].get("opentimestamps") is True
|
||||
|
||||
# Check the timestamp file media is a child of the original
|
||||
assert len(metadata.media[0].get("opentimestamp_files")) == 1
|
||||
|
||||
timestamp_media = metadata.media[0].get("opentimestamp_files")[0]
|
||||
|
||||
assert timestamp_media.get("opentimestamps_version") is not None
|
||||
|
||||
# Check verification results on the timestamp media
|
||||
assert timestamp_media.get("verified") is True
|
||||
assert timestamp_media.get("attestation_count") == 1
|
||||
|
||||
|
||||
def test_full_enriching_one_calendar_error(
|
||||
setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file
|
||||
):
|
||||
"""Test enrichment when one calendar server returns an error"""
|
||||
# Mock the calendar submission to raise an exception
|
||||
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
|
||||
|
||||
test_timestamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935"))
|
||||
# Add a bitcoin attestation to the test timestamp
|
||||
bitcoin = BitcoinBlockHeaderAttestation(783000)
|
||||
test_timestamp.attestations.add(bitcoin)
|
||||
|
||||
mock_calendar.side_effect = [test_timestamp, Exception("Calendar server error")]
|
||||
|
||||
ots = setup_module(
|
||||
"opentimestamps_enricher",
|
||||
{
|
||||
"calendar_urls": [
|
||||
"https://alice.btc.calendar.opentimestamps.org",
|
||||
"https://bob.btc.calendar.opentimestamps.org",
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
# Create test metadata with sample file
|
||||
metadata = Metadata().set_url("https://example.com")
|
||||
sample_media.filename = sample_file_path
|
||||
metadata.add_media(sample_media)
|
||||
|
||||
# Run enrichment (should complete despite calendar errors)
|
||||
ots.enrich(metadata)
|
||||
|
||||
# Verify results
|
||||
assert metadata.get("opentimestamped") is True
|
||||
assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob
|
||||
|
||||
|
||||
def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker):
|
||||
"""Test enrichment when calendar servers return errors"""
|
||||
# Mock the calendar submission to raise an exception
|
||||
mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
|
||||
mock_calendar.side_effect = Exception("Calendar server error")
|
||||
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
# Create test metadata with sample file
|
||||
metadata = Metadata().set_url("https://example.com")
|
||||
sample_media.filename = sample_file_path
|
||||
metadata.add_media(sample_media)
|
||||
|
||||
# Run enrichment (should complete despite calendar errors)
|
||||
ots.enrich(metadata)
|
||||
|
||||
# Verify results
|
||||
assert metadata.get("opentimestamped") is False
|
||||
assert metadata.get("opentimestamps_count") is None
|
||||
|
||||
|
||||
def test_no_files_to_stamp(setup_module):
|
||||
"""Test enrichment with no files to timestamp"""
|
||||
ots = setup_module("opentimestamps_enricher")
|
||||
|
||||
# Create empty metadata
|
||||
metadata = Metadata().set_url("https://example.com")
|
||||
|
||||
# Run enrichment
|
||||
ots.enrich(metadata)
|
||||
|
||||
# Verify no timestamping occurred
|
||||
assert metadata.get("opentimestamped") is None
|
||||
assert len(metadata.media) == 0
|