From 2d879350420b106db68087ac2f21ed50d8ee8dbf Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Feb 2025 14:54:46 +0000 Subject: [PATCH 01/12] Start on opentimestamps enricher --- .../opentimestamps_enricher/__manifest__.py | 50 +++++++++++++++++++ .../opentimestamps_enricher.py | 0 2 files changed, 50 insertions(+) create mode 100644 src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py create mode 100644 src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py new file mode 100644 index 0000000..cfed1fb --- /dev/null +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -0,0 +1,50 @@ +{ + "name": "Opentimestamps Enricher", + "type": ["enricher"], + "requires_setup": False, + "dependencies": { + "python": [ + "loguru", + "opentimestamps", + ], + }, + "configs": { + "tsa_urls": { + "default": [ + # [Adobe Approved Trust List] and [Windows Cert Store] + "http://timestamp.digicert.com", + "http://timestamp.identrust.com", + # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping + # "https://timestamp.sectigo.com", # wait 15 seconds between each request. + + # [Adobe: European Union Trusted Lists]. + # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. 
+ + # [Windows Cert Store] + "http://timestamp.globalsign.com/tsa/r6advanced1", + # [Adobe: European Union Trusted Lists] and [Windows Cert Store] + # "http://ts.quovadisglobal.com/eu", # not valid for timestamping + # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain + # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain + # "http://tsa.sep.bg", # self-signed certificate in certificate chain + # "http://tsa.izenpe.com", #unable to get local issuer certificate + # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate + "http://tss.accv.es:8318/tsa", + ], + "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", + } + }, + "description": """ + Generates RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files. + + ### Features + - Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes. + - Aggregates file hashes into a text file and timestamps the concatenated data. + - Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy. + - Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store. + + ### Notes + - Should be run after the `hash_enricher` to ensure file hashes are available. + - Requires internet access to interact with the configured TSAs. 
+ """ +} diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py new file mode 100644 index 0000000..e69de29 From 37eac64442c581fabf599710efd47f4d35a483c9 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 17:10:44 +0000 Subject: [PATCH 02/12] Remove desc --- .../opentimestamps_enricher/__manifest__.py | 41 +------------------ 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index cfed1fb..645a04d 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -1,50 +1,13 @@ { "name": "Opentimestamps Enricher", "type": ["enricher"], - "requires_setup": False, + "requires_setup": True, "dependencies": { "python": [ "loguru", - "opentimestamps", + "opentimestamps-client", ], }, - "configs": { - "tsa_urls": { - "default": [ - # [Adobe Approved Trust List] and [Windows Cert Store] - "http://timestamp.digicert.com", - "http://timestamp.identrust.com", - # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping - # "https://timestamp.sectigo.com", # wait 15 seconds between each request. - - # [Adobe: European Union Trusted Lists]. - # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. 
- - # [Windows Cert Store] - "http://timestamp.globalsign.com/tsa/r6advanced1", - # [Adobe: European Union Trusted Lists] and [Windows Cert Store] - # "http://ts.quovadisglobal.com/eu", # not valid for timestamping - # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain - # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain - # "http://tsa.sep.bg", # self-signed certificate in certificate chain - # "http://tsa.izenpe.com", #unable to get local issuer certificate - # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate - "http://tss.accv.es:8318/tsa", - ], - "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - } - }, "description": """ - Generates RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files. - - ### Features - - Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes. - - Aggregates file hashes into a text file and timestamps the concatenated data. - - Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy. - - Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store. - - ### Notes - - Should be run after the `hash_enricher` to ensure file hashes are available. - - Requires internet access to interact with the configured TSAs. 
""" } From b70ed97ffd79bfdf87fbb286968c97faf21a7146 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 17:28:28 +0000 Subject: [PATCH 03/12] Create opentimestamps module --- .../opentimestamps_enricher/__manifest__.py | 45 ++++- .../opentimestamps_enricher.py | 174 ++++++++++++++++++ 2 files changed, 216 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index 645a04d..849bb3d 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -1,13 +1,52 @@ { - "name": "Opentimestamps Enricher", + "name": "OpenTimestamps Enricher", "type": ["enricher"], "requires_setup": True, "dependencies": { "python": [ "loguru", - "opentimestamps-client", + "opentimestamps", + "slugify", ], }, + "configs": { + "use_calendars": { + "default": True, + "help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.", + "type": "bool" + }, + "calendar_urls": { + "default": [ + "https://alice.btc.calendar.opentimestamps.org", + "https://bob.btc.calendar.opentimestamps.org", + "https://finney.calendar.eternitywall.com" + ], + "help": "List of OpenTimestamps calendar servers to use for timestamping.", + "type": "list" + }, + "calendar_whitelist": { + "default": [], + "help": "Optional whitelist of calendar servers. If empty, all calendar servers are allowed.", + "type": "list" + }, + "verify_timestamps": { + "default": True, + "help": "Whether to verify timestamps after creating them.", + "type": "bool" + } + }, "description": """ + Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time. 
+ + ### Features + - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain + - Verifies existing timestamp proofs to confirm the time a file existed + - Uses multiple calendar servers to ensure reliability and redundancy + - Stores timestamp proofs alongside original files for future verification + + ### Notes + - Can work offline to create timestamp proofs that can be upgraded later + - Verification checks if timestamps have been confirmed in the Bitcoin blockchain + - Should run after files have been archived and hashed """ -} +} \ No newline at end of file diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index e69de29..01e8964 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -0,0 +1,174 @@ +import os +import hashlib +from importlib.metadata import version + +from slugify import slugify +from loguru import logger +import opentimestamps +from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST +from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile +from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation +from auto_archiver.core import Enricher +from auto_archiver.core import Metadata, Media +from auto_archiver.version import __version__ + + +class OpentimestampsEnricher(Enricher): + """ + Uses OpenTimestamps to create and verify timestamps for files. OpenTimestamps is a service that + timestamps data using the Bitcoin blockchain, providing a decentralized and secure way to prove + that data existed at a certain point in time. + + The enricher hashes files in the archive and creates timestamp proofs that can later be verified. 
+ These proofs are stored alongside the original files and can be used to verify the timestamp + even if the OpenTimestamps calendar servers are unavailable. + """ + + def setup(self): + # Initialize any resources needed + pass + + def cleanup(self) -> None: + # Clean up any resources used + pass + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"OpenTimestamps timestamping files for {url=}") + + # Get the media files to timestamp + media_files = [m for m in to_enrich.media if m.get("filename") and not m.get("opentimestamps")] + + if not media_files: + logger.warning(f"No files found to timestamp in {url=}") + return + + timestamp_files = [] + for media in media_files: + try: + # Get the file path from the media + file_path = media.get("filename") + if not os.path.exists(file_path): + logger.warning(f"File not found: {file_path}") + continue + + # Create timestamp for the file + logger.debug(f"Creating timestamp for {file_path}") + + # Hash the file + with open(file_path, 'rb') as f: + file_bytes = f.read() + file_hash = hashlib.sha256(file_bytes).digest() + + # Create a timestamp with the file hash + timestamp = Timestamp(file_hash) + + # Create a detached timestamp file with the timestamp + detached_timestamp = DetachedTimestampFile(timestamp) + + # Submit to calendar servers + if self.use_calendars: + logger.debug(f"Submitting timestamp to calendar servers for {file_path}") + calendars = [] + whitelist = DEFAULT_CALENDAR_WHITELIST + + if self.calendar_whitelist: + whitelist = set(self.calendar_whitelist) + + # Create calendar instances + for url in self.calendar_urls: + if url in whitelist: + calendars.append(RemoteCalendar(url)) + + # Submit the hash to each calendar + for calendar in calendars: + try: + calendar_timestamp = calendar.submit(file_hash) + timestamp.merge(calendar_timestamp) + logger.debug(f"Successfully submitted to calendar: {calendar.url}") + except Exception as e: + logger.warning(f"Failed to submit 
to calendar {calendar.url}: {e}") + else: + logger.info("Skipping calendar submission as per configuration") + + # Save the timestamp proof to a file + timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots") + with open(timestamp_path, 'wb') as f: + detached_timestamp.serialize(f) + + # Create media for the timestamp file + timestamp_media = Media(filename=timestamp_path) + timestamp_media.set("source_file", os.path.basename(file_path)) + timestamp_media.set("opentimestamps_version", opentimestamps.__version__) + + # Verify the timestamp if needed + if self.verify_timestamps: + verification_info = self.verify_timestamp(detached_timestamp) + for key, value in verification_info.items(): + timestamp_media.set(key, value) + + timestamp_files.append(timestamp_media) + + # Update the original media to indicate it's been timestamped + media.set("opentimestamps", True) + media.set("opentimestamp_file", timestamp_path) + + except Exception as e: + logger.warning(f"Error while timestamping {media.get('filename')}: {e}") + + # Add timestamp files to the metadata + if timestamp_files: + for ts_media in timestamp_files: + to_enrich.add_media(ts_media) + + to_enrich.set("opentimestamped", True) + to_enrich.set("opentimestamps_count", len(timestamp_files)) + logger.success(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}") + else: + logger.warning(f"No successful timestamps created for {url=}") + + def verify_timestamp(self, detached_timestamp): + """ + Verify a timestamp and extract verification information. + + Args: + detached_timestamp: The detached timestamp to verify. + + Returns: + dict: Information about the verification result. 
+ """ + result = {} + + # Check if we have attestations + attestations = list(detached_timestamp.timestamp.all_attestations()) + result["attestation_count"] = len(attestations) + + if attestations: + attestation_info = [] + for msg, attestation in attestations: + info = {} + + # Process different types of attestations + if isinstance(attestation, PendingAttestation): + info["type"] = "pending" + info["uri"] = attestation.uri.decode('utf-8') + + elif isinstance(attestation, BitcoinBlockHeaderAttestation): + info["type"] = "bitcoin" + info["block_height"] = attestation.height + + attestation_info.append(info) + + result["attestations"] = attestation_info + + # For at least one confirmed attestation + if any(a.get("type") == "bitcoin" for a in attestation_info): + result["verified"] = True + else: + result["verified"] = False + result["pending"] = True + else: + result["verified"] = False + result["pending"] = False + + return result \ No newline at end of file From 28041d94d97d0a9fc88272a72cdcf8a623501812 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 17:33:54 +0000 Subject: [PATCH 04/12] Add unit tests for opentimestamps enricher --- .../enrichers/test_opentimestamps_enricher.py | 242 ++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 tests/enrichers/test_opentimestamps_enricher.py diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py new file mode 100644 index 0000000..5681561 --- /dev/null +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -0,0 +1,242 @@ +from pathlib import Path +import pytest +import os +import tempfile +import hashlib + +from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile +from opentimestamps.calendar import RemoteCalendar +from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation + +from auto_archiver.core import Metadata, Media + +@pytest.fixture +def sample_file_path(): + with 
tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(b"This is a test file content for OpenTimestamps") + return tmp.name + +@pytest.fixture +def detached_timestamp_file(): + """Create a simple detached timestamp file for testing""" + file_hash = hashlib.sha256(b"Test content").digest() + timestamp = Timestamp(file_hash) + + # Add a pending attestation + pending = PendingAttestation(b"https://example.calendar.com") + timestamp.attestations.add(pending) + + # Add a bitcoin attestation + bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height + timestamp.attestations.add(bitcoin) + + return DetachedTimestampFile(timestamp) + +@pytest.fixture +def verified_timestamp_file(): + """Create a timestamp file with a Bitcoin attestation""" + file_hash = hashlib.sha256(b"Verified content").digest() + timestamp = Timestamp(file_hash) + + # Add only a Bitcoin attestation + bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height + timestamp.attestations.add(bitcoin) + + return DetachedTimestampFile(timestamp) + +@pytest.fixture +def pending_timestamp_file(): + """Create a timestamp file with only pending attestations""" + file_hash = hashlib.sha256(b"Pending content").digest() + timestamp = Timestamp(file_hash) + + # Add only pending attestations + pending1 = PendingAttestation(b"https://example1.calendar.com") + pending2 = PendingAttestation(b"https://example2.calendar.com") + timestamp.attestations.add(pending1) + timestamp.attestations.add(pending2) + + return DetachedTimestampFile(timestamp) + +@pytest.mark.download +def test_download_tsr(setup_module, mocker): + """Test submitting a hash to calendar servers""" + # Mock the RemoteCalendar submit method + mock_submit = mocker.patch.object(RemoteCalendar, 'submit') + test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) + mock_submit.return_value = test_timestamp + + # Setup enricher + ots = setup_module("opentimestamps_enricher") + + # Create a calendar + calendar = 
RemoteCalendar("https://alice.btc.calendar.opentimestamps.org") + + # Test submission + file_hash = hashlib.sha256(b"Test file content").digest() + result = calendar.submit(file_hash) + + assert mock_submit.called + assert isinstance(result, Timestamp) + assert result == test_timestamp + +def test_verify_timestamp(setup_module, detached_timestamp_file): + """Test the verification of timestamp attestations""" + ots = setup_module("opentimestamps_enricher") + + # Test verification + verification_info = ots.verify_timestamp(detached_timestamp_file) + + # Check verification results + assert verification_info["attestation_count"] == 2 + assert verification_info["verified"] == True + assert len(verification_info["attestations"]) == 2 + + # Check attestation types + assertion_types = [a["type"] for a in verification_info["attestations"]] + assert "pending" in assertion_types + assert "bitcoin" in assertion_types + + # Check Bitcoin attestation details + bitcoin_attestation = next(a for a in verification_info["attestations"] if a["type"] == "bitcoin") + assert bitcoin_attestation["block_height"] == 783000 + +def test_verify_pending_only(setup_module, pending_timestamp_file): + """Test verification of timestamps with only pending attestations""" + ots = setup_module("opentimestamps_enricher") + + verification_info = ots.verify_timestamp(pending_timestamp_file) + + assert verification_info["attestation_count"] == 2 + assert verification_info["verified"] == False + assert verification_info["pending"] == True + + # All attestations should be of type "pending" + assert all(a["type"] == "pending" for a in verification_info["attestations"]) + + # Check URIs of pending attestations + uris = [a["uri"] for a in verification_info["attestations"]] + assert "https://example1.calendar.com" in uris + assert "https://example2.calendar.com" in uris + +def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): + """Test verification of timestamps with completed Bitcoin 
attestations""" + ots = setup_module("opentimestamps_enricher") + + verification_info = ots.verify_timestamp(verified_timestamp_file) + + assert verification_info["attestation_count"] == 1 + assert verification_info["verified"] == True + assert "pending" not in verification_info + + # Check that the attestation is a Bitcoin attestation + attestation = verification_info["attestations"][0] + assert attestation["type"] == "bitcoin" + assert attestation["block_height"] == 783000 + +def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): + """Test the complete enrichment process""" + # Mock the calendar submission to avoid network requests + mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') + test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) + # Add a bitcoin attestation to the test timestamp + bitcoin = BitcoinBlockHeaderAttestation(783000) + test_timestamp.attestations.add(bitcoin) + mock_calendar.return_value = test_timestamp + + # Setup enricher + ots = setup_module("opentimestamps_enricher") + + # Create test metadata with sample file + metadata = Metadata().set_url("https://example.com") + sample_media.set("filename", sample_file_path) + metadata.add_media(sample_media) + + # Run enrichment + ots.enrich(metadata) + + # Verify results + assert metadata.get("opentimestamped") == True + assert metadata.get("opentimestamps_count") == 1 + + # Check that we have two media items: the original and the timestamp + assert len(metadata.media) == 2 + + # Check that the original media was updated + assert metadata.media[0].get("opentimestamps") == True + assert metadata.media[0].get("opentimestamp_file") is not None + + # Check the timestamp file media + timestamp_media = metadata.media[1] + assert timestamp_media.get("source_file") == os.path.basename(sample_file_path) + assert timestamp_media.get("opentimestamps_version") is not None + + # Check verification results on the timestamp media + assert timestamp_media.get("verified") 
== True + assert timestamp_media.get("attestation_count") == 1 + +def test_full_enriching_no_calendars(setup_module, sample_file_path, sample_media, mocker): + """Test enrichment process with calendars disabled""" + # Setup enricher with calendars disabled + ots = setup_module("opentimestamps_enricher", {"use_calendars": False}) + + # Create test metadata with sample file + metadata = Metadata().set_url("https://example.com") + sample_media.set("filename", sample_file_path) + metadata.add_media(sample_media) + + # Run enrichment + ots.enrich(metadata) + + # Verify results + assert metadata.get("opentimestamped") == True + assert metadata.get("opentimestamps_count") == 1 + + # Check the timestamp file media + timestamp_media = metadata.media[1] + assert timestamp_media.get("source_file") == os.path.basename(sample_file_path) + + # Verify status should be false since we didn't use calendars + assert timestamp_media.get("verified") == False + assert timestamp_media.get("attestation_count") == 0 + +def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker): + """Test enrichment when calendar servers return errors""" + # Mock the calendar submission to raise an exception + mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') + mock_calendar.side_effect = Exception("Calendar server error") + + # Setup enricher + ots = setup_module("opentimestamps_enricher") + + # Create test metadata with sample file + metadata = Metadata().set_url("https://example.com") + sample_media.set("filename", sample_file_path) + metadata.add_media(sample_media) + + # Run enrichment (should complete despite calendar errors) + ots.enrich(metadata) + + # Verify results + assert metadata.get("opentimestamped") == True + assert metadata.get("opentimestamps_count") == 1 + + # Verify status should be false since calendar submissions failed + timestamp_media = metadata.media[1] + assert timestamp_media.get("verified") == False + assert 
timestamp_media.get("attestation_count") == 0 + +def test_no_files_to_stamp(setup_module): + """Test enrichment with no files to timestamp""" + # Setup enricher + ots = setup_module("opentimestamps_enricher") + + # Create empty metadata + metadata = Metadata().set_url("https://example.com") + + # Run enrichment + ots.enrich(metadata) + + # Verify no timestamping occurred + assert metadata.get("opentimestamped") is None + assert len(metadata.media) == 0 \ No newline at end of file From 1423c103631480371898d57e2c358f85be2238bd Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 12 Mar 2025 10:24:57 +0000 Subject: [PATCH 05/12] Finish off timestamping module --- poetry.lock | 30 +++++++- pyproject.toml | 1 + src/auto_archiver/core/module.py | 4 +- .../opentimestamps_enricher/__manifest__.py | 12 ++- .../opentimestamps_enricher.py | 76 +++++++++++-------- .../timestamping_enricher.py | 2 +- .../enrichers/test_opentimestamps_enricher.py | 68 +++++++++++------ 7 files changed, 130 insertions(+), 63 deletions(-) diff --git a/poetry.lock b/poetry.lock index f59d5c9..17365ac 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1361,6 +1361,22 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "opentimestamps" +version = "0.4.5" +description = "Create and verify OpenTimestamps proofs" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "opentimestamps-0.4.5-py3-none-any.whl", hash = "sha256:a4912b3bd1b612a3ef5fac925b9137889e6c5cb91cc9e76c8202a2bf8abe26b5"}, + {file = "opentimestamps-0.4.5.tar.gz", hash = "sha256:56726ccde97fb67f336a7f237ce36808e5593c3089d68d900b1c83d0ebf9dcfa"}, +] + +[package.dependencies] +pycryptodomex = ">=3.3.1" +python-bitcoinlib = ">=0.9.0,<0.13.0" + [[package]] name = "oscrypto" version = "1.3.0" @@ -1834,6 +1850,18 @@ pytest = ">=6.2.5" [package.extras] dev = ["pre-commit", "pytest-asyncio", "tox"] +[[package]] +name = 
"python-bitcoinlib" +version = "0.12.2" +description = "The Swiss Army Knife of the Bitcoin protocol." +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "python-bitcoinlib-0.12.2.tar.gz", hash = "sha256:c65ab61427c77c38d397bfc431f71d86fd355b453a536496ec3fcb41bd10087d"}, + {file = "python_bitcoinlib-0.12.2-py3-none-any.whl", hash = "sha256:2f29a9f475f21c12169b3a6cc8820f34f11362d7ff1200a5703dce3e4e903a44"}, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3185,4 +3213,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "2d0a953383901fe12e97f6f56a76a9d8008788695425792eedbf739a18585188" +content-hash = "e42f3bc122fe5d98deb6aa224ddf531b6f45a50b7c61213721ff5c8258e424e3" diff --git a/pyproject.toml b/pyproject.toml index 30693d5..b6b2aa8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dependencies = [ "certvalidator (>=0.0.0)", "rich-argparse (>=1.6.0,<2.0.0)", "ruamel-yaml (>=0.18.10,<0.19.0)", + "opentimestamps (>=0.4.5,<0.5.0)", ] [tool.poetry.group.dev.dependencies] diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 2c6617d..96d5420 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin from __future__ import annotations from dataclasses import dataclass -from typing import List, TYPE_CHECKING +from typing import List, TYPE_CHECKING, Type import shutil import ast import copy @@ -57,7 +57,7 @@ class ModuleFactory: HAS_SETUP_PATHS = True - def get_module(self, module_name: str, config: dict) -> BaseModule: + def get_module(self, module_name: str, config: dict) -> Type[BaseModule]: """ Gets and sets up a module using the provided config diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py 
b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index 849bb3d..136c0c2 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -6,7 +6,6 @@ "python": [ "loguru", "opentimestamps", - "slugify", ], }, "configs": { @@ -19,14 +18,16 @@ "default": [ "https://alice.btc.calendar.opentimestamps.org", "https://bob.btc.calendar.opentimestamps.org", - "https://finney.calendar.eternitywall.com" + "https://finney.calendar.eternitywall.com", + # "https://ots.btc.catallaxy.com/", # ipv4 only ], - "help": "List of OpenTimestamps calendar servers to use for timestamping.", + "help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:\ +https://opentimestamps.org/#calendars", "type": "list" }, "calendar_whitelist": { "default": [], - "help": "Optional whitelist of calendar servers. If empty, all calendar servers are allowed.", + "help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']", "type": "list" }, "verify_timestamps": { @@ -38,6 +39,9 @@ "description": """ Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time. + Uses OpenTimestamps – a service that timestamps data using the Bitcoin blockchain, providing a decentralized + and secure way to prove that data existed at a certain point in time. 
+ ### Features - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain - Verifies existing timestamp proofs to confirm the time a file existed diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index 01e8964..2b74ee6 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,36 +1,19 @@ import os import hashlib -from importlib.metadata import version +from typing import TYPE_CHECKING -from slugify import slugify from loguru import logger import opentimestamps from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation +from opentimestamps.core.op import OpSHA256 +from opentimestamps.core import serialize from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.version import __version__ - +from auto_archiver.utils.misc import calculate_file_hash class OpentimestampsEnricher(Enricher): - """ - Uses OpenTimestamps to create and verify timestamps for files. OpenTimestamps is a service that - timestamps data using the Bitcoin blockchain, providing a decentralized and secure way to prove - that data existed at a certain point in time. - - The enricher hashes files in the archive and creates timestamp proofs that can later be verified. - These proofs are stored alongside the original files and can be used to verify the timestamp - even if the OpenTimestamps calendar servers are unavailable. 
- """ - - def setup(self): - # Initialize any resources needed - pass - - def cleanup(self) -> None: - # Clean up any resources used - pass def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() @@ -38,7 +21,7 @@ class OpentimestampsEnricher(Enricher): # Get the media files to timestamp media_files = [m for m in to_enrich.media if m.get("filename") and not m.get("opentimestamps")] - + if not media_files: logger.warning(f"No files found to timestamp in {url=}") return @@ -52,21 +35,26 @@ class OpentimestampsEnricher(Enricher): logger.warning(f"File not found: {file_path}") continue - # Create timestamp for the file + # Create timestamp for the file - hash is SHA256 + # Note: ONLY SHA256 is used/supported here. Opentimestamps supports other hashes, but not SHA3-512 + # see opentimestamps.core.op logger.debug(f"Creating timestamp for {file_path}") - - # Hash the file + file_hash = None with open(file_path, 'rb') as f: - file_bytes = f.read() - file_hash = hashlib.sha256(file_bytes).digest() + file_hash = OpSHA256().hash_fd(f) + + if not file_hash: + logger.warning(f"Failed to hash file for timestamping, skipping: {file_path}") + continue # Create a timestamp with the file hash timestamp = Timestamp(file_hash) - # Create a detached timestamp file with the timestamp - detached_timestamp = DetachedTimestampFile(timestamp) + # Create a detached timestamp file with the hash operation and timestamp + detached_timestamp = DetachedTimestampFile(OpSHA256(), timestamp) # Submit to calendar servers + submitted_to_calendar = False if self.use_calendars: logger.debug(f"Submitting timestamp to calendar servers for {file_path}") calendars = [] @@ -76,9 +64,11 @@ class OpentimestampsEnricher(Enricher): whitelist = set(self.calendar_whitelist) # Create calendar instances + calendar_urls = [] for url in self.calendar_urls: if url in whitelist: calendars.append(RemoteCalendar(url)) + calendar_urls.append(url) # Submit the hash to each calendar for calendar in 
calendars: @@ -86,15 +76,35 @@ class OpentimestampsEnricher(Enricher): calendar_timestamp = calendar.submit(file_hash) timestamp.merge(calendar_timestamp) logger.debug(f"Successfully submitted to calendar: {calendar.url}") + submitted_to_calendar = True except Exception as e: logger.warning(f"Failed to submit to calendar {calendar.url}: {e}") + + # If all calendar submissions failed, add pending attestations + if not submitted_to_calendar and not timestamp.attestations: + logger.info("All calendar submissions failed, creating pending attestations") + for url in calendar_urls: + pending = PendingAttestation(url) + timestamp.attestations.add(pending) else: logger.info("Skipping calendar submission as per configuration") + + # Add dummy pending attestation for testing when calendars are disabled + for url in self.calendar_urls: + pending = PendingAttestation(url) + timestamp.attestations.add(pending) # Save the timestamp proof to a file timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots") - with open(timestamp_path, 'wb') as f: - detached_timestamp.serialize(f) + try: + with open(timestamp_path, 'wb') as f: + # Create a serialization context and write to the file + ctx = serialize.BytesSerializationContext() + detached_timestamp.serialize(ctx) + f.write(ctx.getbytes()) + except Exception as e: + logger.warning(f"Failed to serialize timestamp file: {e}") + continue # Create media for the timestamp file timestamp_media = Media(filename=timestamp_path) @@ -106,6 +116,8 @@ class OpentimestampsEnricher(Enricher): verification_info = self.verify_timestamp(detached_timestamp) for key, value in verification_info.items(): timestamp_media.set(key, value) + else: + logger.warning(f"Not verifying the timestamp for media file {file_path}") timestamp_files.append(timestamp_media) @@ -151,7 +163,7 @@ class OpentimestampsEnricher(Enricher): # Process different types of attestations if isinstance(attestation, PendingAttestation): info["type"] = "pending" - 
info["uri"] = attestation.uri.decode('utf-8') + info["uri"] = attestation.uri elif isinstance(attestation, BitcoinBlockHeaderAttestation): info["type"] = "bitcoin" diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 078c1ba..b83e86c 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -30,7 +30,7 @@ class TimestampingEnricher(Enricher): if not len(hashes): logger.warning(f"No hashes found in {url=}") return - + tmp_dir = self.tmp_dir hashes_fn = os.path.join(tmp_dir, "hashes.txt") diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index 5681561..db171e5 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -10,53 +10,69 @@ from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAtt from auto_archiver.core import Metadata, Media + +# TODO: Remove once timestamping overhaul is merged @pytest.fixture -def sample_file_path(): - with tempfile.NamedTemporaryFile(delete=False) as tmp: - tmp.write(b"This is a test file content for OpenTimestamps") - return tmp.name +def sample_media(tmp_path) -> Media: + """Fixture creating a Media object with temporary source file""" + src_file = tmp_path / "source.txt" + src_file.write_text("test content") + return Media(_key="subdir/test.txt", filename=str(src_file)) + + +@pytest.fixture +def sample_file_path(tmp_path): + tmp_file = tmp_path / "test.txt" + tmp_file.write_text("This is a test file content for OpenTimestamps") + return str(tmp_file) @pytest.fixture def detached_timestamp_file(): """Create a simple detached timestamp file for testing""" file_hash = hashlib.sha256(b"Test content").digest() + from opentimestamps.core.op import OpSHA256 + file_hash_op = 
OpSHA256() timestamp = Timestamp(file_hash) # Add a pending attestation - pending = PendingAttestation(b"https://example.calendar.com") + pending = PendingAttestation("https://example.calendar.com") timestamp.attestations.add(pending) # Add a bitcoin attestation bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height timestamp.attestations.add(bitcoin) - return DetachedTimestampFile(timestamp) + return DetachedTimestampFile(file_hash_op, timestamp) @pytest.fixture def verified_timestamp_file(): """Create a timestamp file with a Bitcoin attestation""" file_hash = hashlib.sha256(b"Verified content").digest() + from opentimestamps.core.op import OpSHA256 + file_hash_op = OpSHA256() timestamp = Timestamp(file_hash) # Add only a Bitcoin attestation bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height timestamp.attestations.add(bitcoin) - return DetachedTimestampFile(timestamp) + return DetachedTimestampFile(file_hash_op, timestamp) @pytest.fixture def pending_timestamp_file(): """Create a timestamp file with only pending attestations""" file_hash = hashlib.sha256(b"Pending content").digest() + from opentimestamps.core.op import OpSHA256 + file_hash_op = OpSHA256() timestamp = Timestamp(file_hash) # Add only pending attestations - pending1 = PendingAttestation(b"https://example1.calendar.com") - pending2 = PendingAttestation(b"https://example2.calendar.com") + pending1 = PendingAttestation("https://example1.calendar.com") + pending2 = PendingAttestation("https://example2.calendar.com") timestamp.attestations.add(pending1) timestamp.attestations.add(pending2) - return DetachedTimestampFile(timestamp) + return DetachedTimestampFile(file_hash_op, timestamp) @pytest.mark.download def test_download_tsr(setup_module, mocker): @@ -66,7 +82,7 @@ def test_download_tsr(setup_module, mocker): test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) mock_submit.return_value = test_timestamp - # Setup enricher + ots = 
setup_module("opentimestamps_enricher") # Create a calendar @@ -121,6 +137,7 @@ def test_verify_pending_only(setup_module, pending_timestamp_file): def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): """Test verification of timestamps with completed Bitcoin attestations""" + ots = setup_module("opentimestamps_enricher") verification_info = ots.verify_timestamp(verified_timestamp_file) @@ -136,15 +153,21 @@ def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): """Test the complete enrichment process""" + # Mock the calendar submission to avoid network requests mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') - test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) - # Add a bitcoin attestation to the test timestamp - bitcoin = BitcoinBlockHeaderAttestation(783000) - test_timestamp.attestations.add(bitcoin) - mock_calendar.return_value = test_timestamp - # Setup enricher + # Create a function that returns a new timestamp for each call + def side_effect(digest): + test_timestamp = Timestamp(digest) + # Add a bitcoin attestation to the test timestamp + bitcoin = BitcoinBlockHeaderAttestation(783000) + test_timestamp.attestations.add(bitcoin) + return test_timestamp + + mock_calendar.side_effect = side_effect + + ots = setup_module("opentimestamps_enricher") # Create test metadata with sample file @@ -176,8 +199,6 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): assert timestamp_media.get("attestation_count") == 1 def test_full_enriching_no_calendars(setup_module, sample_file_path, sample_media, mocker): - """Test enrichment process with calendars disabled""" - # Setup enricher with calendars disabled ots = setup_module("opentimestamps_enricher", {"use_calendars": False}) # Create test metadata with sample file @@ -198,7 +219,8 @@ def test_full_enriching_no_calendars(setup_module, sample_file_path, 
sample_medi # Verify status should be false since we didn't use calendars assert timestamp_media.get("verified") == False - assert timestamp_media.get("attestation_count") == 0 + # We expect 3 pending attestations (one for each calendar URL) + assert timestamp_media.get("attestation_count") == 3 def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker): """Test enrichment when calendar servers return errors""" @@ -206,7 +228,7 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') mock_calendar.side_effect = Exception("Calendar server error") - # Setup enricher + ots = setup_module("opentimestamps_enricher") # Create test metadata with sample file @@ -224,11 +246,11 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me # Verify status should be false since calendar submissions failed timestamp_media = metadata.media[1] assert timestamp_media.get("verified") == False - assert timestamp_media.get("attestation_count") == 0 + # We expect 3 pending attestations (one for each calendar URL that's enabled by default in __manifest__) + assert timestamp_media.get("attestation_count") == 3 def test_no_files_to_stamp(setup_module): """Test enrichment with no files to timestamp""" - # Setup enricher ots = setup_module("opentimestamps_enricher") # Create empty metadata From 394b8b2dd18aea008e4e742bb2e987c7b519c646 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 12 Mar 2025 11:45:13 +0000 Subject: [PATCH 06/12] Improvements to opentimestamps enricher - make OTS file a sub-file of original media --- scripts/settings/src/schema.json | 596 ++++++++---------- .../modules/gdrive_storage/__manifest__.py | 2 +- .../modules/local_storage/__manifest__.py | 2 +- .../opentimestamps_enricher/__manifest__.py | 7 + .../opentimestamps_enricher.py | 20 +- .../modules/s3_storage/__manifest__.py | 2 +- 
.../enrichers/test_opentimestamps_enricher.py | 27 +- 7 files changed, 309 insertions(+), 347 deletions(-) diff --git a/scripts/settings/src/schema.json b/scripts/settings/src/schema.json index 64a903a..70eb71b 100644 --- a/scripts/settings/src/schema.json +++ b/scripts/settings/src/schema.json @@ -1,151 +1,25 @@ { "modules": { - "gsheet_feeder": { - "name": "gsheet_feeder", - "display_name": "Google Sheets Feeder", + "atlos_feeder_db_storage": { + "name": "atlos_feeder_db_storage", + "display_name": "Atlos Feeder Database Storage", "manifest": { - "name": "Google Sheets Feeder", + "name": "Atlos Feeder Database Storage", "author": "Bellingcat", "type": [ - "feeder" + "feeder", + "database", + "storage" ], "requires_setup": true, - "description": "\n GsheetsFeeder \n A Google Sheets-based feeder for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - Customize the column names in your Google sheet using the `columns` configuration.\n ", - "dependencies": { - "python": [ - "loguru", - "gspread", - "slugify" - ] - }, - "entry_point": "gsheet_feeder::GsheetsFeeder", - "version": "1.0", - "configs": { - "sheet": { - "default": null, - 
"help": "name of the sheet to archive" - }, - "sheet_id": { - "default": null, - "help": "the id of the sheet to archive (alternative to 'sheet' config)" - }, - "header": { - "default": 1, - "type": "int", - "help": "index of the header row (starts at 1)" - }, - "service_account": { - "default": "secrets/service_account.json", - "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", - "required": true - }, - "columns": { - "default": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage" - }, - "help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting", - "type": "json_loader" - }, - "allow_worksheets": { - "default": [], - "help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed" - }, - "block_worksheets": { - "default": [], - "help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed" - }, - "use_sheet_names_in_stored_paths": { - "default": true, - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - "type": "bool" - } - } - }, - "configs": { - "sheet": { - "default": null, - "help": "name of the sheet to archive" - }, - "sheet_id": { - "default": null, - "help": "the id of the sheet to archive (alternative to 'sheet' config)" - }, - "header": { - "default": 1, - "type": "int", - "help": "index of the header row (starts at 1)" - }, - "service_account": { - "default": "secrets/service_account.json", - "help": "service account JSON file path. 
Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", - "required": true - }, - "columns": { - "default": { - "url": "link", - "status": "archive status", - "folder": "destination folder", - "archive": "archive location", - "date": "archive date", - "thumbnail": "thumbnail", - "timestamp": "upload timestamp", - "title": "upload title", - "text": "text content", - "screenshot": "screenshot", - "hash": "hash", - "pdq_hash": "perceptual hashes", - "wacz": "wacz", - "replaywebpage": "replaywebpage" - }, - "help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting", - "type": "json_loader" - }, - "allow_worksheets": { - "default": [], - "help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed" - }, - "block_worksheets": { - "default": [], - "help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed" - }, - "use_sheet_names_in_stored_paths": { - "default": true, - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - "type": "bool" - } - } - }, - "atlos_feeder": { - "name": "atlos_feeder", - "display_name": "Atlos Feeder", - "manifest": { - "name": "Atlos Feeder", - "author": "Bellingcat", - "type": [ - "feeder" - ], - "requires_setup": true, - "description": "\n AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.\n\n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Filters source materials based on visibility, processing status, and metadata.\n - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.\n - Iterates through paginated results using a cursor for efficient API interaction.\n\n ### Notes\n - Requires an Atlos API endpoint and a valid API token for authentication.\n - 
Ensures only unprocessed, visible, and ready-to-archive URLs are returned.\n - Handles pagination transparently when retrieving data from the Atlos API.\n ", + "description": "\n A module that integrates with the Atlos API to fetch source material URLs for archival and upload extracted media.\n \n [Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT). \n It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence.\n \n To get started create a new project and obtain an API token from the settings page. You can group events into Atlos's 'incidents'.\n Here you can add 'source material' by URL and the Atlos feeder will fetch these URLs for archival.\n \n You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended.\n The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uploaded back into the relevant media item.\n \n \n ### Features\n - Connects to the Atlos API to retrieve a list of source material URLs.\n - Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive.\n - If the storage option is selected, it will store the media files alongside the original source material item in Atlos.\n - If the database option is selected, it will output the results to the media item, as well as updating failure status with error details when archiving fails.\n - Skips storage/database upload for items without an Atlos ID - restricting that you must use the Atlos feeder so that it has the Atlos ID to store the results with.\n\n ### Notes\n - Requires an Atlos account with a project and a valid API token for authentication.\n - Ensures only unprocessed, visible, and ready-to-archive URLs are 
returned.\n - Fetches any media items within an Atlos project, regardless of separation into incidents.\n ", "dependencies": { "python": [ "loguru", "requests" ] }, - "entry_point": "", + "entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage", "version": "1.0", "configs": { "api_token": { @@ -222,6 +96,135 @@ } } }, + "gsheet_feeder_db": { + "name": "gsheet_feeder_db", + "display_name": "Google Sheets Feeder Database", + "manifest": { + "name": "Google Sheets Feeder Database", + "author": "Bellingcat", + "type": [ + "feeder", + "database" + ], + "requires_setup": true, + "description": "\n GsheetsFeederDatabase\n A Google Sheets-based feeder and optional database for the Auto Archiver.\n\n This reads data from Google Sheets and filters rows based on user-defined rules.\n The filtered rows are processed into `Metadata` objects.\n\n ### Features\n - Validates the sheet structure and filters rows based on input configurations.\n - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.\n - Ensures only rows with valid URLs and unprocessed statuses are included for archival.\n - Supports organizing stored files into folder paths based on sheet and worksheet names.\n - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n - Skips redundant updates for empty or invalid data fields.\n\n ### Setup\n - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.\n To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).\n - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.\n - 
Customize the column names in your Google sheet using the `columns` configuration.\n - The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.\n ", + "dependencies": { + "python": [ + "loguru", + "gspread", + "slugify" + ] + }, + "entry_point": "gsheet_feeder_db::GsheetsFeederDB", + "version": "1.0", + "configs": { + "sheet": { + "default": null, + "help": "name of the sheet to archive" + }, + "sheet_id": { + "default": null, + "help": "the id of the sheet to archive (alternative to 'sheet' config)" + }, + "header": { + "default": 1, + "type": "int", + "help": "index of the header row (starts at 1)" + }, + "service_account": { + "default": "secrets/service_account.json", + "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", + "required": true + }, + "columns": { + "default": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage" + }, + "help": "Custom names for the columns in your Google sheet. 
If you don't want to use the default column names, change them with this setting", + "type": "json_loader" + }, + "allow_worksheets": { + "default": [], + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed" + }, + "block_worksheets": { + "default": [], + "help": "(CSV) explicitly block some worksheets from being processed" + }, + "use_sheet_names_in_stored_paths": { + "default": true, + "type": "bool", + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'" + } + } + }, + "configs": { + "sheet": { + "default": null, + "help": "name of the sheet to archive" + }, + "sheet_id": { + "default": null, + "help": "the id of the sheet to archive (alternative to 'sheet' config)" + }, + "header": { + "default": 1, + "type": "int", + "help": "index of the header row (starts at 1)" + }, + "service_account": { + "default": "secrets/service_account.json", + "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", + "required": true + }, + "columns": { + "default": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage" + }, + "help": "Custom names for the columns in your Google sheet. 
If you don't want to use the default column names, change them with this setting", + "type": "json_loader" + }, + "allow_worksheets": { + "default": [], + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed" + }, + "block_worksheets": { + "default": [], + "help": "(CSV) explicitly block some worksheets from being processed" + }, + "use_sheet_names_in_stored_paths": { + "default": true, + "type": "bool", + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'" + } + } + }, "cli_feeder": { "name": "cli_feeder", "display_name": "Command Line Feeder", @@ -470,7 +473,7 @@ "extractor" ], "requires_setup": true, - "description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts\n and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ", + "description": "\n Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. \n \n > \u26a0\ufe0f **Warning** \n > This module is not actively maintained due to known issues with blocking. \n > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)\n \n This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,\n highlights, and tagged posts. \n Authentication is required via username/password or a session file.\n \n ", "dependencies": { "python": [ "instaloader", @@ -482,38 +485,38 @@ "configs": { "username": { "required": true, - "help": "a valid Instagram username" + "help": "A valid Instagram username." 
}, "password": { "required": true, - "help": "the corresponding Instagram account password" + "help": "The corresponding Instagram account password." }, "download_folder": { "default": "instaloader", - "help": "name of a folder to temporarily download content to" + "help": "Name of a folder to temporarily download content to." }, "session_file": { "default": "secrets/instaloader.session", - "help": "path to the instagram session which saves session credentials" + "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one." } } }, "configs": { "username": { "required": true, - "help": "a valid Instagram username" + "help": "A valid Instagram username." }, "password": { "required": true, - "help": "the corresponding Instagram account password" + "help": "The corresponding Instagram account password." }, "download_folder": { "default": "instaloader", - "help": "name of a folder to temporarily download content to" + "help": "Name of a folder to temporarily download content to." }, "session_file": { "default": "secrets/instaloader.session", - "help": "path to the instagram session which saves session credentials" + "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one." } } }, @@ -661,7 +664,7 @@ "extractor" ], "requires_setup": false, - "description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. 
It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n", + "description": "\nThis is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.\n\nThis module is responsible for downloading and processing media content from platforms\nsupported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality\nfor retrieving videos, subtitles, comments, and other metadata, and it integrates with\nthe broader archiving framework.\n\n### Features\n- Supports downloading videos and playlists.\n- Retrieves metadata like titles, descriptions, upload dates, and durations.\n- Downloads subtitles and comments when enabled.\n- Configurable options for handling live streams, proxies, and more.\n- Supports authentication of websites using the 'authentication' settings from your orchestration.\n\n### Dropins\n- For websites supported by `yt-dlp` that also contain posts in addition to videos\n (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create \n metadata objects. 
Some dropins are included in this generic_archiver by default, but\ncustom dropins can be created to handle additional websites and passed to the archiver\nvia the command line using the `--dropins` option (TODO!).\n\n### Auto-Updates\n\nThe Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).\nThis can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).\nIf you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.\n\n", "dependencies": { "python": [ "yt_dlp", @@ -710,6 +713,11 @@ "max_downloads": { "default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit." + }, + "ytdlp_update_interval": { + "default": 5, + "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.", + "type": "int" } } }, @@ -751,9 +759,38 @@ "max_downloads": { "default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit." + }, + "ytdlp_update_interval": { + "default": 5, + "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. 
Set it to -1 to disable, or 0 to always update on every run.", + "type": "int" } } }, + "tiktok_tikwm_extractor": { + "name": "tiktok_tikwm_extractor", + "display_name": "Tiktok Tikwm Extractor", + "manifest": { + "name": "Tiktok Tikwm Extractor", + "author": "Bellingcat", + "type": [ + "extractor" + ], + "requires_setup": false, + "description": "\n Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/\n\t\n\tThis extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.\n\n ### Features\n - Downloads the video and, if possible, also the video cover.\n\t- Stores extra metadata about the post like author information, and more as returned by tikwm.com. \n\n ### Notes\n - If tikwm.com is down, this extractor will not work.\n\t- If tikwm.com changes their API, this extractor may break.\n\t- If no video is found, this extractor will consider the extraction failed.\n ", + "dependencies": { + "python": [ + "loguru", + "requests" + ], + "bin": [] + }, + "entry_point": "", + "version": "1.0", + "configs": {} + }, + "configs": null + }, "telegram_extractor": { "name": "telegram_extractor", "display_name": "Telegram Extractor", @@ -1054,7 +1091,7 @@ "help": "width of the screenshots" }, "height": { - "default": 720, + "default": 1024, "type": "int", "help": "height of the screenshots" }, @@ -1091,7 +1128,7 @@ "help": "width of the screenshots" }, "height": { - "default": 720, + "default": 1024, "type": "int", "help": "height of the screenshots" }, @@ -1201,6 +1238,79 @@ } } }, + "opentimestamps_enricher": { + "name": "opentimestamps_enricher", + "display_name": "OpenTimestamps Enricher", + "manifest": { + "name": "OpenTimestamps Enricher", + "author": "Bellingcat", + "type": [ + "enricher" + ], + "requires_setup": true, + "description": "\n Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file 
existence at a specific time.\n\n Uses OpenTimestamps \u2013 a service that timestamps data using the Bitcoin blockchain, providing a decentralized \n and secure way to prove that data existed at a certain point in time.\n\n ### Features\n - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain\n - Verifies existing timestamp proofs to confirm the time a file existed\n - Uses multiple calendar servers to ensure reliability and redundancy\n - Stores timestamp proofs alongside original files for future verification\n\n ### Notes\n - Can work offline to create timestamp proofs that can be upgraded later\n - Verification checks if timestamps have been confirmed in the Bitcoin blockchain\n - Should run after files have been archived and hashed\n\n ### Verifying Timestamps Later\n If you wish to verify a timestamp (ots) file later, you can install the opentimestamps-client command line tool and use the `ots verify` command.\n Example: `ots verify my_file.ots`\n\n Note: if you're using local storage with a filename_generator set to 'static' (a hash) or random, the files will be renamed when they are saved to the\n final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`.\n ", + "dependencies": { + "python": [ + "loguru", + "opentimestamps" + ] + }, + "entry_point": "", + "version": "1.0", + "configs": { + "use_calendars": { + "default": true, + "help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.", + "type": "bool" + }, + "calendar_urls": { + "default": [ + "https://alice.btc.calendar.opentimestamps.org", + "https://bob.btc.calendar.opentimestamps.org", + "https://finney.calendar.eternitywall.com" + ], + "help": "List of OpenTimestamps calendar servers to use for timestamping. 
See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars", + "type": "list" + }, + "calendar_whitelist": { + "default": [], + "help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']", + "type": "list" + }, + "verify_timestamps": { + "default": true, + "help": "Whether to verify timestamps after creating them.", + "type": "bool" + } + } + }, + "configs": { + "use_calendars": { + "default": true, + "help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.", + "type": "bool" + }, + "calendar_urls": { + "default": [ + "https://alice.btc.calendar.opentimestamps.org", + "https://bob.btc.calendar.opentimestamps.org", + "https://finney.calendar.eternitywall.com" + ], + "help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:https://opentimestamps.org/#calendars", + "type": "list" + }, + "calendar_whitelist": { + "default": [], + "help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. 
['https://mycalendar.com']", + "type": "list" + }, + "verify_timestamps": { + "default": true, + "help": "Whether to verify timestamps after creating them.", + "type": "bool" + } + } + }, "thumbnail_enricher": { "name": "thumbnail_enricher", "display_name": "Thumbnail Enricher", @@ -1381,56 +1491,6 @@ } } }, - "atlos_db": { - "name": "atlos_db", - "display_name": "Atlos Database", - "manifest": { - "name": "Atlos Database", - "author": "Bellingcat", - "type": [ - "database" - ], - "requires_setup": true, - "description": "\nHandles integration with the Atlos platform for managing archival results.\n\n### Features\n- Outputs archival results to the Atlos API for storage and tracking.\n- Updates failure status with error details when archiving fails.\n- Processes and formats metadata, including ISO formatting for datetime fields.\n- Skips processing for items without an Atlos ID.\n\n### Setup\nRequired configs:\n- atlos_url: Base URL for the Atlos API.\n- api_token: Authentication token for API access.\n", - "dependencies": { - "python": [ - "loguru", - "" - ], - "bin": [ - "" - ] - }, - "entry_point": "atlos_db::AtlosDb", - "version": "1.0", - "configs": { - "api_token": { - "default": null, - "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "required": true, - "type": "str" - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - } - } - }, - "configs": { - "api_token": { - "default": null, - "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "required": true, - "type": "str" - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - } - } - }, "api_db": { "name": "api_db", "display_name": "Auto Archiver API Database", @@ -1473,9 +1533,9 @@ "help": "which group of users have access to the archive in case public=false as author" }, "use_api_cache": { - "default": true, + "default": false, "type": "bool", - "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived" + "help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived" }, "store_results": { "default": true, @@ -1511,9 +1571,9 @@ "help": "which group of users have access to the archive in case public=false as author" }, "use_api_cache": { - "default": true, + "default": false, "type": "bool", - "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived" + "help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived" }, "store_results": { "default": true, @@ -1526,58 +1586,6 @@ } } }, - "gsheet_db": { - "name": "gsheet_db", - "display_name": "Google Sheets Database", - "manifest": { - "name": "Google Sheets Database", - "author": "Bellingcat", - "type": [ - "database" - ], - "requires_setup": true, - "description": "\n GsheetsDatabase:\n Handles integration with Google Sheets for tracking archival tasks.\n\n### Features\n- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.\n- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.\n- 
Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.\n- Skips redundant updates for empty or invalid data fields.\n\n### Notes\n- Currently works only with metadata provided by GsheetFeeder. \n- Requires configuration of a linked Google Sheet and appropriate API credentials.\n ", - "dependencies": { - "python": [ - "loguru", - "gspread", - "slugify" - ] - }, - "entry_point": "gsheet_db::GsheetsDb", - "version": "1.0", - "configs": { - "allow_worksheets": { - "default": [], - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed" - }, - "block_worksheets": { - "default": [], - "help": "(CSV) explicitly block some worksheets from being processed" - }, - "use_sheet_names_in_stored_paths": { - "default": true, - "type": "bool", - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'" - } - } - }, - "configs": { - "allow_worksheets": { - "default": [], - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed" - }, - "block_worksheets": { - "default": [], - "help": "(CSV) explicitly block some worksheets from being processed" - }, - "use_sheet_names_in_stored_paths": { - "default": true, - "type": "bool", - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'" - } - } - }, "console_db": { "name": "console_db", "display_name": "Console Database", @@ -1664,7 +1672,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).", "choices": [ "random", "static" @@ -1696,7 +1704,7 @@ }, "filename_generator": { "default": "static", - 
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).", "choices": [ "random", "static" @@ -1716,54 +1724,6 @@ } } }, - "atlos_storage": { - "name": "atlos_storage", - "display_name": "Atlos Storage", - "manifest": { - "name": "Atlos Storage", - "author": "Bellingcat", - "type": [ - "storage" - ], - "requires_setup": true, - "description": "\n Stores media files in a [Atlos](https://www.atlos.org/).\n\n ### Features\n - Saves media files to Atlos, organizing them into folders based on the provided path structure.\n\n ### Notes\n - Requires setup with Atlos credentials.\n - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.\n ", - "dependencies": { - "python": [ - "loguru", - "boto3" - ], - "bin": [] - }, - "entry_point": "", - "version": "1.0", - "configs": { - "api_token": { - "default": null, - "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "required": true, - "type": "str" - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - } - } - }, - "configs": { - "api_token": { - "default": null, - "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "required": true, - "type": "str" - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - } - } - }, "s3_storage": { "name": "s3_storage", "display_name": "S3 Storage", @@ -1796,7 +1756,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).", "choices": [ "random", "static" @@ -1850,7 +1810,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).", "choices": [ "random", "static" @@ -1922,7 +1882,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)", "choices": [ "random", "static" @@ -1951,7 +1911,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)", "choices": [ "random", "static" @@ 
-2029,9 +1989,9 @@ "steps": { "feeders": [ "cli_feeder", - "gsheet_feeder", - "atlos_feeder", - "csv_feeder" + "atlos_feeder_db_storage", + "csv_feeder", + "gsheet_feeder_db" ], "extractors": [ "wayback_extractor_enricher", @@ -2039,6 +1999,7 @@ "instagram_api_extractor", "instagram_tbot_extractor", "generic_extractor", + "tiktok_tikwm_extractor", "twitter_api_extractor", "instagram_extractor", "telethon_extractor", @@ -2055,20 +2016,21 @@ "meta_enricher", "pdq_hash_enricher", "whisper_enricher", + "opentimestamps_enricher", "ssl_enricher", "hash_enricher" ], "databases": [ "console_db", - "atlos_db", "api_db", "csv_db", - "gsheet_db" + "atlos_feeder_db_storage", + "gsheet_feeder_db" ], "storages": [ "local_storage", "gdrive_storage", - "atlos_storage", + "atlos_feeder_db_storage", "s3_storage" ], "formatters": [ @@ -2077,9 +2039,9 @@ ] }, "configs": [ - "gsheet_feeder", - "atlos_feeder", + "atlos_feeder_db_storage", "csv_feeder", + "gsheet_feeder_db", "cli_feeder", "instagram_api_extractor", "instagram_tbot_extractor", @@ -2093,15 +2055,13 @@ "timestamping_enricher", "screenshot_enricher", "whisper_enricher", + "opentimestamps_enricher", "thumbnail_enricher", "ssl_enricher", "hash_enricher", - "atlos_db", "api_db", - "gsheet_db", "csv_db", "gdrive_storage", - "atlos_storage", "s3_storage", "local_storage", "html_formatter" diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index 73784b8..46e4fa1 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -19,7 +19,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not 
enabled).", "choices": ["random", "static"], }, "root_folder_id": {"required": True, diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index 8ad6381..72f59d1 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -13,7 +13,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled)", "choices": ["random", "static"], }, "save_to": {"default": "./local_archive", "help": "folder where to save archived content"}, diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index 136c0c2..ff038e1 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -52,5 +52,12 @@ https://opentimestamps.org/#calendars", - Can work offline to create timestamp proofs that can be upgraded later - Verification checks if timestamps have been confirmed in the Bitcoin blockchain - Should run after files have been archived and hashed + + ### Verifying Timestamps Later + If you wish to verify a timestamp (ots) file later, you can install the opentimestamps-client command line tool and use the `ots verify` command. + Example: `ots verify my_file.ots` + + Note: if you're using local storage with a filename_generator set to 'static' (a hash) or random, the files will be renamed when they are saved to the + final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`. 
""" } \ No newline at end of file diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index 2b74ee6..cdeb78d 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -20,8 +20,7 @@ class OpentimestampsEnricher(Enricher): logger.debug(f"OpenTimestamps timestamping files for {url=}") # Get the media files to timestamp - media_files = [m for m in to_enrich.media if m.get("filename") and not m.get("opentimestamps")] - + media_files = [m for m in to_enrich.media if m.filename and not m.get("opentimestamps")] if not media_files: logger.warning(f"No files found to timestamp in {url=}") return @@ -30,7 +29,7 @@ class OpentimestampsEnricher(Enricher): for media in media_files: try: # Get the file path from the media - file_path = media.get("filename") + file_path = media.filename if not os.path.exists(file_path): logger.warning(f"File not found: {file_path}") continue @@ -108,7 +107,8 @@ class OpentimestampsEnricher(Enricher): # Create media for the timestamp file timestamp_media = Media(filename=timestamp_path) - timestamp_media.set("source_file", os.path.basename(file_path)) + # explicitly set the mimetype, normally .ots files are 'application/vnd.oasis.opendocument.spreadsheet-template' + media.mimetype = "application/vnd.opentimestamps" timestamp_media.set("opentimestamps_version", opentimestamps.__version__) # Verify the timestamp if needed @@ -119,20 +119,16 @@ class OpentimestampsEnricher(Enricher): else: logger.warning(f"Not verifying the timestamp for media file {file_path}") - timestamp_files.append(timestamp_media) - + media.set("opentimestamp_files", [timestamp_media]) + timestamp_files.append(timestamp_media.filename) # Update the original media to indicate it's been timestamped media.set("opentimestamps", True) - 
media.set("opentimestamp_file", timestamp_path) except Exception as e: - logger.warning(f"Error while timestamping {media.get('filename')}: {e}") + logger.warning(f"Error while timestamping {media.filename}: {e}") # Add timestamp files to the metadata if timestamp_files: - for ts_media in timestamp_files: - to_enrich.add_media(ts_media) - to_enrich.set("opentimestamped", True) to_enrich.set("opentimestamps_count", len(timestamp_files)) logger.success(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}") @@ -162,7 +158,7 @@ class OpentimestampsEnricher(Enricher): # Process different types of attestations if isinstance(attestation, PendingAttestation): - info["type"] = "pending" + info["type"] = f"pending (as of {attestation.date})" info["uri"] = attestation.uri elif isinstance(attestation, BitcoinBlockHeaderAttestation): diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index bf032e7..156f562 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -13,7 +13,7 @@ }, "filename_generator": { "default": "static", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).", "choices": ["random", "static"], }, "bucket": {"default": None, "help": "S3 bucket name"}, diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index db171e5..391fb06 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -172,7 +172,7 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): # Create test metadata with sample file metadata = 
Metadata().set_url("https://example.com") - sample_media.set("filename", sample_file_path) + sample_media.filename = sample_file_path metadata.add_media(sample_media) # Run enrichment @@ -182,16 +182,17 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): assert metadata.get("opentimestamped") == True assert metadata.get("opentimestamps_count") == 1 - # Check that we have two media items: the original and the timestamp - assert len(metadata.media) == 2 + # Check that we have one parent media item: the original + assert len(metadata.media) == 1 # Check that the original media was updated assert metadata.media[0].get("opentimestamps") == True - assert metadata.media[0].get("opentimestamp_file") is not None - # Check the timestamp file media - timestamp_media = metadata.media[1] - assert timestamp_media.get("source_file") == os.path.basename(sample_file_path) + # Check the timestamp file media is a child of the original + assert len(metadata.media[0].get("opentimestamp_files")) == 1 + + timestamp_media = metadata.media[0].get("opentimestamp_files")[0] + assert timestamp_media.get("opentimestamps_version") is not None # Check verification results on the timestamp media @@ -203,7 +204,7 @@ def test_full_enriching_no_calendars(setup_module, sample_file_path, sample_medi # Create test metadata with sample file metadata = Metadata().set_url("https://example.com") - sample_media.set("filename", sample_file_path) + sample_media.filename = sample_file_path metadata.add_media(sample_media) # Run enrichment @@ -212,10 +213,8 @@ def test_full_enriching_no_calendars(setup_module, sample_file_path, sample_medi # Verify results assert metadata.get("opentimestamped") == True assert metadata.get("opentimestamps_count") == 1 - - # Check the timestamp file media - timestamp_media = metadata.media[1] - assert timestamp_media.get("source_file") == os.path.basename(sample_file_path) + + timestamp_media = metadata.media[0].get("opentimestamp_files")[0] # Verify 
status should be false since we didn't use calendars assert timestamp_media.get("verified") == False @@ -233,7 +232,7 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me # Create test metadata with sample file metadata = Metadata().set_url("https://example.com") - sample_media.set("filename", sample_file_path) + sample_media.filename = sample_file_path metadata.add_media(sample_media) # Run enrichment (should complete despite calendar errors) @@ -244,7 +243,7 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me assert metadata.get("opentimestamps_count") == 1 # Verify status should be false since calendar submissions failed - timestamp_media = metadata.media[1] + timestamp_media = metadata.media[0].get("opentimestamp_files")[0] assert timestamp_media.get("verified") == False # We expect 3 pending attestations (one for each calendar URL that's enabled by default in __manifest__) assert timestamp_media.get("attestation_count") == 3 From 1d664524eb61557311a26246354ed0ca16a9191b Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 12 Mar 2025 11:54:25 +0000 Subject: [PATCH 07/12] Add info on last check/last updated to the metadata --- .../opentimestamps_enricher/opentimestamps_enricher.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index cdeb78d..cf110a2 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,6 +1,5 @@ import os -import hashlib -from typing import TYPE_CHECKING +import datetime from loguru import logger import opentimestamps @@ -108,7 +107,7 @@ class OpentimestampsEnricher(Enricher): # Create media for the timestamp file timestamp_media = Media(filename=timestamp_path) # 
explicitly set the mimetype, normally .ots files are 'application/vnd.oasis.opendocument.spreadsheet-template' - media.mimetype = "application/vnd.opentimestamps" + timestamp_media.mimetype = "application/vnd.opentimestamps" timestamp_media.set("opentimestamps_version", opentimestamps.__version__) # Verify the timestamp if needed @@ -158,12 +157,14 @@ class OpentimestampsEnricher(Enricher): # Process different types of attestations if isinstance(attestation, PendingAttestation): - info["type"] = f"pending (as of {attestation.date})" + info["type"] = f"pending" info["uri"] = attestation.uri elif isinstance(attestation, BitcoinBlockHeaderAttestation): info["type"] = "bitcoin" info["block_height"] = attestation.height + + info["last_check"] = datetime.datetime.now().isoformat()[:-7] attestation_info.append(info) @@ -178,5 +179,6 @@ class OpentimestampsEnricher(Enricher): else: result["verified"] = False result["pending"] = False + result["last_updated"] = datetime.datetime.now().isoformat()[:-7] return result \ No newline at end of file From e7489ac4c41fb28270e8738cc13bcc129234a5b7 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 13 Mar 2025 14:30:33 +0000 Subject: [PATCH 08/12] Tidy up opentimestamps * Simplify * Don't add fake (pending) attestations if the calendar urls all have issues * Remove unnecessary configs * Improve docs on upgrading + verifying --- .../opentimestamps_enricher/__manifest__.py | 79 +++++++++++----- .../opentimestamps_enricher.py | 92 +++++++++---------- .../enrichers/test_opentimestamps_enricher.py | 43 ++------- 3 files changed, 106 insertions(+), 108 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index ff038e1..733ff1a 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -9,11 +9,6 @@ ], }, "configs": { - 
"use_calendars": { - "default": True, - "help": "Whether to connect to OpenTimestamps calendar servers to create timestamps. If false, creates local timestamp proofs only.", - "type": "bool" - }, "calendar_urls": { "default": [ "https://alice.btc.calendar.opentimestamps.org", @@ -30,34 +25,76 @@ https://opentimestamps.org/#calendars", "help": "Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']", "type": "list" }, - "verify_timestamps": { - "default": True, - "help": "Whether to verify timestamps after creating them.", - "type": "bool" - } }, "description": """ Creates OpenTimestamps proofs for archived files, providing blockchain-backed evidence of file existence at a specific time. Uses OpenTimestamps – a service that timestamps data using the Bitcoin blockchain, providing a decentralized - and secure way to prove that data existed at a certain point in time. + and secure way to prove that data existed at a certain point in time. A SHA256 hash of the file to be timestamped is used as the token + and sent to each of the 'timestamp calendars' for inclusion in the blockchain. The proof is then saved alongside the original file in a file with + the '.ots' extension. 
### Features - - Creates cryptographic timestamp proofs that link files to the Bitcoin blockchain - - Verifies existing timestamp proofs to confirm the time a file existed - - Uses multiple calendar servers to ensure reliability and redundancy + - Creates cryptographic timestamp proofs that link files to the Bitcoin or Litecoin blockchain + - Verifies timestamp proofs have been submitted to the blockchain (note: does not confirm they have been *added*) + - Can use multiple calendar servers to ensure reliability and redundancy - Stores timestamp proofs alongside original files for future verification - ### Notes - - Can work offline to create timestamp proofs that can be upgraded later - - Verification checks if timestamps have been confirmed in the Bitcoin blockchain - - Should run after files have been archived and hashed + ### Timestamp status + An opentimestamp, when submitted to a timestamp server will have a 'pending' status (Pending Attestation) as it waits to be added + to the blockchain. Once it has been added to the blockchain, it will have a 'confirmed' status (Bitcoin Block Timestamp). + This process typically takes several hours, depending on the calendar server and the current state of the Bitcoin network. As such, + the status of all timestamps added will be 'pending' until they are subsequently confirmed (see 'Upgrading Timestamps' below). - ### Verifying Timestamps Later - If you wish to verify a timestamp (ots) file later, you can install the opentimestamps-client command line tool and use the `ots verify` command. + There are two possible statuses for a timestamp: + - `Pending`: The timestamp has been submitted to the calendar server but has not yet been confirmed in the Bitcoin blockchain. + - `Confirmed`: The timestamp has been confirmed in the Bitcoin or Litecoin blockchain. 
+ + ### Upgrading Timestamps + To upgrade a timestamp from 'pending' to 'confirmed', you can use the `ots upgrade` command from the opentimestamps-client package + (install it with `pip install opentimestamps-client`). + Example: `ots upgrade my_file.ots` + + Here is a useful script that could be used to upgrade all timestamps in a directory, which could be run on a cron job: +```{code} bash +find . -name "*.ots" -type f | while read file; do + echo "Upgrading OTS $file" + ots upgrade $file +done +# The result might look like: +# Upgrading OTS ./my_file.ots +# Got 1 attestation(s) from https://alice.btc.calendar.opentimestamps.org +# Success! Timestamp complete +``` + +```{note} Note: this will only upgrade the .ots files, and will not change the status text in any output .html files or any databases where the +metadata is stored (e.g. Google Sheets, CSV database, API database etc.). +``` + + ### Verifying Timestamps + The easiest way to verify a timestamp (ots) file is to install the opentimestamps-client command line tool and use the `ots verify` command. Example: `ots verify my_file.ots` - Note: if you're using local storage with a filename_generator set to 'static' (a hash) or random, the files will be renamed when they are saved to the + ```{code} bash +$ ots verify my_file.ots +Calendar https://bob.btc.calendar.opentimestamps.org: Pending confirmation in Bitcoin blockchain +Calendar https://finney.calendar.eternitywall.com: Pending confirmation in Bitcoin blockchain +Calendar https://alice.btc.calendar.opentimestamps.org: Timestamped by transaction 12345; waiting for 6 confirmations +``` + + Note: if you're using a storage with `filename_generator` set to `static` or `random`, the files will be renamed when they are saved to the final location meaning you will need to specify the original filename when verifying the timestamp with `ots verify -f original_filename my_file.ots`. 
+ + ### Choosing Calendar Servers + + By default, the OpenTimestamps enricher uses a set of public calendar servers provided by the 'opentimestamps' project. + You can customize the list of calendar servers by providing URLs in the `calendar_urls` configuration option. + + ### Calendar WhiteList + + By default, the opentimestamps package only allows their own calendars to be used (see `DEFAULT_CALENDAR_WHITELIST` in `opentimestamps.calendar`), + if you want to use your own calendars, then you can override this setting in the `calendar_whitelist` configuration option. + + """ } \ No newline at end of file diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index cf110a2..d6e8add 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -5,7 +5,7 @@ from loguru import logger import opentimestamps from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile -from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation +from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation, LitecoinBlockHeaderAttestation from opentimestamps.core.op import OpSHA256 from opentimestamps.core import serialize from auto_archiver.core import Enricher @@ -53,44 +53,36 @@ class OpentimestampsEnricher(Enricher): # Submit to calendar servers submitted_to_calendar = False - if self.use_calendars: - logger.debug(f"Submitting timestamp to calendar servers for {file_path}") - calendars = [] - whitelist = DEFAULT_CALENDAR_WHITELIST - - if self.calendar_whitelist: - whitelist = set(self.calendar_whitelist) - - # Create calendar instances - calendar_urls = [] - for url in self.calendar_urls: - if url in whitelist: - 
calendars.append(RemoteCalendar(url)) - calendar_urls.append(url) - - # Submit the hash to each calendar - for calendar in calendars: - try: - calendar_timestamp = calendar.submit(file_hash) - timestamp.merge(calendar_timestamp) - logger.debug(f"Successfully submitted to calendar: {calendar.url}") - submitted_to_calendar = True - except Exception as e: - logger.warning(f"Failed to submit to calendar {calendar.url}: {e}") - - # If all calendar submissions failed, add pending attestations - if not submitted_to_calendar and not timestamp.attestations: - logger.info("All calendar submissions failed, creating pending attestations") - for url in calendar_urls: - pending = PendingAttestation(url) - timestamp.attestations.add(pending) - else: - logger.info("Skipping calendar submission as per configuration") - - # Add dummy pending attestation for testing when calendars are disabled - for url in self.calendar_urls: - pending = PendingAttestation(url) - timestamp.attestations.add(pending) + + logger.debug(f"Submitting timestamp to calendar servers for {file_path}") + calendars = [] + whitelist = DEFAULT_CALENDAR_WHITELIST + + if self.calendar_whitelist: + whitelist = set(self.calendar_whitelist) + + # Create calendar instances + calendar_urls = [] + for url in self.calendar_urls: + if url in whitelist: + calendars.append(RemoteCalendar(url)) + calendar_urls.append(url) + + # Submit the hash to each calendar + for calendar in calendars: + try: + calendar_timestamp = calendar.submit(file_hash) + timestamp.merge(calendar_timestamp) + logger.debug(f"Successfully submitted to calendar: {calendar.url}") + submitted_to_calendar = True + except Exception as e: + logger.warning(f"Failed to submit to calendar {calendar.url}: {e}") + + # If all calendar submissions failed, add pending attestations + if not submitted_to_calendar and not timestamp.attestations: + logger.error(f"Failed to submit to any calendar for {file_path}. 
**This file will not be timestamped.**") + media.set("opentimestamps", False) + continue # Save the timestamp proof to a file timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots") @@ -110,13 +102,9 @@ class OpentimestampsEnricher(Enricher): timestamp_media.mimetype = "application/vnd.opentimestamps" timestamp_media.set("opentimestamps_version", opentimestamps.__version__) - # Verify the timestamp if needed - if self.verify_timestamps: - verification_info = self.verify_timestamp(detached_timestamp) - for key, value in verification_info.items(): - timestamp_media.set(key, value) - else: - logger.warning(f"Not verifying the timestamp for media file {file_path}") + verification_info = self.verify_timestamp(detached_timestamp) + for key, value in verification_info.items(): + timestamp_media.set(key, value) media.set("opentimestamp_files", [timestamp_media]) timestamp_files.append(timestamp_media.filename) @@ -132,6 +120,7 @@ class OpentimestampsEnricher(Enricher): to_enrich.set("opentimestamps_count", len(timestamp_files)) logger.success(f"{len(timestamp_files)} OpenTimestamps proofs created for {url=}") else: + to_enrich.set("opentimestamped", False) logger.warning(f"No successful timestamps created for {url=}") def verify_timestamp(self, detached_timestamp): @@ -157,11 +146,14 @@ class OpentimestampsEnricher(Enricher): # Process different types of attestations if isinstance(attestation, PendingAttestation): - info["type"] = f"pending" + info["status"] = "pending" info["uri"] = attestation.uri elif isinstance(attestation, BitcoinBlockHeaderAttestation): - info["type"] = "bitcoin" + info["status"] = "confirmed - bitcoin" + info["block_height"] = attestation.height + elif isinstance(attestation, LitecoinBlockHeaderAttestation): + info["status"] = "confirmed - litecoin" info["block_height"] = attestation.height info["last_check"] = datetime.datetime.now().isoformat()[:-7] @@ -171,14 +163,12 @@ class OpentimestampsEnricher(Enricher): 
result["attestations"] = attestation_info # For at least one confirmed attestation - if any(a.get("type") == "bitcoin" for a in attestation_info): + if any("confirmed" in a.get("status") for a in attestation_info): result["verified"] = True else: result["verified"] = False - result["pending"] = True else: result["verified"] = False - result["pending"] = False result["last_updated"] = datetime.datetime.now().isoformat()[:-7] return result \ No newline at end of file diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index 391fb06..2cdefdf 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -109,12 +109,12 @@ def test_verify_timestamp(setup_module, detached_timestamp_file): assert len(verification_info["attestations"]) == 2 # Check attestation types - assertion_types = [a["type"] for a in verification_info["attestations"]] + assertion_types = [a["status"] for a in verification_info["attestations"]] assert "pending" in assertion_types - assert "bitcoin" in assertion_types + assert "confirmed - bitcoin" in assertion_types # Check Bitcoin attestation details - bitcoin_attestation = next(a for a in verification_info["attestations"] if a["type"] == "bitcoin") + bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed - bitcoin") assert bitcoin_attestation["block_height"] == 783000 def test_verify_pending_only(setup_module, pending_timestamp_file): @@ -125,10 +125,9 @@ def test_verify_pending_only(setup_module, pending_timestamp_file): assert verification_info["attestation_count"] == 2 assert verification_info["verified"] == False - assert verification_info["pending"] == True # All attestations should be of type "pending" - assert all(a["type"] == "pending" for a in verification_info["attestations"]) + assert all(a["status"] == "pending" for a in verification_info["attestations"]) # Check URIs of pending 
attestations uris = [a["uri"] for a in verification_info["attestations"]] @@ -148,7 +147,7 @@ def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): # Check that the attestation is a Bitcoin attestation attestation = verification_info["attestations"][0] - assert attestation["type"] == "bitcoin" + assert attestation["status"] == "confirmed - bitcoin" assert attestation["block_height"] == 783000 def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): @@ -199,28 +198,6 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): assert timestamp_media.get("verified") == True assert timestamp_media.get("attestation_count") == 1 -def test_full_enriching_no_calendars(setup_module, sample_file_path, sample_media, mocker): - ots = setup_module("opentimestamps_enricher", {"use_calendars": False}) - - # Create test metadata with sample file - metadata = Metadata().set_url("https://example.com") - sample_media.filename = sample_file_path - metadata.add_media(sample_media) - - # Run enrichment - ots.enrich(metadata) - - # Verify results - assert metadata.get("opentimestamped") == True - assert metadata.get("opentimestamps_count") == 1 - - timestamp_media = metadata.media[0].get("opentimestamp_files")[0] - - # Verify status should be false since we didn't use calendars - assert timestamp_media.get("verified") == False - # We expect 3 pending attestations (one for each calendar URL) - assert timestamp_media.get("attestation_count") == 3 - def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker): """Test enrichment when calendar servers return errors""" # Mock the calendar submission to raise an exception @@ -239,14 +216,8 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me ots.enrich(metadata) # Verify results - assert metadata.get("opentimestamped") == True - assert metadata.get("opentimestamps_count") == 1 - - # Verify status should be false since 
calendar submissions failed - timestamp_media = metadata.media[0].get("opentimestamp_files")[0] - assert timestamp_media.get("verified") == False - # We expect 3 pending attestations (one for each calendar URL that's enabled by default in __manifest__) - assert timestamp_media.get("attestation_count") == 3 + assert metadata.get("opentimestamped") == False + assert metadata.get("opentimestamps_count") is None def test_no_files_to_stamp(setup_module): """Test enrichment with no files to timestamp""" From 15222199d92c99d80a227318634e3346ef4f1552 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 13 Mar 2025 14:45:38 +0000 Subject: [PATCH 09/12] Add unit test for if one calendar fails --- .../enrichers/test_opentimestamps_enricher.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index 2cdefdf..e91f97f 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -165,7 +165,6 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): return test_timestamp mock_calendar.side_effect = side_effect - ots = setup_module("opentimestamps_enricher") @@ -198,6 +197,32 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): assert timestamp_media.get("verified") == True assert timestamp_media.get("attestation_count") == 1 +def test_full_enriching_one_calendar_error(setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file): + """Test enrichment when one calendar server returns an error""" + # Mock the calendar submission to raise an exception + mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') + + test_timestamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935")) + # Add a bitcoin attestation to the test timestamp + bitcoin = 
BitcoinBlockHeaderAttestation(783000) + test_timestamp.attestations.add(bitcoin) + + mock_calendar.side_effect = [test_timestamp, Exception("Calendar server error")] + + ots = setup_module("opentimestamps_enricher", {"calendar_urls": ["https://alice.btc.calendar.opentimestamps.org", "https://bob.btc.calendar.opentimestamps.org"]}) + + # Create test metadata with sample file + metadata = Metadata().set_url("https://example.com") + sample_media.filename = sample_file_path + metadata.add_media(sample_media) + + # Run enrichment (should complete despite calendar errors) + ots.enrich(metadata) + + # Verify results + assert metadata.get("opentimestamped") == True + assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob + def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker): """Test enrichment when calendar servers return errors""" # Mock the calendar submission to raise an exception From b908655cc8d91129940fd85bdb596633edb8c8dd Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 13 Mar 2025 17:40:00 +0000 Subject: [PATCH 10/12] Remove references to litecoin + several tidy-ups --- .../opentimestamps_enricher/__manifest__.py | 4 ++-- .../opentimestamps_enricher.py | 18 +++++++----------- .../enrichers/test_opentimestamps_enricher.py | 6 +++--- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index 733ff1a..b489d66 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -35,7 +35,7 @@ https://opentimestamps.org/#calendars", the '.ots' extension. 
### Features - - Creates cryptographic timestamp proofs that link files to the Bitcoin or Litecoin blockchain + - Creates cryptographic timestamp proofs that link files to the Bitcoin - Verifies timestamp proofs have been submitted to the blockchain (note: does not confirm they have been *added*) - Can use multiple calendar servers to ensure reliability and redundancy - Stores timestamp proofs alongside original files for future verification @@ -48,7 +48,7 @@ https://opentimestamps.org/#calendars", There are two possible statuses for a timestamp: - `Pending`: The timestamp has been submitted to the calendar server but has not yet been confirmed in the Bitcoin blockchain. - - `Confirmed`: The timestamp has been confirmed in the Bitcoin or Litecoin blockchain. + - `Confirmed`: The timestamp has been confirmed in the Bitcoin blockchain. ### Upgrading Timestamps To upgrade a timestamp from 'pending' to 'confirmed', you can use the `ots upgrade` command from the opentimestamps-client package diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index d6e8add..4785dd2 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -1,16 +1,15 @@ import os -import datetime from loguru import logger import opentimestamps from opentimestamps.calendar import RemoteCalendar, DEFAULT_CALENDAR_WHITELIST from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile -from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation, LitecoinBlockHeaderAttestation +from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation from opentimestamps.core.op import OpSHA256 from opentimestamps.core import serialize from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media 
-from auto_archiver.utils.misc import calculate_file_hash +from auto_archiver.utils.misc import get_current_timestamp class OpentimestampsEnricher(Enricher): @@ -34,8 +33,8 @@ class OpentimestampsEnricher(Enricher): continue # Create timestamp for the file - hash is SHA256 - # Note: ONLY SHA256 is used/supported here. Opentimestamps supports other hashes, but not SHA3-512 - # see opentimestamps.core.op + # Note: hash is hard-coded to SHA256 and does not use hash_enricher to set it. + # SHA256 is the recommended hash, ref: https://github.com/bellingcat/auto-archiver/pull/247#discussion_r1992433181 logger.debug(f"Creating timestamp for {file_path}") file_hash = None with open(file_path, 'rb') as f: @@ -150,13 +149,10 @@ class OpentimestampsEnricher(Enricher): info["uri"] = attestation.uri elif isinstance(attestation, BitcoinBlockHeaderAttestation): - info["status"] = "confirmed - bitcoin" - info["block_height"] = attestation.height - elif isinstance(attestation, LitecoinBlockHeaderAttestation): - info["status"] = "confirmed - litecoin" + info["status"] = "confirmed" info["block_height"] = attestation.height - info["last_check"] = datetime.datetime.now().isoformat()[:-7] + info["last_check"] = get_current_timestamp() attestation_info.append(info) @@ -169,6 +165,6 @@ class OpentimestampsEnricher(Enricher): result["verified"] = False else: result["verified"] = False - result["last_updated"] = datetime.datetime.now().isoformat()[:-7] + result["last_updated"] = get_current_timestamp() return result \ No newline at end of file diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index e91f97f..5b6a079 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -111,10 +111,10 @@ def test_verify_timestamp(setup_module, detached_timestamp_file): # Check attestation types assertion_types = [a["status"] for a in verification_info["attestations"]] assert "pending" 
in assertion_types - assert "confirmed - bitcoin" in assertion_types + assert "confirmed" in assertion_types # Check Bitcoin attestation details - bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed - bitcoin") + bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed") assert bitcoin_attestation["block_height"] == 783000 def test_verify_pending_only(setup_module, pending_timestamp_file): @@ -147,7 +147,7 @@ def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): # Check that the attestation is a Bitcoin attestation attestation = verification_info["attestations"][0] - assert attestation["status"] == "confirmed - bitcoin" + assert attestation["status"] == "confirmed" assert attestation["block_height"] == 783000 def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): From 17ae75fb95a0b6f753197e615cee07e8d6a5df0a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 14 Mar 2025 12:38:12 +0000 Subject: [PATCH 11/12] Ruff fixes --- .../opentimestamps_enricher/__manifest__.py | 8 +- .../opentimestamps_enricher.py | 62 ++++----- .../enrichers/test_opentimestamps_enricher.py | 127 ++++++++++-------- 3 files changed, 109 insertions(+), 88 deletions(-) diff --git a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py index b489d66..283d114 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/__manifest__.py @@ -18,12 +18,12 @@ ], "help": "List of OpenTimestamps calendar servers to use for timestamping. See here for a list of calendars maintained by opentimestamps:\ https://opentimestamps.org/#calendars", - "type": "list" + "type": "list", }, "calendar_whitelist": { "default": [], "help": "Optional whitelist of calendar servers. 
Override this if you are using your own calendar servers. e.g. ['https://mycalendar.com']", - "type": "list" + "type": "list", }, }, "description": """ @@ -96,5 +96,5 @@ Calendar https://alice.btc.calendar.opentimestamps.org: Timestamped by transacti if you want to use your own calendars, then you can override this setting in the `calendar_whitelist` configuration option. - """ -} \ No newline at end of file + """, +} diff --git a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py index 4785dd2..d909d8e 100644 --- a/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py +++ b/src/auto_archiver/modules/opentimestamps_enricher/opentimestamps_enricher.py @@ -11,8 +11,8 @@ from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media from auto_archiver.utils.misc import get_current_timestamp -class OpentimestampsEnricher(Enricher): +class OpentimestampsEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"OpenTimestamps timestamping files for {url=}") @@ -31,42 +31,42 @@ class OpentimestampsEnricher(Enricher): if not os.path.exists(file_path): logger.warning(f"File not found: {file_path}") continue - + # Create timestamp for the file - hash is SHA256 # Note: hash is hard-coded to SHA256 and does not use hash_enricher to set it. 
# SHA256 is the recommended hash, ref: https://github.com/bellingcat/auto-archiver/pull/247#discussion_r1992433181 logger.debug(f"Creating timestamp for {file_path}") file_hash = None - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: file_hash = OpSHA256().hash_fd(f) if not file_hash: logger.warning(f"Failed to hash file for timestamping, skipping: {file_path}") continue - + # Create a timestamp with the file hash timestamp = Timestamp(file_hash) - + # Create a detached timestamp file with the hash operation and timestamp detached_timestamp = DetachedTimestampFile(OpSHA256(), timestamp) - + # Submit to calendar servers submitted_to_calendar = False logger.debug(f"Submitting timestamp to calendar servers for {file_path}") calendars = [] whitelist = DEFAULT_CALENDAR_WHITELIST - + if self.calendar_whitelist: whitelist = set(self.calendar_whitelist) - + # Create calendar instances calendar_urls = [] for url in self.calendar_urls: if url in whitelist: calendars.append(RemoteCalendar(url)) calendar_urls.append(url) - + # Submit the hash to each calendar for calendar in calendars: try: @@ -76,17 +76,19 @@ class OpentimestampsEnricher(Enricher): submitted_to_calendar = True except Exception as e: logger.warning(f"Failed to submit to calendar {calendar.url}: {e}") - + # If all calendar submissions failed, add pending attestations if not submitted_to_calendar and not timestamp.attestations: - logger.error(f"Failed to submit to any calendar for {file_path}. **This file will not be timestamped.**") + logger.error( + f"Failed to submit to any calendar for {file_path}. 
**This file will not be timestamped.**" + ) media.set("opentimestamps", False) continue - + # Save the timestamp proof to a file timestamp_path = os.path.join(self.tmp_dir, f"{os.path.basename(file_path)}.ots") try: - with open(timestamp_path, 'wb') as f: + with open(timestamp_path, "wb") as f: # Create a serialization context and write to the file ctx = serialize.BytesSerializationContext() detached_timestamp.serialize(ctx) @@ -94,25 +96,25 @@ class OpentimestampsEnricher(Enricher): except Exception as e: logger.warning(f"Failed to serialize timestamp file: {e}") continue - + # Create media for the timestamp file timestamp_media = Media(filename=timestamp_path) # explicitly set the mimetype, normally .ots files are 'application/vnd.oasis.opendocument.spreadsheet-template' timestamp_media.mimetype = "application/vnd.opentimestamps" timestamp_media.set("opentimestamps_version", opentimestamps.__version__) - + verification_info = self.verify_timestamp(detached_timestamp) for key, value in verification_info.items(): timestamp_media.set(key, value) - + media.set("opentimestamp_files", [timestamp_media]) timestamp_files.append(timestamp_media.filename) # Update the original media to indicate it's been timestamped media.set("opentimestamps", True) - + except Exception as e: logger.warning(f"Error while timestamping {media.filename}: {e}") - + # Add timestamp files to the metadata if timestamp_files: to_enrich.set("opentimestamped", True) @@ -121,43 +123,43 @@ class OpentimestampsEnricher(Enricher): else: to_enrich.set("opentimestamped", False) logger.warning(f"No successful timestamps created for {url=}") - + def verify_timestamp(self, detached_timestamp): """ Verify a timestamp and extract verification information. - + Args: detached_timestamp: The detached timestamp to verify. - + Returns: dict: Information about the verification result. 
""" result = {} - + # Check if we have attestations attestations = list(detached_timestamp.timestamp.all_attestations()) result["attestation_count"] = len(attestations) - + if attestations: attestation_info = [] for msg, attestation in attestations: info = {} - + # Process different types of attestations if isinstance(attestation, PendingAttestation): info["status"] = "pending" info["uri"] = attestation.uri - + elif isinstance(attestation, BitcoinBlockHeaderAttestation): info["status"] = "confirmed" info["block_height"] = attestation.height info["last_check"] = get_current_timestamp() - + attestation_info.append(info) - + result["attestations"] = attestation_info - + # For at least one confirmed attestation if any("confirmed" in a.get("status") for a in attestation_info): result["verified"] = True @@ -166,5 +168,5 @@ class OpentimestampsEnricher(Enricher): else: result["verified"] = False result["last_updated"] = get_current_timestamp() - - return result \ No newline at end of file + + return result diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index 5b6a079..8d535d0 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -1,7 +1,4 @@ -from pathlib import Path import pytest -import os -import tempfile import hashlib from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile @@ -26,136 +23,146 @@ def sample_file_path(tmp_path): tmp_file.write_text("This is a test file content for OpenTimestamps") return str(tmp_file) + @pytest.fixture def detached_timestamp_file(): """Create a simple detached timestamp file for testing""" file_hash = hashlib.sha256(b"Test content").digest() from opentimestamps.core.op import OpSHA256 + file_hash_op = OpSHA256() timestamp = Timestamp(file_hash) - + # Add a pending attestation pending = PendingAttestation("https://example.calendar.com") timestamp.attestations.add(pending) - + # Add a bitcoin 
attestation bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height timestamp.attestations.add(bitcoin) - + return DetachedTimestampFile(file_hash_op, timestamp) + @pytest.fixture def verified_timestamp_file(): """Create a timestamp file with a Bitcoin attestation""" file_hash = hashlib.sha256(b"Verified content").digest() from opentimestamps.core.op import OpSHA256 + file_hash_op = OpSHA256() timestamp = Timestamp(file_hash) - + # Add only a Bitcoin attestation bitcoin = BitcoinBlockHeaderAttestation(783000) # Some block height timestamp.attestations.add(bitcoin) - + return DetachedTimestampFile(file_hash_op, timestamp) + @pytest.fixture def pending_timestamp_file(): """Create a timestamp file with only pending attestations""" file_hash = hashlib.sha256(b"Pending content").digest() from opentimestamps.core.op import OpSHA256 + file_hash_op = OpSHA256() timestamp = Timestamp(file_hash) - + # Add only pending attestations pending1 = PendingAttestation("https://example1.calendar.com") pending2 = PendingAttestation("https://example2.calendar.com") timestamp.attestations.add(pending1) timestamp.attestations.add(pending2) - + return DetachedTimestampFile(file_hash_op, timestamp) + @pytest.mark.download def test_download_tsr(setup_module, mocker): """Test submitting a hash to calendar servers""" # Mock the RemoteCalendar submit method - mock_submit = mocker.patch.object(RemoteCalendar, 'submit') + mock_submit = mocker.patch.object(RemoteCalendar, "submit") test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) mock_submit.return_value = test_timestamp - ots = setup_module("opentimestamps_enricher") - + # Create a calendar calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org") - + # Test submission file_hash = hashlib.sha256(b"Test file content").digest() result = calendar.submit(file_hash) - + assert mock_submit.called assert isinstance(result, Timestamp) assert result == test_timestamp + def test_verify_timestamp(setup_module, 
detached_timestamp_file): """Test the verification of timestamp attestations""" ots = setup_module("opentimestamps_enricher") - + # Test verification verification_info = ots.verify_timestamp(detached_timestamp_file) - + # Check verification results assert verification_info["attestation_count"] == 2 assert verification_info["verified"] == True assert len(verification_info["attestations"]) == 2 - + # Check attestation types assertion_types = [a["status"] for a in verification_info["attestations"]] assert "pending" in assertion_types assert "confirmed" in assertion_types - + # Check Bitcoin attestation details bitcoin_attestation = next(a for a in verification_info["attestations"] if a["status"] == "confirmed") assert bitcoin_attestation["block_height"] == 783000 + def test_verify_pending_only(setup_module, pending_timestamp_file): """Test verification of timestamps with only pending attestations""" ots = setup_module("opentimestamps_enricher") - + verification_info = ots.verify_timestamp(pending_timestamp_file) - + assert verification_info["attestation_count"] == 2 assert verification_info["verified"] == False - + # All attestations should be of type "pending" assert all(a["status"] == "pending" for a in verification_info["attestations"]) - + # Check URIs of pending attestations uris = [a["uri"] for a in verification_info["attestations"]] assert "https://example1.calendar.com" in uris assert "https://example2.calendar.com" in uris + def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): """Test verification of timestamps with completed Bitcoin attestations""" ots = setup_module("opentimestamps_enricher") - + verification_info = ots.verify_timestamp(verified_timestamp_file) - + assert verification_info["attestation_count"] == 1 assert verification_info["verified"] == True assert "pending" not in verification_info - + # Check that the attestation is a Bitcoin attestation attestation = verification_info["attestations"][0] assert attestation["status"] 
== "confirmed" assert attestation["block_height"] == 783000 + def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): """Test the complete enrichment process""" # Mock the calendar submission to avoid network requests - mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') - + mock_calendar = mocker.patch.object(RemoteCalendar, "submit") + # Create a function that returns a new timestamp for each call def side_effect(digest): test_timestamp = Timestamp(digest) @@ -163,97 +170,109 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): bitcoin = BitcoinBlockHeaderAttestation(783000) test_timestamp.attestations.add(bitcoin) return test_timestamp - + mock_calendar.side_effect = side_effect ots = setup_module("opentimestamps_enricher") - + # Create test metadata with sample file metadata = Metadata().set_url("https://example.com") sample_media.filename = sample_file_path metadata.add_media(sample_media) - + # Run enrichment ots.enrich(metadata) - + # Verify results assert metadata.get("opentimestamped") == True assert metadata.get("opentimestamps_count") == 1 - + # Check that we have one parent media item: the original assert len(metadata.media) == 1 - + # Check that the original media was updated assert metadata.media[0].get("opentimestamps") == True - + # Check the timestamp file media is a child of the original assert len(metadata.media[0].get("opentimestamp_files")) == 1 timestamp_media = metadata.media[0].get("opentimestamp_files")[0] assert timestamp_media.get("opentimestamps_version") is not None - + # Check verification results on the timestamp media assert timestamp_media.get("verified") == True assert timestamp_media.get("attestation_count") == 1 -def test_full_enriching_one_calendar_error(setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file): + +def test_full_enriching_one_calendar_error( + setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file +): """Test 
enrichment when one calendar server returns an error""" # Mock the calendar submission to raise an exception - mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') - + mock_calendar = mocker.patch.object(RemoteCalendar, "submit") + test_timestamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935")) - # Add a bitcoin attestation to the test timestamp + # Add a bitcoin attestation to the test timestamp bitcoin = BitcoinBlockHeaderAttestation(783000) test_timestamp.attestations.add(bitcoin) mock_calendar.side_effect = [test_timestamp, Exception("Calendar server error")] - ots = setup_module("opentimestamps_enricher", {"calendar_urls": ["https://alice.btc.calendar.opentimestamps.org", "https://bob.btc.calendar.opentimestamps.org"]}) - + ots = setup_module( + "opentimestamps_enricher", + { + "calendar_urls": [ + "https://alice.btc.calendar.opentimestamps.org", + "https://bob.btc.calendar.opentimestamps.org", + ] + }, + ) + # Create test metadata with sample file metadata = Metadata().set_url("https://example.com") sample_media.filename = sample_file_path metadata.add_media(sample_media) - + # Run enrichment (should complete despite calendar errors) ots.enrich(metadata) - + # Verify results assert metadata.get("opentimestamped") == True - assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob + assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob + def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker): """Test enrichment when calendar servers return errors""" # Mock the calendar submission to raise an exception - mock_calendar = mocker.patch.object(RemoteCalendar, 'submit') + mock_calendar = mocker.patch.object(RemoteCalendar, "submit") mock_calendar.side_effect = Exception("Calendar server error") - ots = setup_module("opentimestamps_enricher") - + # Create test metadata with sample file metadata = 
Metadata().set_url("https://example.com") sample_media.filename = sample_file_path metadata.add_media(sample_media) - + # Run enrichment (should complete despite calendar errors) ots.enrich(metadata) - + # Verify results assert metadata.get("opentimestamped") == False assert metadata.get("opentimestamps_count") is None + def test_no_files_to_stamp(setup_module): """Test enrichment with no files to timestamp""" ots = setup_module("opentimestamps_enricher") - + # Create empty metadata metadata = Metadata().set_url("https://example.com") - + # Run enrichment ots.enrich(metadata) - + # Verify no timestamping occurred assert metadata.get("opentimestamped") is None - assert len(metadata.media) == 0 \ No newline at end of file + assert len(metadata.media) == 0 From b21467c922f538a4093ba7c0382b5c78def28728 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 14 Mar 2025 12:59:37 +0000 Subject: [PATCH 12/12] Fix ruff checks --- .../enrichers/test_opentimestamps_enricher.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/enrichers/test_opentimestamps_enricher.py b/tests/enrichers/test_opentimestamps_enricher.py index 8d535d0..99ddd66 100644 --- a/tests/enrichers/test_opentimestamps_enricher.py +++ b/tests/enrichers/test_opentimestamps_enricher.py @@ -86,8 +86,6 @@ def test_download_tsr(setup_module, mocker): test_timestamp = Timestamp(hashlib.sha256(b"test").digest()) mock_submit.return_value = test_timestamp - ots = setup_module("opentimestamps_enricher") - # Create a calendar calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org") @@ -109,7 +107,7 @@ def test_verify_timestamp(setup_module, detached_timestamp_file): # Check verification results assert verification_info["attestation_count"] == 2 - assert verification_info["verified"] == True + assert verification_info["verified"] is True assert len(verification_info["attestations"]) == 2 # Check attestation types @@ -129,7 +127,7 @@ def 
test_verify_pending_only(setup_module, pending_timestamp_file): verification_info = ots.verify_timestamp(pending_timestamp_file) assert verification_info["attestation_count"] == 2 - assert verification_info["verified"] == False + assert verification_info["verified"] is False # All attestations should be of type "pending" assert all(a["status"] == "pending" for a in verification_info["attestations"]) @@ -148,7 +146,7 @@ def test_verify_bitcoin_completed(setup_module, verified_timestamp_file): verification_info = ots.verify_timestamp(verified_timestamp_file) assert verification_info["attestation_count"] == 1 - assert verification_info["verified"] == True + assert verification_info["verified"] is True assert "pending" not in verification_info # Check that the attestation is a Bitcoin attestation @@ -184,14 +182,14 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): ots.enrich(metadata) # Verify results - assert metadata.get("opentimestamped") == True + assert metadata.get("opentimestamped") is True assert metadata.get("opentimestamps_count") == 1 # Check that we have one parent media item: the original assert len(metadata.media) == 1 # Check that the original media was updated - assert metadata.media[0].get("opentimestamps") == True + assert metadata.media[0].get("opentimestamps") is True # Check the timestamp file media is a child of the original assert len(metadata.media[0].get("opentimestamp_files")) == 1 @@ -201,7 +199,7 @@ def test_full_enriching(setup_module, sample_file_path, sample_media, mocker): assert timestamp_media.get("opentimestamps_version") is not None # Check verification results on the timestamp media - assert timestamp_media.get("verified") == True + assert timestamp_media.get("verified") is True assert timestamp_media.get("attestation_count") == 1 @@ -238,7 +236,7 @@ def test_full_enriching_one_calendar_error( ots.enrich(metadata) # Verify results - assert metadata.get("opentimestamped") == True + assert 
metadata.get("opentimestamped") is True assert metadata.get("opentimestamps_count") == 1 # only alice worked, not bob @@ -259,7 +257,7 @@ def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_me ots.enrich(metadata) # Verify results - assert metadata.get("opentimestamped") == False + assert metadata.get("opentimestamped") is False assert metadata.get("opentimestamps_count") is None