Mirror of https://github.com/bellingcat/auto-archiver

Initial Atlos merge

parent 22932645aa
commit 6cb7afefdc
@@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
 
 [[package]]
 name = "authlib"
-version = "1.5.0"
+version = "1.5.1"
 description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "Authlib-1.5.0-py2.py3-none-any.whl", hash = "sha256:b3cc5ccfc19cf87678046b6e7cb19d402d8a631a33c40e36385232203227953a"},
-    {file = "authlib-1.5.0.tar.gz", hash = "sha256:8fd8bd8f806485a532ac39a17b579982cf54688f956174f995cc938a91725423"},
+    {file = "authlib-1.5.1-py2.py3-none-any.whl", hash = "sha256:8408861cbd9b4ea2ff759b00b6f02fd7d81ac5a56d0b2b22c08606c6049aae11"},
+    {file = "authlib-1.5.1.tar.gz", hash = "sha256:5cbc85ecb0667312c1cdc2f9095680bb735883b123fb509fde1e65b1c5df972e"},
 ]
 
 [package.dependencies]
@@ -172,18 +172,18 @@ lxml = ["lxml"]
 
 [[package]]
 name = "boto3"
-version = "1.37.0"
+version = "1.37.5"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "boto3-1.37.0-py3-none-any.whl", hash = "sha256:03bd8c93b226f07d944fd6b022e11a307bff94ab6a21d51675d7e3ea81ee8424"},
-    {file = "boto3-1.37.0.tar.gz", hash = "sha256:01015b38017876d79efd7273f35d9a4adfba505237159621365bed21b9b65eca"},
+    {file = "boto3-1.37.5-py3-none-any.whl", hash = "sha256:12166353519aca0cc8d9dcfbbb0d38f8915955a5912b8cb241b2b2314f0dbc14"},
+    {file = "boto3-1.37.5.tar.gz", hash = "sha256:ae6e7048beeaa4478368e554a4b290e3928beb0ae8d8767d108d72381a81af30"},
 ]
 
 [package.dependencies]
-botocore = ">=1.37.0,<1.38.0"
+botocore = ">=1.37.5,<1.38.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.11.0,<0.12.0"
 
@@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
 
 [[package]]
 name = "botocore"
-version = "1.37.0"
+version = "1.37.5"
 description = "Low-level, data-driven core of boto 3."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "botocore-1.37.0-py3-none-any.whl", hash = "sha256:d01661f38c0edac87424344cdf4169f3ab9bc1bf1b677c8b230d025eb66c54a3"},
-    {file = "botocore-1.37.0.tar.gz", hash = "sha256:b129d091a8360b4152ab65327186bf4e250de827c4a9b7ddf40a72b1acf1f3c1"},
+    {file = "botocore-1.37.5-py3-none-any.whl", hash = "sha256:e5cfbb8026d5b4fadd9b3a18b61d238a41a8b8f620ab75873dc1467d456150d6"},
+    {file = "botocore-1.37.5.tar.gz", hash = "sha256:f8f526d33ae74d242c577e0440b57b9ec7d53edd41db211155ec8087fe7a5a21"},
 ]
 
 [package.dependencies]
@@ -781,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
 
 [[package]]
 name = "google-api-python-client"
-version = "2.161.0"
+version = "2.162.0"
 description = "Google API Client Library for Python"
 optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
-    {file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"},
-    {file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"},
+    {file = "google_api_python_client-2.162.0-py2.py3-none-any.whl", hash = "sha256:49365fa4f7795fe81a747f5544d6528ea94314fa59664e0ea1005f603facf1ec"},
+    {file = "google_api_python_client-2.162.0.tar.gz", hash = "sha256:5f8bc934a5b6eea73a7d12d999e6585c1823179f48340234acb385e2502e735a"},
 ]
 
 [package.dependencies]
@@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"]
 
 [[package]]
 name = "googleapis-common-protos"
-version = "1.68.0"
+version = "1.69.0"
 description = "Common protobufs used in Google APIs"
 optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
-    {file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"},
-    {file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"},
+    {file = "googleapis_common_protos-1.69.0-py2.py3-none-any.whl", hash = "sha256:17835fdc4fa8da1d61cfe2d4d5d57becf7c61d4112f8d81c67eaa9d7ce43042d"},
+    {file = "googleapis_common_protos-1.69.0.tar.gz", hash = "sha256:5a46d58af72846f59009b9c4710425b9af2139555c71837081706b213b298187"},
 ]
 
 [package.dependencies]
@@ -878,14 +878,14 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
 
 [[package]]
 name = "gspread"
-version = "6.1.4"
+version = "6.2.0"
 description = "Google Spreadsheets Python API"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "gspread-6.1.4-py3-none-any.whl", hash = "sha256:c34781c426031a243ad154952b16f21ac56a5af90687885fbee3d1fba5280dcd"},
-    {file = "gspread-6.1.4.tar.gz", hash = "sha256:b8eec27de7cadb338bb1b9f14a9be168372dee8965c0da32121816b5050ac1de"},
+    {file = "gspread-6.2.0-py3-none-any.whl", hash = "sha256:7fa1a11e1ecacc6c5946fa016be05941baca8540404314f59aec963dd8ae5db3"},
+    {file = "gspread-6.2.0.tar.gz", hash = "sha256:bc3d02d1c39e0b40bfc8035b4fec407aa71a17f343fc81cc7e3f75bfa6555de6"},
 ]
 
 [package.dependencies]
@@ -1777,14 +1777,14 @@ files = [
 
 [[package]]
 name = "pytest"
-version = "8.3.4"
+version = "8.3.5"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.8"
 groups = ["dev"]
 files = [
-    {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
-    {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
+    {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
+    {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
 ]
 
 [package.dependencies]
@@ -2248,14 +2248,14 @@ files = [
 
 [[package]]
 name = "s3transfer"
-version = "0.11.2"
+version = "0.11.3"
 description = "An Amazon S3 Transfer Manager"
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"},
-    {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"},
+    {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"},
+    {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"},
 ]
 
 [package.dependencies]
@@ -1,7 +1,7 @@
 {
-    "name": "Atlos Feeder Database",
-    "type": ["feeder", "database"],
-    "entry_point": "atlos_feeder_db::AtlosFeederDb",
+    "name": "Atlos Feeder Database Storage",
+    "type": ["feeder", "database", "storage"],
+    "entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
     "requires_setup": True,
     "dependencies": {
         "python": ["loguru", "requests"],
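The manifest's "entry_point" now points at the merged class using the "module::ClassName" convention. As a rough illustration only (the actual auto-archiver module loader is not part of this diff, and the package path below is a hypothetical placeholder), such a string could be resolved like this:

# Sketch only: auto-archiver's real loader is not shown in this diff; the
# "auto_archiver.modules.<name>.<name>" path is a hypothetical placeholder.
import importlib


def load_entry_point(entry_point: str, package: str = "auto_archiver.modules"):
    """Resolve a "module::ClassName" entry_point string to a class object."""
    module_name, class_name = entry_point.split("::")
    module = importlib.import_module(f"{package}.{module_name}.{module_name}")
    return getattr(module, class_name)


# Usage (hypothetical): cls = load_entry_point("atlos_feeder_db_storage::AtlosFeederDbStorage")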
@@ -19,11 +19,9 @@
         },
     },
     "description": """
-AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival,
+AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, upload extracted media,
 along with a database option to output archival results.
 
-Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
-
 ### Features
 - Connects to the Atlos API to retrieve a list of source material URLs.
 - Filters source materials based on visibility, processing status, and metadata.
@@ -33,6 +31,7 @@
 - Updates failure status with error details when archiving fails.
 - Processes and formats metadata, including ISO formatting for datetime fields.
 - Skips processing for items without an Atlos ID.
+- Saves media files to Atlos, organizing them into folders based on the provided path structure.
 
 ### Notes
 - Requires an Atlos API endpoint and a valid API token for authentication.
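The feature list above says the feeder filters Atlos source material by visibility, processing status, and metadata before yielding it for archiving. That filter is not shown in this diff, so the following is only a rough sketch of such logic; the field names ("visibility", "status", and the metadata flag) are assumptions, not taken from the Atlos API.

# Illustrative sketch only: field names and values below are assumed, not taken
# from this diff or from the Atlos API documentation.
from typing import Iterable, List


def filter_source_materials(materials: Iterable[dict]) -> List[dict]:
    """Keep only items that look ready to be archived."""
    ready = []
    for item in materials:
        if item.get("visibility") != "visible":  # skip hidden or removed material
            continue
        if item.get("status") in ("processing", "error"):  # skip items not yet ready
            continue
        if item.get("metadata", {}).get("auto_archiver", {}).get("processed"):
            continue  # already archived on a previous run (assumed flag)
        ready.append(item)
    return ready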
@@ -1,15 +1,19 @@
-import requests
+import hashlib
+import os
+from typing import IO, Optional
 from typing import Union
 
+import requests
 from loguru import logger
 
 from auto_archiver.core import Database
-
 from auto_archiver.core import Feeder
+from auto_archiver.core import Media
 from auto_archiver.core import Metadata
+from auto_archiver.core import Storage
 
 
-class AtlosFeederDb(Feeder, Database):
+class AtlosFeederDbStorage(Feeder, Database, Storage):
 
     def __iter__(self) -> Metadata:
         # Get all the urls from the Atlos API
@@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database):
         logger.info(
             f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
         )
+
+    def get_cdn_url(self, _media: Media) -> str:
+        # It's not always possible to provide an exact URL, because it's
+        # possible that the media once uploaded could have been copied to
+        # another project.
+        return self.atlos_url
+
+    def _hash(self, media: Media) -> str:
+        # Hash the media file using sha-256. We don't use the existing auto archiver
+        # hash because there's no guarantee that the configurer is using sha-256, which
+        # is how Atlos hashes files.
+
+        sha256 = hashlib.sha256()
+        with open(media.filename, "rb") as f:
+            while True:
+                buf = f.read(4096)
+                if not buf: break
+                sha256.update(buf)
+        return sha256.hexdigest()
+
+    def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
+        atlos_id = metadata.get("atlos_id")
+        if atlos_id is None:
+            logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
+            return False
+
+        media_hash = self._hash(media)
+        # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
+
+        # Check whether the media has already been uploaded
+        source_material = requests.get(
+            f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+        ).json()["result"]
+        existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
+        if media_hash in existing_media:
+            logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
+            return True
+
+        # Upload the media to the Atlos API
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            params={
+                "title": media.properties
+            },
+            files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))},
+        ).raise_for_status()
+
+        logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
+
+        return True
+
+    # must be implemented even if unused
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
+        pass
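For reference, the upload flow added above reads as: hash the file with chunked SHA-256 (the algorithm Atlos uses for artifacts), fetch the source material's existing artifacts, skip the upload if the hash already appears, otherwise POST the file. Below is a standalone, hedged sketch of that flow using plain requests; the base URL, token, ID, and file path are placeholders, while the endpoints, the Authorization header, and the "file_hash_sha256" field mirror the code in this diff.

# Standalone sketch of the duplicate-check-then-upload flow shown in upload() above.
# ATLOS_URL, API_TOKEN, ATLOS_ID, and PATH are placeholders to fill in.
import hashlib
import os

import requests

ATLOS_URL = "https://atlos.example.org"  # placeholder; use your Atlos deployment's URL
API_TOKEN = "YOUR_API_TOKEN"             # placeholder
ATLOS_ID = 12345                         # placeholder source-material ID
PATH = "media/example.mp4"               # placeholder local file

# Chunked SHA-256, matching how the module (and Atlos) hash artifacts
sha256 = hashlib.sha256()
with open(PATH, "rb") as f:
    for chunk in iter(lambda: f.read(4096), b""):
        sha256.update(chunk)
file_hash = sha256.hexdigest()

# Fetch the source material and compare against existing artifact hashes
resp = requests.get(
    f"{ATLOS_URL}/api/v2/source_material/{ATLOS_ID}",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
)
resp.raise_for_status()
existing = {a.get("file_hash_sha256") for a in resp.json()["result"].get("artifacts", [])}

if file_hash not in existing:
    with open(PATH, "rb") as f:
        requests.post(
            f"{ATLOS_URL}/api/v2/source_material/upload/{ATLOS_ID}",
            headers={"Authorization": f"Bearer {API_TOKEN}"},
            files={"file": (os.path.basename(PATH), f)},
        ).raise_for_status()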