Mirror of https://github.com/bellingcat/auto-archiver
commit eebd040e13

.github/workflows/tests-core.yaml
@ -0,0 +1,38 @@
name: Core Tests

on:
  push:
    branches: [ main ]
    paths:
      - src/**
  pull_request:
    paths:
      - src/**

jobs:
  tests:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
    defaults:
      run:
        working-directory: ./

    steps:
      - uses: actions/checkout@v4

      - name: Install Poetry
        run: pipx install poetry

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'poetry'

      - name: Install dependencies
        run: poetry install --no-interaction --with dev

      - name: Run Core Tests
        run: poetry run pytest -ra -v -m "not download"

.github/workflows/tests-download.yaml
@ -0,0 +1,38 @@
name: Download Tests

on:
  schedule:
    - cron: '35 14 * * 1'
  pull_request:
    branches: [ main ]
    paths:
      - src/**

jobs:
  tests:
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"] # only run expensive downloads on one (lowest) python version
    defaults:
      run:
        working-directory: ./

    steps:
      - uses: actions/checkout@v4

      - name: Install poetry
        run: pipx install poetry

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'poetry'

      - name: Install dependencies
        run: poetry install --no-interaction --with dev

      - name: Run Download Tests
        run: poetry run pytest -ra -v -m "download"

README.md
@ -2,6 +2,8 @@
[](https://badge.fury.io/py/auto-archiver)
[](https://hub.docker.com/r/bellingcat/auto-archiver)
[](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-core.yaml)
[](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml)
<!--  -->
<!-- [](https://pypi.python.org/pypi/auto-archiver/) -->
<!-- [](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->

@ -259,6 +261,20 @@ The "archive location" link contains the path of the archived file, in local sto
## Development
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.

### Testing

Tests are split using `pytest.mark` into 'core' and 'download' tests. Download tests will hit the network and make API calls (e.g. to Twitter, Bluesky) and should be run regularly to make sure that those APIs have not changed; see the sketch of a marked test after the commands below.

Tests can be run as follows:
```
# run core tests
pytest -ra -v -m "not download" # or poetry run pytest -ra -v -m "not download"
# run download tests
pytest -ra -v -m "download" # or poetry run pytest -ra -v -m "download"
# run all tests
pytest -ra -v # or poetry run pytest -ra -v
```
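
A "download" test is simply one tagged with the `download` marker that this commit registers in `pyproject.toml`; a minimal hypothetical example (test name and body invented for illustration):

```
import pytest

@pytest.mark.download
def test_fetch_from_live_api():
    # talks to a real remote API, so `pytest -m "not download"` deselects it
    ...
```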

#### Docker development
Working with Docker locally:
* `docker build . -t auto-archiver` to build a local image

poetry.lock
@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.

[[package]]
name = "aiohappyeyeballs"

@ -221,6 +221,22 @@ files = [
[package.dependencies]
cryptography = "*"

[[package]]
name = "autopep8"
version = "2.3.1"
description = "A tool that automatically formats Python code to conform to the PEP 8 style guide"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
    {file = "autopep8-2.3.1-py2.py3-none-any.whl", hash = "sha256:a203fe0fcad7939987422140ab17a930f684763bf7335bdb6709991dd7ef6c2d"},
    {file = "autopep8-2.3.1.tar.gz", hash = "sha256:8d6c87eba648fdcfc83e29b788910b8643171c395d9c4bcf115ece035b9c9dda"},
]

[package.dependencies]
pycodestyle = ">=2.12.0"
tomli = {version = "*", markers = "python_version < \"3.11\""}

[[package]]
name = "beautifulsoup4"
version = "4.12.3"

@ -747,12 +763,12 @@ version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
groups = ["main"]
markers = "platform_system == \"Windows\" or sys_platform == \"win32\""
groups = ["main", "dev"]
files = [
    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", dev = "sys_platform == \"win32\""}

[[package]]
name = "cryptography"

@ -845,7 +861,7 @@ version = "1.2.2"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
groups = ["main"]
groups = ["main", "dev"]
markers = "python_version < \"3.11\""
files = [
    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},

@ -1237,6 +1253,18 @@ files = [
[package.extras]
all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]

[[package]]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]

[[package]]
name = "instaloader"
version = "4.14"

@ -1760,7 +1788,7 @@ version = "24.2"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.8"
groups = ["main"]
groups = ["main", "dev"]
files = [
    {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
    {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},

@ -1876,6 +1904,22 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
typing = ["typing-extensions"]
xmp = ["defusedxml"]

[[package]]
name = "pluggy"
version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]

[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]

[[package]]
name = "propcache"
version = "0.2.1"

@ -2045,6 +2089,18 @@ files = [
[package.dependencies]
pyasn1 = ">=0.4.6,<0.7.0"

[[package]]
name = "pycodestyle"
version = "2.12.1"
description = "Python style guide checker"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
    {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"},
    {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
]

[[package]]
name = "pycparser"
version = "2.22"

@ -2173,6 +2229,29 @@ files = [
    {file = "pysubs2-1.8.0.tar.gz", hash = "sha256:3397bb58a4a15b1325ba2ae3fd4d7c214e2c0ddb9f33190d6280d783bb433b20"},
]

[[package]]
name = "pytest"
version = "8.3.4"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
    {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
    {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
]

[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=1.5,<2"
tomli = {version = ">=1", markers = "python_version < \"3.11\""}

[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

[[package]]
name = "python-dateutil"
version = "2.9.0.post0"

@ -2659,6 +2738,49 @@ files = [
    {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]

[[package]]
name = "tomli"
version = "2.2.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
markers = "python_version < \"3.11\""
files = [
    {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"},
    {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"},
    {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"},
    {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"},
    {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"},
    {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"},
    {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"},
    {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"},
    {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"},
    {file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"},
    {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"},
    {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"},
    {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"},
    {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"},
    {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"},
    {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"},
    {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"},
    {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"},
    {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"},
    {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"},
    {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"},
    {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"},
    {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"},
    {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"},
    {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"},
    {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"},
    {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"},
    {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"},
    {file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"},
    {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"},
    {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"},
    {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"},
]

[[package]]
name = "tqdm"
version = "4.67.1"

@ -3174,4 +3296,4 @@ test = ["pytest (>=8.1,<9.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "7a5be9f0580f12d85fb5d5e60814148aa3fce47e9c9e7d255977ef185245ed96"
content-hash = "7c7dc6d26e5af1c9bb6e4393b4ac64b155049d20a9f5317baec48c964a2708ac"

pyproject.toml
@ -62,10 +62,9 @@ dependencies = [
    "toml (>=0.10.2,<0.11.0)"
]

[poetry.group.dev.dependencies]
autopep8 = "*"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
autopep8 = "^2.3.1"

[project.scripts]
auto-archiver = "auto_archiver.__main__:main"

@ -74,3 +73,9 @@ auto-archiver = "auto_archiver.__main__:main"
homepage = "https://github.com/bellingcat/auto-archiver"
repository = "https://github.com/bellingcat/auto-archiver"
documentation = "https://github.com/bellingcat/auto-archiver"

[tool.pytest.ini_options]
markers = [
    "download: marks tests that download content from the network",
]

twitter_archiver.py
@ -114,6 +114,10 @@ class TwitterArchiver(Archiver):
        result = Metadata()
        tweet = r.json()

        if tweet.get('__typename') == 'TweetTombstone':
            logger.error(f"Failed to get tweet {tweet_id}: {tweet['tombstone']['text']['text']}")
            return False

        urls = []
        for p in tweet.get("photos", []):
            urls.append(p["url"])

@ -158,7 +162,8 @@ class TwitterArchiver(Archiver):
            .set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
            return result.success("twitter-ytdl")
            result.status = "twitter-ytdl"
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
            mimetype = ""

hash_enricher.py
@ -14,9 +14,26 @@ class HashEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        algo_choices = self.configs()["algorithm"]["choices"]
        algos = self.configs()["algorithm"]
        algo_choices = algos["choices"]
        if not getattr(self, 'algorithm', None):
        if not config.get('algorithm'):
            logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
            self.algorithm = algos["default"]
        else:
            self.algorithm = config["algorithm"]

        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."

        if not getattr(self, 'chunksize', None):
        if config.get('chunksize'):
            self.chunksize = config["chunksize"]
        else:
            self.chunksize = self.configs()["chunksize"]["default"]

        self.chunksize = int(self.chunksize)
        assert self.chunksize >= -1, "read length must be non-negative or -1"

        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)

    @staticmethod
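
For context on the configuration above: the enricher hashes each file by reading it in chunks of `chunksize` bytes and feeding them to the selected algorithm. A minimal sketch of that pattern using plain hashlib (an illustration with invented names, not this module's actual code):

import hashlib

def chunked_hash(path: str, algorithm: str = "SHA-256", chunksize: int = 16000000) -> str:
    # map the two supported choice names onto hashlib constructors
    hasher = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}[algorithm]()
    with open(path, "rb") as f:
        while chunk := f.read(chunksize):  # read at most `chunksize` bytes per iteration
            hasher.update(chunk)
    return hasher.hexdigest()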

@ -1,4 +1,6 @@
import unittest
import tempfile

if __name__ == '__main__':
    unittest.main()
from auto_archiver.core.context import ArchivingContext

ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())

@ -1,7 +0,0 @@
import tempfile

from auto_archiver.core.context import ArchivingContext


ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())

test_archiver_base.py
@ -1,22 +1,27 @@
import pytest

from auto_archiver.core import Metadata
from auto_archiver.core import Step
from auto_archiver.core.metadata import Metadata

class TestArchiverBase(object):

    archiver_class = None
    config = None

    def setUp(self):
    @pytest.fixture(autouse=True)
    def setup_archiver(self):
        assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
        self.archiver = self.archiver_class(self.config)

    def create_item(self, url, **kwargs):
        item = Metadata().set_url(url)
        for key, value in kwargs.items():
            item.set(key, value)
        return item

    def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
        assert test_response is not False

    def assertValidResponseMetadata(self, test_response, title, timestamp):
        self.assertTrue(test_response.is_success())
        self.assertEqual(title, test_response.get_title())
        self.assertTrue(timestamp, test_response.get("timestamp"))
        if not status:
            assert test_response.is_success()
        else:
            assert status == test_response.status

        assert title == test_response.get_title()
        assert timestamp, test_response.get("timestamp")

@ -1,8 +1,9 @@
import pytest

from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
from .test_archiver_base import TestArchiverBase
import unittest

class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
class TestBlueskyArchiver(TestArchiverBase):
    """Tests Bluesky Archiver

    Note that these tests will download API responses from the bluesky API, so they may be slow.

@ -13,57 +14,60 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
    archiver_class = BlueskyArchiver
    config = {}

    @pytest.mark.download
    def test_download_media_with_images(self):
        # url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")

        # just make sure bsky haven't changed their format, images should be under "record/embed/media/images"
        # there should be 2 images
        self.assertTrue("record" in post)
        self.assertTrue("embed" in post["record"])
        self.assertTrue("media" in post["record"]["embed"])
        self.assertTrue("images" in post["record"]["embed"]["media"])
        self.assertEqual(len(post["record"]["embed"]["media"]["images"]), 2)
        assert "record" in post
        assert "embed" in post["record"]
        assert "media" in post["record"]["embed"]
        assert "images" in post["record"]["embed"]["media"]
        assert len(post["record"]["embed"]["media"]["images"]) == 2

        # try downloading the media files
        media = self.archiver._download_bsky_embeds(post)
        self.assertEqual(len(media), 2)
        assert len(media) == 2

        # check the IDs
        self.assertTrue("bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src'))
        self.assertTrue("bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src'))
        assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
        assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')

    @pytest.mark.download
    def test_download_post_with_single_image(self):
        # url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")

        # just make sure bsky haven't changed their format, images should be under "record/embed/images"
        # there should be 1 image
        self.assertTrue("record" in post)
        self.assertTrue("embed" in post["record"])
        self.assertTrue("images" in post["record"]["embed"])
        self.assertEqual(len(post["record"]["embed"]["images"]), 1)
        assert "record" in post
        assert "embed" in post["record"]
        assert "images" in post["record"]["embed"]
        assert len(post["record"]["embed"]["images"]) == 1

        media = self.archiver._download_bsky_embeds(post)
        self.assertEqual(len(media), 1)
        assert len(media) == 1

        # check the ID
        self.assertTrue("bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src'))
        assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')


    @pytest.mark.download
    def test_download_post_with_video(self):
        # url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")

        # just make sure bsky haven't changed their format, video should be under "record/embed/video"
        self.assertTrue("record" in post)
        self.assertTrue("embed" in post["record"])
        self.assertTrue("video" in post["record"]["embed"])
        assert "record" in post
        assert "embed" in post["record"]
        assert "video" in post["record"]["embed"]

        media = self.archiver._download_bsky_embeds(post)
        self.assertEqual(len(media), 1)
        assert len(media) == 1

        # check the ID
        self.assertTrue("bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src'))
        assert "bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src')

@ -1,128 +1,140 @@
import unittest
import datetime
import pytest

from auto_archiver.archivers.twitter_archiver import TwitterArchiver

from .test_archiver_base import TestArchiverBase


class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
class TestTwitterArchiver(TestArchiverBase):

    archiver_class = TwitterArchiver
    config = {}

    @pytest.mark.parametrize("url, expected", [
        ("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"),  # t.co URL
        ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"),  # x.com urls unchanged
        ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"),  # twitter urls unchanged
        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839"),  # strip tracking params
        ("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"),  # non-twitter/x urls unchanged
        ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"),  # shouldn't strip params from non-twitter/x URLs
    ])
    def test_sanitize_url(self, url, expected):
        assert expected == self.archiver.sanitize_url(url)

    def test_sanitize_url(self):
    @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://www.bellingcat.com/category/resources/", False, False)
    ])

        # should expand t.co URLs
        t_co_url = "https://t.co/yl3oOJatFp"
        t_co_resolved_url = "https://www.bellingcat.com/category/resources/"
        self.assertEqual(t_co_resolved_url, self.archiver.sanitize_url(t_co_url))
    def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):

        # shouldn't alter valid x URLs
        x_url = "https://x.com/bellingcat/status/1874097816571961839"
        self.assertEqual(x_url, self.archiver.sanitize_url(x_url))

        # shouldn't alter valid twitter.com URLs
        twitter_url = "https://twitter.com/bellingcat/status/1874097816571961839"
        self.assertEqual(twitter_url, self.archiver.sanitize_url(twitter_url))

        # should strip tracking params
        tracking_url = "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
        self.assertEqual("https://twitter.com/bellingcat/status/1874097816571961839", self.archiver.sanitize_url(tracking_url))

        # shouldn't alter non-twitter/x URLs
        test_url = "https://www.bellingcat.com/category/resources/"
        self.assertEqual(test_url, self.archiver.sanitize_url(test_url))

        # shouldn't strip params from non-twitter/x URLs
        test_url = "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
        self.assertEqual(test_url, self.archiver.sanitize_url(test_url))

    def test_get_username_tweet_id_from_url(self):

        # test valid twitter URL
        url = "https://twitter.com/bellingcat/status/1874097816571961839"
        username, tweet_id = self.archiver.get_username_tweet_id(url)
        self.assertEqual("bellingcat", username)
        self.assertEqual("1874097816571961839", tweet_id)
        assert exptected_username == username
        assert exptected_tweetid == tweet_id

        # test valid x URL
        url = "https://x.com/bellingcat/status/1874097816571961839"
        username, tweet_id = self.archiver.get_username_tweet_id(url)
        self.assertEqual("bellingcat", username)
        self.assertEqual("1874097816571961839", tweet_id)
    def test_choose_variants(self):
        # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
        variant_list = [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'},
                        {'bitrate': 256000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'},
                        {'bitrate': 832000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'},
                        {'bitrate': 2176000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12'}
                        ]
        chosen_variant = self.archiver.choose_variant(variant_list)
        assert chosen_variant == variant_list[3]

        # test invalid URL
        # TODO: should this return None, False or raise an exception? Right now it returns False
        url = "https://www.bellingcat.com/category/resources/"
        username, tweet_id = self.archiver.get_username_tweet_id(url)
        self.assertFalse(username)
        self.assertFalse(tweet_id)

    def test_youtube_dlp_archiver(self):

        url = "https://x.com/bellingcat/status/1874097816571961839"
        post = self.archiver.download_yt_dlp(self.create_item(url), url, "1874097816571961839")
        self.assertTrue(post)
        self.assertValidResponseMetadata(
            post,
            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
        )

    def test_reverse_engineer_token(self):
    @pytest.mark.parametrize("tweet_id, expected_token", [
        ("1874097816571961839", "4jjngwkifa"),
        ("1674700676612386816", "42586mwa3uv"),
        ("1877747914073620506", "4jv4aahw36n"),
        ("1876710769913450647", "4jruzjz5lux"),
        ("1346554693649113090", "39ibqxei7mo")
    ])
    def test_reverse_engineer_token(self, tweet_id, expected_token):
        # see Vercel's implementation here: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
        # and the discussion here: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215

        for tweet_id, real_token in [
            ("1874097816571961839", "4jjngwkifa"),
            ("1674700676612386816", "42586mwa3uv"),
            ("1877747914073620506", "4jv4aahw36n"),
            ("1876710769913450647", "4jruzjz5lux"),
            ("1346554693649113090", "39ibqxei7mo"),]:
            generated_token = self.archiver.generate_token(tweet_id)
            self.assertEqual(real_token, generated_token)
        generated_token = self.archiver.generate_token(tweet_id)
        assert expected_token == generated_token

    def test_syndication_archiver(self):
    @pytest.mark.download
    def test_youtube_dlp_archiver(self, make_item):

        url = "https://x.com/bellingcat/status/1874097816571961839"
        post = self.archiver.download_syndication(self.create_item(url), url, "1874097816571961839")
        self.assertTrue(post)
        post = self.archiver.download_yt_dlp(make_item(url), url, "1874097816571961839")
        assert post
        self.assertValidResponseMetadata(
            post,
            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
            "twitter-ytdl"
        )

    @pytest.mark.download
    def test_syndication_archiver(self, make_item):

        url = "https://x.com/bellingcat/status/1874097816571961839"
        post = self.archiver.download_syndication(make_item(url), url, "1874097816571961839")
        assert post
        self.assertValidResponseMetadata(
            post,
            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
        )

    def test_download_nonexistend_tweet(self):
    @pytest.mark.download
    def test_download_nonexistend_tweet(self, make_item):
        # this tweet does not exist
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.archiver.download(self.create_item(url))
        self.assertFalse(response)
        response = self.archiver.download(make_item(url))
        assert not response

    def test_download_malformed_tweetid(self):
    @pytest.mark.download
    def test_download_malformed_tweetid(self, make_item):
        # this tweet does not exist
        url = "https://x.com/Bellingcat/status/1719702586071100058"
        response = self.archiver.download(self.create_item(url))
        self.assertFalse(response)
        response = self.archiver.download(make_item(url))
        assert not response

    def test_download_media_with_images(self):
        # url https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
    @pytest.mark.download
    def test_download_tweet_no_media(self, make_item):

        post = self.archiver.download()
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.archiver.download(item)

        # just make sure twitter haven't changed their format, images should be under "record/embed/media/images"
        # there should be 2 images
        self.assertTrue("record" in post)
        self.assertTrue("embed" in post["record"])
        self.assertTrue("media" in post["record"]["embed"])
        self.assertTrue("images" in post["record"]["embed"]["media"])
        self.assertEqual(len(post["record"]["embed"]["media"]["images"]), 2)
        self.assertValidResponseMetadata(
            post,
            "Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
            "twitter-ytdl"
        )

        # try downloading the media files
        media = self.archiver.download(post)
        self.assertEqual(len(media), 2)
    @pytest.mark.download
    def test_download_video(self, make_item):
        url = "https://x.com/bellingcat/status/1871552600346415571"
        post = self.archiver.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            "This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
        )

        # check the IDs
        self.assertTrue("bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src'))
        self.assertTrue("bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src'))
    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
    @pytest.mark.download
    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
        ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
        ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
    ])
    def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):

        """Download tweets with sensitive media"""

        post = self.archiver.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            title,
            timestamp
        )
        assert len(post.media) == 1
        assert post.media[0].hash == image_hash

conftest.py
@ -0,0 +1,12 @@
import pytest
from auto_archiver.core.metadata import Metadata

@pytest.fixture
def make_item():
    def _make_item(url: str, **kwargs) -> Metadata:
        item = Metadata().set_url(url)
        for key, value in kwargs.items():
            item.set(key, value)
        return item

    return _make_item
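
The fixture returns a factory rather than a ready-made object, so each test can build its own Metadata item inline. A hypothetical usage sketch (test name and URL invented for illustration):

def test_make_item_sets_fields(make_item):
    # extra kwargs are applied via item.set(key, value), per the fixture above
    item = make_item("https://example.com/post/1", title="hello")
    assert item.get("title") == "hello"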

tests/data/testfile_1.txt
@ -0,0 +1 @@
test1

tests/data/testfile_2.txt
@ -0,0 +1 @@
test2

@ -0,0 +1,22 @@
from auto_archiver.databases.csv_db import CSVDb
from auto_archiver.core import Metadata


def test_store_item(tmp_path):
    """Tests storing an item in the CSV database"""

    temp_db = tmp_path / "temp_db.csv"
    db = CSVDb({
        "csv_db": {"csv_file": temp_db.as_posix()}
    })

    item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")

    db.done(item)

    with open(temp_db, "r", encoding="utf-8") as f:
        assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"

    # TODO: csv db doesn't have a fetch method - need to add it (?)
    # assert db.fetch(item) == item

@ -0,0 +1,55 @@
import pytest

from auto_archiver.enrichers.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media

@pytest.mark.parametrize("algorithm, filename, expected_hash", [
    ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
    ("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
    ("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
    ("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
])
def test_calculate_hash(algorithm, filename, expected_hash):
    he = HashEnricher({"algorithm": algorithm, "chunksize": 1})
    assert he.calculate_hash(filename) == expected_hash

def test_default_config_values():
    he = HashEnricher(config={})
    assert he.algorithm == "SHA-256"
    assert he.chunksize == 16000000

def test_invalid_chunksize():
    with pytest.raises(AssertionError):
        he = HashEnricher({"chunksize": "-100"})

def test_invalid_algorithm():
    with pytest.raises(AssertionError):
        HashEnricher({"algorithm": "SHA-123"})

def test_config():
    # test default config
    c = HashEnricher.configs()
    assert c["algorithm"]["default"] == "SHA-256"
    assert c["chunksize"]["default"] == 16000000
    assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
    assert c["algorithm"]["help"] == "hash algorithm to use"
    assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"

def test_hash_media():

    he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})

    # generate metadata with two test files
    m = Metadata().set_url("https://example.com")

    # noop - the metadata has no media. Shouldn't fail
    he.enrich(m)

    m.add_media(Media("tests/data/testfile_1.txt"))
    m.add_media(Media("tests/data/testfile_2.txt"))

    he.enrich(m)

    assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
    assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"

@ -0,0 +1,17 @@
from auto_archiver.core.context import ArchivingContext
from auto_archiver.formatters.html_formatter import HtmlFormatter
from auto_archiver.core import Metadata, Media


def test_format():
    formatter = HtmlFormatter({})
    metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')

    final_media = formatter.format(metadata)
    assert isinstance(final_media, Media)
    assert ".html" in final_media.filename
    with open(final_media.filename, "r", encoding="utf-8") as f:
        content = f.read()
        assert "Hello, world!" in content
    assert final_media.mimetype == "text/html"
    assert "SHA-256:" in final_media.get('hash')