kopia lustrzana https://github.com/bellingcat/auto-archiver
switch to pytest and pytest-recording
rodzic
e2bc84ccb9
commit
63973e2ce7
|
@ -14,9 +14,26 @@ class HashEnricher(Enricher):
|
|||
def __init__(self, config: dict) -> None:
    """Resolve and validate the hashing options for this enricher.

    For both ``algorithm`` and ``chunksize`` the lookup order is: a truthy
    attribute already present on the instance after the base initialiser
    ran, then a truthy entry in ``config``, then the default declared by
    ``self.configs()``.

    Args:
        config: option mapping; the ``algorithm`` and ``chunksize`` keys
            are read here.

    Raises:
        AssertionError: if the algorithm is not one of the declared
            choices, or the chunk size (after ``int()``) is below -1.
    """
    # without this STEP.__init__ is not called
    super().__init__(config)

    algos = self.configs()["algorithm"]
    valid_algorithms = algos["choices"]

    if not getattr(self, 'algorithm', None):
        requested_algorithm = config.get('algorithm')
        if requested_algorithm:
            self.algorithm = requested_algorithm
        else:
            logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
            self.algorithm = algos["default"]

    assert self.algorithm in valid_algorithms, f"Invalid hash algorithm selected, must be one of {valid_algorithms} (you selected {self.algorithm})."

    if not getattr(self, 'chunksize', None):
        requested_chunksize = config.get('chunksize')
        # the default is only looked up when the config gives no usable value
        self.chunksize = requested_chunksize if requested_chunksize else self.configs()["chunksize"]["default"]

    # config values may arrive as strings, so normalise before range-checking
    self.chunksize = int(self.chunksize)
    assert self.chunksize >= -1, "read length must be non-negative or -1"

    # store the chosen algorithm in the shared archiving context
    ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||
|
||||
@staticmethod
|
||||
|
|
Plik diff jest za duży
Load Diff
Plik diff jest za duży
Load Diff
Plik diff jest za duży
Load Diff
|
@ -1,4 +1,5 @@
|
|||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class TestArchiverBase(object):
|
||||
|
||||
|
@ -16,7 +17,13 @@ class TestArchiverBase(object):
|
|||
item.set(key, value)
|
||||
return item
|
||||
|
||||
def assertValidResponseMetadata(self, test_response: "Metadata", title: str, timestamp: str, status: str = ""):
    """Assert that an archiver response carries the expected metadata.

    Args:
        test_response: the object returned by the archiver under test;
            must not be ``False``.
        title: expected value of ``test_response.get_title()``.
        timestamp: expected value of ``test_response.get("timestamp")``,
            compared with ``==``.
        status: when non-empty, the exact status the response must report;
            when empty, the response only has to satisfy ``is_success()``.

    Raises:
        AssertionError: if any of the checks above fails.
    """
    assert test_response is not False

    if not status:
        assert test_response.is_success()
    else:
        assert status == test_response.status

    assert title == test_response.get_title()
    # Compare the values: `assert timestamp, <value>` would only test the
    # truthiness of `timestamp` and use the fetched value as the message.
    assert timestamp == test_response.get("timestamp")
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import pytest
|
||||
import unittest
|
||||
|
||||
from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
from vcr.unittest import VCRMixin
|
||||
import unittest
|
||||
|
||||
class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
|
||||
"""Tests Bluesky Archiver
|
||||
|
@ -14,6 +15,7 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
|
|||
archiver_class = BlueskyArchiver
|
||||
config = {}
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_media_with_images(self):
|
||||
# url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
|
||||
|
@ -34,6 +36,7 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
|
|||
assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
|
||||
assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_post_with_single_image(self):
|
||||
# url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")
|
||||
|
@ -50,8 +53,9 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
|
|||
|
||||
# check the ID
|
||||
assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')
|
||||
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_post_with_video(self):
|
||||
# url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
|
||||
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import unittest
|
||||
import datetime
|
||||
import pytest
|
||||
|
||||
from auto_archiver.archivers.twitter_archiver import TwitterArchiver
|
||||
|
||||
from .test_archiver_base import TestArchiverBase
|
||||
|
||||
|
||||
class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
|
||||
|
||||
archiver_class = TwitterArchiver
|
||||
|
@ -60,6 +60,7 @@ class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
|
|||
assert not username
|
||||
assert not tweet_id
|
||||
|
||||
@pytest.mark.download
|
||||
def test_youtube_dlp_archiver(self):
|
||||
|
||||
url = "https://x.com/bellingcat/status/1874097816571961839"
|
||||
|
@ -68,28 +69,45 @@ class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
|
|||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"twitter-ytdl"
|
||||
)
|
||||
breakpoint()
|
||||
|
||||
|
||||
def test_download_media_with_images(self):
|
||||
@pytest.mark.download
|
||||
def test_download_tweet_no_media(self):
|
||||
# url https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
||||
|
||||
post = self.archiver.download()
|
||||
item = self.create_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
|
||||
post = self.archiver.download(item)
|
||||
|
||||
# just make sure twitter haven't changed their format, images should be under "record/embed/media/images"
|
||||
# there should be 2 images
|
||||
assert "record" in post
|
||||
assert "embed" in post["record"]
|
||||
assert "media" in post["record"]["embed"]
|
||||
assert "images" in post["record"]["embed"]["media"]
|
||||
assert len(post["record"]["embed"]["media"]["images"]) == 2
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"twitter-ytdl"
|
||||
)
|
||||
|
||||
# try downloading the media files
|
||||
media = self.archiver.download(post)
|
||||
assert len(media) == 2
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_media(self):
|
||||
|
||||
# check the IDs
|
||||
assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
|
||||
assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
|
||||
"""Download tweets with sensitive media
|
||||
|
||||
Note: currently failing, youtube-dlp requires logged-in users"""
|
||||
|
||||
|
||||
test_data = [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash")
|
||||
]
|
||||
|
||||
for url, title, timestamp, image_hash in test_data:
|
||||
post = self.archiver.download(self.create_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
title,
|
||||
timestamp
|
||||
)
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].hash == image_hash
|
|
@ -0,0 +1 @@
|
|||
test1
|
|
@ -0,0 +1 @@
|
|||
test2
|
|
@ -0,0 +1,57 @@
|
|||
from unittest import TestCase
|
||||
|
||||
from auto_archiver.enrichers.hash_enricher import HashEnricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class TestHashEnricher(TestCase):
    """Tests for HashEnricher: option handling, file hashing and media enrichment."""

    def test_calculate_hash_sha256(self):
        """SHA-256 digests of both data files match the known values."""
        enricher = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})
        expected_digests = {
            "tests/data/testfile_1.txt": "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014",
            "tests/data/testfile_2.txt": "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752",
        }
        for path, digest in expected_digests.items():
            assert enricher.calculate_hash(path) == digest

    def test_calculate_hash_sha3_512(self):
        """SHA3-512 digests of both data files match the known values."""
        enricher = HashEnricher({"algorithm": "SHA3-512", "chunksize": 1})
        expected_digests = {
            "tests/data/testfile_1.txt": "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e",
            "tests/data/testfile_2.txt": "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6",
        }
        for path, digest in expected_digests.items():
            assert enricher.calculate_hash(path) == digest

    def test_default_config_values(self):
        """An empty config falls back to the declared defaults."""
        enricher = HashEnricher(config={})
        assert enricher.algorithm == "SHA-256"
        assert enricher.chunksize == 16000000

    def test_invalid_chunksize(self):
        """A chunk size below -1 is rejected at construction time."""
        with self.assertRaises(AssertionError):
            HashEnricher({"chunksize": "-100"})

    def test_invalid_algorithm(self):
        """An algorithm outside the declared choices is rejected."""
        with self.assertRaises(AssertionError):
            HashEnricher({"algorithm": "SHA-123"})

    def test_config(self):
        """The declared option schema exposes the expected defaults, choices and help."""
        schema = HashEnricher.configs()
        algorithm_spec = schema["algorithm"]
        chunksize_spec = schema["chunksize"]

        assert algorithm_spec["default"] == "SHA-256"
        assert chunksize_spec["default"] == 16000000
        assert algorithm_spec["choices"] == ["SHA-256", "SHA3-512"]
        assert algorithm_spec["help"] == "hash algorithm to use"
        assert chunksize_spec["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"

    def test_hash_media(self):
        """enrich() stores an algorithm-prefixed hash on every media item."""
        enricher = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})
        metadata = Metadata().set_url("https://example.com")

        # noop - the metadata has no media yet, this must not fail
        enricher.enrich(metadata)

        # attach the two test files, then enrich again
        for path in ("tests/data/testfile_1.txt", "tests/data/testfile_2.txt"):
            metadata.add_media(Media(path))
        enricher.enrich(metadata)

        assert metadata.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
        assert metadata.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
|
Ładowanie…
Reference in New Issue