switch to pytest and pytest-recording

2025-01-13 14:31:29 +01:00 · 2025-01-13 14:31:29 +01:00 · 63973e2ce7
commit 63973e2ce7
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@ -14,9 +14,26 @@ class HashEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
-        algo_choices = self.configs()["algorithm"]["choices"]
+        algos = self.configs()["algorithm"]
+        algo_choices = algos["choices"]
+        if not getattr(self, 'algorithm', None):
+            if not config.get('algorithm'):
+                logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
+                self.algorithm = algos["default"]
+            else:
+                self.algorithm = config["algorithm"]
+
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
+
+        if not getattr(self, 'chunksize', None):
+            if config.get('chunksize'):
+                self.chunksize = config["chunksize"]
+            else:
+                self.chunksize = self.configs()["chunksize"]["default"]
+
        self.chunksize = int(self.chunksize)
+        assert self.chunksize >= -1, "read length must be non-negative or -1"
+
        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)

    @staticmethod
--- a/tests/archivers/cassettes/TestBlueskyArchiver.test_download_media_with_images.yaml
+++ b/tests/archivers/cassettes/TestBlueskyArchiver.test_download_media_with_images.yaml
--- a/tests/archivers/cassettes/TestBlueskyArchiver.test_download_post_with_single_image.yaml
+++ b/tests/archivers/cassettes/TestBlueskyArchiver.test_download_post_with_single_image.yaml
--- a/tests/archivers/cassettes/TestBlueskyArchiver.test_download_post_with_video.yaml
+++ b/tests/archivers/cassettes/TestBlueskyArchiver.test_download_post_with_video.yaml
--- a/tests/archivers/test_archiver_base.py
+++ b/tests/archivers/test_archiver_base.py
@ -1,4 +1,5 @@
 from auto_archiver.core import Metadata
+from auto_archiver.core.metadata import Metadata

 class TestArchiverBase(object):

@ -16,7 +17,13 @@ class TestArchiverBase(object):
            item.set(key, value)
        return item
    
-    def assertValidResponseMetadata(self, test_response, title, timestamp):
-        assert test_response.is_success()
+    def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
+        assert test_response is not False
+
+        if not status:
+            assert test_response.is_success()
+        else:
+            assert status == test_response.status
+
        assert title == test_response.get_title()
        assert timestamp, test_response.get("timestamp")
--- a/tests/archivers/test_bluesky_archiver.py
+++ b/tests/archivers/test_bluesky_archiver.py
@ -1,7 +1,8 @@
+import pytest
+import unittest
+
 from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
 from .test_archiver_base import TestArchiverBase
-from vcr.unittest import VCRMixin
-import unittest

 class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
    """Tests Bluesky Archiver
@ -14,6 +15,7 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
    archiver_class = BlueskyArchiver
    config = {}

+    @pytest.mark.download
    def test_download_media_with_images(self):
        # url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
@ -34,6 +36,7 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
        assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
        assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')

+    @pytest.mark.download
    def test_download_post_with_single_image(self):
        # url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")
@ -50,8 +53,9 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):

        # check the ID 
        assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')
-                         
+                        

+    @pytest.mark.download
    def test_download_post_with_video(self):
        # url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
--- a/tests/archivers/test_twitter_archiver.py
+++ b/tests/archivers/test_twitter_archiver.py
@ -1,11 +1,11 @@
 import unittest
 import datetime
+import pytest

 from auto_archiver.archivers.twitter_archiver import TwitterArchiver

 from .test_archiver_base import TestArchiverBase

-
 class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):

    archiver_class = TwitterArchiver
@ -60,6 +60,7 @@ class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
        assert not username
        assert not tweet_id

+    @pytest.mark.download
    def test_youtube_dlp_archiver(self):

        url = "https://x.com/bellingcat/status/1874097816571961839"
@ -68,28 +69,45 @@ class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
        self.assertValidResponseMetadata(
            post,
            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
-            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
+            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            "twitter-ytdl"
        )
-        breakpoint()

-
-    def test_download_media_with_images(self):
+    @pytest.mark.download
+    def test_download_tweet_no_media(self):
        # url https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
        
-        post = self.archiver.download()
+        item = self.create_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
+        post = self.archiver.download(item)

-        # just make sure twitter haven't changed their format, images should be under "record/embed/media/images"
-        # there should be 2 images
-        assert "record" in post
-        assert "embed" in post["record"]
-        assert "media" in post["record"]["embed"]
-        assert "images" in post["record"]["embed"]["media"]
-        assert len(post["record"]["embed"]["media"]["images"]) == 2
+        self.assertValidResponseMetadata(
+            post,
+            "Onion rings are just vegetable donuts.",
+            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
+            "twitter-ytdl"
+        )

-        # try downloading the media files
-        media = self.archiver.download(post)
-        assert len(media) == 2
+    @pytest.mark.download
+    def test_download_sensitive_media(self):

-        # check the IDs
-        assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
-        assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
+        """Download tweets with sensitive media
+        
+        Note: currently failing, youtube-dlp requires logged-in users"""
+
+
+        test_data = [
+            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash")
+        ]
+
+        for url, title, timestamp, image_hash in test_data:
+            post = self.archiver.download(self.create_item(url))
+            self.assertValidResponseMetadata(
+                post,
+                title,
+                timestamp
+            )
+            assert len(post.media) == 1
+            assert post.media[0].hash == image_hash
--- a/tests/data/testfile_1.txt
+++ b/tests/data/testfile_1.txt
@ -0,0 +1 @@
+test1
--- a/tests/data/testfile_2.txt
+++ b/tests/data/testfile_2.txt
@ -0,0 +1 @@
+test2
--- a/tests/enrichers/test_hash_enricher.py
+++ b/tests/enrichers/test_hash_enricher.py
@ -0,0 +1,57 @@
+from unittest import TestCase
+
+from auto_archiver.enrichers.hash_enricher import HashEnricher
+from auto_archiver.core import Metadata, Media
+
+class TestHashEnricher(TestCase):
+    def test_calculate_hash_sha256(self):
+        # test SHA-256
+        he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})
+        assert he.calculate_hash("tests/data/testfile_1.txt") == "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
+        assert he.calculate_hash("tests/data/testfile_2.txt") == "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
+
+    def test_calculate_hash_sha3_512(self):
+        # test SHA3-512
+        he = HashEnricher({"algorithm": "SHA3-512", "chunksize": 1})
+        assert he.calculate_hash("tests/data/testfile_1.txt") == "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"
+        assert he.calculate_hash("tests/data/testfile_2.txt") == "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6"
+
+    def test_default_config_values(self):
+        he = HashEnricher(config={})
+        assert he.algorithm == "SHA-256"
+        assert he.chunksize == 16000000
+    
+    def test_invalid_chunksize(self):
+        with self.assertRaises(AssertionError):
+            he = HashEnricher({"chunksize": "-100"})
+
+    def test_invalid_algorithm(self):
+        with self.assertRaises(AssertionError):
+            HashEnricher({"algorithm": "SHA-123"})
+
+    def test_config(self):
+        # test default config
+        c = HashEnricher.configs()
+        assert c["algorithm"]["default"] == "SHA-256"
+        assert c["chunksize"]["default"] == 16000000
+        assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
+        assert c["algorithm"]["help"] == "hash algorithm to use"
+        assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
+
+    def test_hash_media(self):
+
+        he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})
+
+        # generate metadata with two test files
+        m = Metadata().set_url("https://example.com")
+
+        # noop - the metadata has no media. Shouldn't fail
+        he.enrich(m)
+
+        m.add_media(Media("tests/data/testfile_1.txt"))
+        m.add_media(Media("tests/data/testfile_2.txt"))
+
+        he.enrich(m)
+
+        self.assertEqual(m.media[0].get("hash"), "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014")
+        self.assertEqual(m.media[1].get("hash"), "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752")