add module for perceptual hashing with pdq

2023-06-26 15:25:55 +02:00 · 2023-06-26 15:25:55 +02:00 · 9fc09c724b
commit 9fc09c724b
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@ -1,6 +1,6 @@
 steps:
  # only 1 feeder allowed
-  feeder: gsheet_feeder # defaults to cli_feeder
+  feeder: cli_feeder # defaults to cli_feeder
  archivers: # order matters, uncomment to activate
    # - vk_archiver
    # - telethon_archiver
@ -11,14 +11,14 @@ steps:
    # - instagram_archiver
    # - tiktok_archiver
    - youtubedl_archiver
-    - wayback_archiver_enricher
+    # - wayback_archiver_enricher
  enrichers:
    - hash_enricher
    # - screenshot_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
    # - wacz_enricher
-    
+    - pdq_hash_enricher
  formatter: html_formatter # defaults to mute_formatter
  storages:
    - local_storage
--- a/src/auto_archiver/enrichers/__init__.py
+++ b/src/auto_archiver/enrichers/__init__.py
@ -4,4 +4,5 @@ from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
 from .thumbnail_enricher import ThumbnailEnricher
 from .wacz_enricher import WaczEnricher
-from .whisper_enricher import WhisperEnricher
+from .whisper_enricher import WhisperEnricher
+from .pdq_hash_enricher import PdqHashEnricher
--- a/src/auto_archiver/enrichers/pdq_hash_enricher.py
+++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py
@ -0,0 +1,41 @@
+import pdqhash
+import numpy as np
+from PIL import Image
+from loguru import logger
+
+from . import Enricher
+from ..core import Metadata
+
+
+class PdqHashEnricher(Enricher):
+    """
+    Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection
+    """
+    name = "pdq_hash_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # Without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {}
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        logger.debug(f"calculating media hashes for {url=}")
+
+        for i, m in enumerate(to_enrich.media):
+            # only run for images and video thumbnails, not screenshots
+            if m.filename.endswith(('.jpg', '.png', '.jpeg')) and m.key != "screenshot":
+                if len(hd := self.calculate_pdq_hash(m.filename)):
+                    to_enrich.media[i].set("pdq_hash", hd)
+
+    def calculate_pdq_hash(self, filename):
+        # open the image file
+        with Image.open(filename) as img:
+            # convert the image to RGB
+            image_rgb = np.array(img.convert("RGB"))
+            # compute the 256-bit PDQ hash (we do not store the quality score)
+            hash, _ = pdqhash.compute(image_rgb)
+            return hash