From 9fc09c724b1eb67726934f101fc3c3403a5eb51a Mon Sep 17 00:00:00 2001 From: Emiel de Heij Date: Mon, 26 Jun 2023 15:25:55 +0200 Subject: [PATCH] add module for perceptual hashing with pdq --- example.orchestration.yaml | 6 +-- src/auto_archiver/enrichers/__init__.py | 3 +- .../enrichers/pdq_hash_enricher.py | 41 +++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 src/auto_archiver/enrichers/pdq_hash_enricher.py diff --git a/example.orchestration.yaml b/example.orchestration.yaml index 8fbc914..f0cd990 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -1,6 +1,6 @@ steps: # only 1 feeder allowed - feeder: gsheet_feeder # defaults to cli_feeder + feeder: cli_feeder # defaults to cli_feeder archivers: # order matters, uncomment to activate # - vk_archiver # - telethon_archiver @@ -11,14 +11,14 @@ steps: # - instagram_archiver # - tiktok_archiver - youtubedl_archiver - - wayback_archiver_enricher + # - wayback_archiver_enricher enrichers: - hash_enricher # - screenshot_enricher # - thumbnail_enricher # - wayback_archiver_enricher # - wacz_enricher - + - pdq_hash_enricher formatter: html_formatter # defaults to mute_formatter storages: - local_storage diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py index d33b49d..f0f1ebe 100644 --- a/src/auto_archiver/enrichers/__init__.py +++ b/src/auto_archiver/enrichers/__init__.py @@ -4,4 +4,5 @@ from .wayback_enricher import WaybackArchiverEnricher from .hash_enricher import HashEnricher from .thumbnail_enricher import ThumbnailEnricher from .wacz_enricher import WaczEnricher -from .whisper_enricher import WhisperEnricher \ No newline at end of file +from .whisper_enricher import WhisperEnricher +from .pdq_hash_enricher import PdqHashEnricher \ No newline at end of file diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/enrichers/pdq_hash_enricher.py new file mode 100644 index 0000000..3bda318 
# New module added by this patch: src/auto_archiver/enrichers/pdq_hash_enricher.py
import pdqhash
import numpy as np
from PIL import Image
from loguru import logger

from . import Enricher
from ..core import Metadata


class PdqHashEnricher(Enricher):
    """
    Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.

    Only media whose filename carries one of the image extensions below are
    hashed; media whose key is "screenshot" are skipped. The hash is stored on
    the media item under the "pdq_hash" key.
    """
    name = "pdq_hash_enricher"

    # Lower-case file extensions treated as hashable images.
    _IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self, config: dict) -> None:
        # Without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return this enricher's configuration options (it has none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Compute a PDQ hash for every eligible image in `to_enrich.media`.

        Media that cannot be read as an image are logged and skipped so that a
        single bad file does not stop the remaining media from being hashed.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=}")

        for media in to_enrich.media:
            # only run for images and video thumbnails, not screenshots
            if media.key == "screenshot":
                continue
            # compare case-insensitively so that e.g. "IMG_001.JPG" is hashed too
            if not media.filename.lower().endswith(self._IMAGE_EXTENSIONS):
                continue
            pdq_hash = self.calculate_pdq_hash(media.filename)
            # an empty result signals that hashing failed; store nothing then
            if len(pdq_hash):
                media.set("pdq_hash", pdq_hash)

    def calculate_pdq_hash(self, filename):
        """
        Return the PDQ hash of the image stored at `filename`.

        The value is the hash as returned by `pdqhash.compute`; the quality
        score that `pdqhash.compute` returns alongside it is discarded. If the
        file cannot be opened or decoded as an image, the error is logged and
        an empty array is returned instead of raising.
        """
        try:
            with Image.open(filename) as img:
                # convert the image to RGB
                image_rgb = np.array(img.convert("RGB"))
        except OSError as e:
            # Pillow reports missing, truncated and unidentifiable images as
            # OSError (or a subclass of it).
            logger.error(f"unable to read image {filename=} for PDQ hashing: {e}")
            return np.array([])
        # compute the 256-bit PDQ hash (we do not store the quality score)
        pdq_hash, _ = pdqhash.compute(image_rgb)
        return pdq_hash