Add module for perceptual hashing with PDQ

pull/81/head
Emiel de Heij 2023-06-26 15:25:55 +02:00
rodzic f6e5a14d75
commit 9fc09c724b
3 zmienionych plików z 46 dodań i 4 usunięć

Wyświetl plik

@ -1,6 +1,6 @@
steps:
# only 1 feeder allowed
feeder: gsheet_feeder # defaults to cli_feeder
feeder: cli_feeder # defaults to cli_feeder
archivers: # order matters, uncomment to activate
# - vk_archiver
# - telethon_archiver
@ -11,14 +11,14 @@ steps:
# - instagram_archiver
# - tiktok_archiver
- youtubedl_archiver
- wayback_archiver_enricher
# - wayback_archiver_enricher
enrichers:
- hash_enricher
# - screenshot_enricher
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_enricher
- pdq_hash_enricher
formatter: html_formatter # defaults to mute_formatter
storages:
- local_storage

Wyświetl plik

@ -4,4 +4,5 @@ from .wayback_enricher import WaybackArchiverEnricher
from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczEnricher
from .whisper_enricher import WhisperEnricher
from .whisper_enricher import WhisperEnricher
from .pdq_hash_enricher import PdqHashEnricher

Wyświetl plik

@ -0,0 +1,41 @@
import pdqhash
import numpy as np
from PIL import Image
from loguru import logger
from . import Enricher
from ..core import Metadata
class PdqHashEnricher(Enricher):
    """
    Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.

    For each media item of a Metadata object whose filename has an image
    extension (and whose key is not "screenshot"), the PDQ hash is computed
    and stored on that media item under the "pdq_hash" key.
    """
    name = "pdq_hash_enricher"

    # lower-case file extensions that are treated as hashable images
    _IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self, config: dict) -> None:
        # Without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Returns the configuration options of this step; there are none."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Sets "pdq_hash" on every eligible image media item of `to_enrich`.

        Media that cannot be opened or decoded are logged and skipped so that
        a single bad file does not prevent the remaining media from being hashed.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=}")

        for m in to_enrich.media:
            filename = m.filename
            # only run for images and video thumbnails, not screenshots
            if not filename or m.key == "screenshot":
                continue
            # compare case-insensitively so that e.g. "photo.JPG" is hashed too
            if not filename.lower().endswith(self._IMAGE_EXTENSIONS):
                continue

            try:
                pdq_hash = self.calculate_pdq_hash(filename)
            except OSError as e:
                # missing, truncated or unidentifiable image files surface as OSError
                logger.warning(f"could not calculate pdq hash for {filename=}: {e}")
                continue

            # len() rather than truthiness: the hash is a sequence, and an
            # array-like value would not support a plain boolean test
            if len(pdq_hash):
                m.set("pdq_hash", pdq_hash)

    def calculate_pdq_hash(self, filename):
        """
        Computes the PDQ hash of the image stored at `filename`.

        Returns the hash exactly as produced by `pdqhash.compute`; the quality
        score returned alongside it is discarded. Errors raised while opening
        or decoding the image are propagated to the caller.
        """
        # NOTE(review): pdqhash.compute presumably returns the hash as a numpy
        # bit vector - confirm that storages/formatters can serialise it as-is
        with Image.open(filename) as img:
            # convert the image to RGB; np.array copies the pixel data, so the
            # file can be closed before the hash is computed
            image_rgb = np.array(img.convert("RGB"))

        # compute the 256-bit PDQ hash (we do not store the quality score)
        pdq_hash, _quality = pdqhash.compute(image_rgb)
        return pdq_hash