kopia lustrzana https://github.com/bellingcat/auto-archiver
add module for perceptual hashing with pdq
rodzic
f6e5a14d75
commit
9fc09c724b
|
@ -1,6 +1,6 @@
|
|||
steps:
|
||||
# only 1 feeder allowed
|
||||
feeder: gsheet_feeder # defaults to cli_feeder
|
||||
feeder: cli_feeder # defaults to cli_feeder
|
||||
archivers: # order matters, uncomment to activate
|
||||
# - vk_archiver
|
||||
# - telethon_archiver
|
||||
|
@ -11,14 +11,14 @@ steps:
|
|||
# - instagram_archiver
|
||||
# - tiktok_archiver
|
||||
- youtubedl_archiver
|
||||
- wayback_archiver_enricher
|
||||
# - wayback_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - screenshot_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_enricher
|
||||
|
||||
- pdq_hash_enricher
|
||||
formatter: html_formatter # defaults to mute_formatter
|
||||
storages:
|
||||
- local_storage
|
||||
|
|
|
@ -4,4 +4,5 @@ from .wayback_enricher import WaybackArchiverEnricher
|
|||
from .hash_enricher import HashEnricher
|
||||
from .thumbnail_enricher import ThumbnailEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
from .whisper_enricher import WhisperEnricher
|
||||
from .whisper_enricher import WhisperEnricher
|
||||
from .pdq_hash_enricher import PdqHashEnricher
|
|
@ -0,0 +1,41 @@
|
|||
import pdqhash
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class PdqHashEnricher(Enricher):
    """
    Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.

    For every eligible media item the value returned by `pdqhash.compute` is stored
    on that item under the "pdq_hash" key.
    """
    name = "pdq_hash_enricher"

    # file extensions that are hashed; compared against the lower-cased filename
    _IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self, config: dict) -> None:
        # Without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step (it has none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Attach a "pdq_hash" entry to every image media item of `to_enrich`.

        Media whose key is "screenshot" and media without a recognised image
        extension are skipped, as are images that could not be read.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=}")

        for i, m in enumerate(to_enrich.media):
            # only run for images and video thumbnails, not screenshots;
            # the extension check is case-insensitive so e.g. ".JPG" is not missed
            if not m.filename.lower().endswith(self._IMAGE_EXTENSIONS) or m.key == "screenshot":
                continue
            pdq_hash = self.calculate_pdq_hash(m.filename)
            # an empty result means the image could not be hashed
            if len(pdq_hash):
                to_enrich.media[i].set("pdq_hash", pdq_hash)

    def calculate_pdq_hash(self, filename):
        """
        Compute the PDQ perceptual hash of the image stored at `filename`.

        Returns the hash produced by `pdqhash.compute` (the accompanying quality
        score is discarded), or an empty array when the file cannot be opened
        or decoded as an image.
        """
        try:
            # open the image file
            with Image.open(filename) as img:
                # convert the image to RGB
                image_rgb = np.array(img.convert("RGB"))
        except OSError as e:
            # a missing/corrupt file must not abort hashing of the remaining media
            # NOTE(review): relies on Pillow's UnidentifiedImageError being an OSError subclass
            logger.error(f"unable to open image {filename} for PDQ hashing: {e}")
            return np.array([])

        # compute the 256-bit PDQ hash (we do not store the quality score)
        pdq_hash, _ = pdqhash.compute(image_rgb)
        return pdq_hash
|
Ładowanie…
Reference in New Issue