refactor to hash all images and save hex string

pull/81/head
msramalho 2023-06-26 17:06:30 +01:00
rodzic 6cf3e109ed
commit b4f86d0e8d
1 zmienionych plików z 12 dodań i 10 usunięć

Wyświetl plik

@@ -9,7 +9,8 @@ from ..core import Metadata
class PdqHashEnricher(Enricher): class PdqHashEnricher(Enricher):
""" """
Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.
Ideally this enrichment is orchestrated to run after the thumbnail_enricher.
""" """
name = "pdq_hash_enricher" name = "pdq_hash_enricher"
@@ -23,19 +24,20 @@ class PdqHashEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()
logger.debug(f"calculating media hashes for {url=}") logger.debug(f"calculating perceptual hashes for {url=}")
for i, m in enumerate(to_enrich.media): for m in to_enrich.media:
# only run for images and video thumbnails, not screenshots for media in m.all_inner_media(True):
if m.filename.endswith(('.jpg', '.png', '.jpeg')) and m.key != "screenshot": if media.is_image() and media.key != "screenshot":
if len(hd := self.calculate_pdq_hash(m.filename)): if len(hd := self.calculate_pdq_hash(media.filename)):
to_enrich.media[i].set("pdq_hash", hd) media.set("pdq_hash", hd)
def calculate_pdq_hash(self, filename): def calculate_pdq_hash(self, filename):
# open the image file # returns a hexadecimal string with the perceptual hash for the given filename
with Image.open(filename) as img: with Image.open(filename) as img:
# convert the image to RGB # convert the image to RGB
image_rgb = np.array(img.convert("RGB")) image_rgb = np.array(img.convert("RGB"))
# compute the 256-bit PDQ hash (we do not store the quality score) # compute the 256-bit PDQ hash (we do not store the quality score)
hash, _ = pdqhash.compute(image_rgb) hash_array, _ = pdqhash.compute(image_rgb)
return hash hash = "".join(str(b) for b in hash_array)
return hex(int(hash, 2))[2:]