Detects duplicates before storing, e.g. WACZ getting media already fetched by another archiver

pull/87/head
msramalho 2023-07-28 10:51:48 +01:00
rodzic fc93ebaba0
commit 7a5c9c65bd
2 zmienionych plików z 25 dodań i 2 usunięć

Wyświetl plik

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import hashlib
from typing import Any, List, Union, Dict from typing import Any, List, Union, Dict
from dataclasses import dataclass, field from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config from dataclasses_json import dataclass_json, config
@ -42,6 +43,7 @@ class Metadata:
def store(self: Metadata, override_storages: List = None): def store(self: Metadata, override_storages: List = None):
# calls .store for all contained media. storages [Storage] # calls .store for all contained media. storages [Storage]
self.remove_duplicate_media_by_hash()
storages = override_storages or ArchivingContext.get("storages") storages = override_storages or ArchivingContext.get("storages")
for media in self.media: for media in self.media:
media.store(override_storages=storages, url=self.get_url()) media.store(override_storages=storages, url=self.get_url())
@ -122,6 +124,27 @@ class Metadata:
if m.get("id") == id: return m if m.get("id") == id: return m
return default return default
def remove_duplicate_media_by_hash(self) -> None:
    """Drop media entries whose content hash duplicates an earlier entry.

    Iterates ``self.media`` in order. For each item the stored ``"hash"``
    value is used when present; otherwise a SHA-256 digest of the file at
    ``m.filename`` is computed. Only the first item seen for each hash is
    kept, and ``self.media`` is replaced with the filtered list (original
    order preserved).

    Raises:
        OSError: if an item has no stored hash and its file cannot be opened.
    """
    def calculate_hash_in_chunks(hash_algo, chunksize: int, filename) -> str:
        # taken from hash_enricher, cannot be isolated to misc due to circular imports
        with open(filename, "rb") as f:
            while buf := f.read(chunksize):
                hash_algo.update(buf)
        return hash_algo.hexdigest()

    seen_hashes = set()
    unique_media = []
    for m in self.media:
        h = m.get("hash")
        if not h:
            # The chunk size must be an int: file.read() rejects a float
            # such as 1.6e7 with a TypeError.
            h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
        # Stored hashes may carry an "<algorithm>:" prefix (e.g. "SHA-256:<hex>")
        # while locally computed ones are bare hex digests; compare on the
        # digest part so both forms of the same hash are recognised as equal.
        key = h.split(":", 1)[-1] if isinstance(h, str) else h
        if key in seen_hashes:
            continue
        seen_hashes.add(key)
        unique_media.append(m)
    self.media = unique_media
def get_first_image(self, default=None) -> Media: def get_first_image(self, default=None) -> Media:
for m in self.media: for m in self.media:
if "image" in m.mimetype: return m if "image" in m.mimetype: return m

Wyświetl plik

@ -34,7 +34,7 @@ class HashEnricher(Enricher):
if len(hd := self.calculate_hash(m.filename)): if len(hd := self.calculate_hash(m.filename)):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}") to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename): def calculate_hash(self, filename) -> str:
hash = None hash = None
if self.algorithm == "SHA-256": if self.algorithm == "SHA-256":
hash = hashlib.sha256() hash = hashlib.sha256()