hash calculation in chunks to avoid exhausting RAM

pull/72/head v0.4.4
msramalho 2023-03-10 11:34:29 +00:00
rodzic 0e3c427371
commit 0654e8c5c6
3 zmienionych plików z 28 dodań i 18 usunięć

Wyświetl plik

@ -16,11 +16,13 @@ class HashEnricher(Enricher):
super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
@staticmethod
def configs() -> dict:
return {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
}
def enrich(self, to_enrich: Metadata) -> None:
@ -28,12 +30,19 @@ class HashEnricher(Enricher):
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media):
with open(m.filename, "rb") as f:
bytes = f.read() # read entire file as bytes
hash = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256(bytes)
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512(bytes)
else: continue
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
if len(hd := self.calculate_hash(m.filename)):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename):
    """Return the hex digest of the file at ``filename``, read in chunks.

    The file is streamed ``self.chunksize`` bytes at a time so that large
    media files are never loaded into memory in one piece.

    Args:
        filename: path of the file to hash; opened in binary mode.

    Returns:
        The hexadecimal digest string for ``self.algorithm``, or an empty
        string when ``self.algorithm`` is not "SHA-256" or "SHA3-512"
        (in which case the file is not opened at all).

    Raises:
        OSError: if the file cannot be opened or read.
    """
    constructors = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}
    make_digest = constructors.get(self.algorithm)
    if make_digest is None:
        return ""
    digest = make_digest()
    # The configured default chunk size is written as a float (1.6e7) and
    # file.read() rejects floats, so coerce here instead of relying on the
    # caller having normalised the value beforehand.
    chunk_size = int(self.chunksize)
    with open(filename, "rb") as f:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

Wyświetl plik

@ -5,6 +5,7 @@ import hashlib
from typing import IO, Any
from ..core import Media, Metadata, Step
from ..enrichers import HashEnricher
from loguru import logger
import os, uuid
from slugify import slugify
@ -64,18 +65,18 @@ class Storage(Step):
filename, ext = os.path.splitext(media.filename)
# path_generator logic
if self.path_generator == "flat":
if self.path_generator == "flat":
path = ""
filename = slugify(filename) # in case it comes with os.sep
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url())
elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static":
with open(media.filename, "rb") as f:
bytes = f.read() # read entire file as bytes
filename = hashlib.sha256(bytes).hexdigest()[:24]
elif self.filename_generator == "static":
he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1.6e7})
hd = he.calculate_hash(media.filename)
filename = hd[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}")
media.key = os.path.join(folder, path, f"{filename}{ext}")

Wyświetl plik

@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "3"
_PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""