kopia lustrzana https://github.com/bellingcat/auto-archiver
Detects duplicate media before storing, e.g. a WACZ archive containing media that was already fetched by another archiver.
rodzic
fc93ebaba0
commit
7a5c9c65bd
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import hashlib
|
||||||
from typing import Any, List, Union, Dict
|
from typing import Any, List, Union, Dict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
|
@ -42,6 +43,7 @@ class Metadata:
|
||||||
|
|
||||||
def store(self: Metadata, override_storages: List = None):
|
def store(self: Metadata, override_storages: List = None):
|
||||||
# calls .store for all contained media. storages [Storage]
|
# calls .store for all contained media. storages [Storage]
|
||||||
|
self.remove_duplicate_media_by_hash()
|
||||||
storages = override_storages or ArchivingContext.get("storages")
|
storages = override_storages or ArchivingContext.get("storages")
|
||||||
for media in self.media:
|
for media in self.media:
|
||||||
media.store(override_storages=storages, url=self.get_url())
|
media.store(override_storages=storages, url=self.get_url())
|
||||||
|
@ -122,6 +124,27 @@ class Metadata:
|
||||||
if m.get("id") == id: return m
|
if m.get("id") == id: return m
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
def remove_duplicate_media_by_hash(self) -> None:
    """Drop media whose content hash matches an earlier entry in ``self.media``.

    Iterates all media in order. A media item's stored ``"hash"`` value is
    used when present; otherwise a SHA-256 digest of ``m.filename`` is
    computed on the fly (it is NOT written back to the media item). The first
    media seen for each hash is kept, later ones are discarded, and
    ``self.media`` is replaced by the filtered list.

    Raises:
        OSError: if a media item without a stored hash points to a file
            that cannot be opened for reading.
    """
    def calculate_hash_in_chunks(hash_algo, chunksize: int, filename) -> str:
        # taken from hash_enricher, cannot be isolated to misc due to circular imports
        with open(filename, "rb") as f:
            while buf := f.read(chunksize):
                hash_algo.update(buf)
        return hash_algo.hexdigest()

    # read() only accepts an integer size; the float literal 1.6e7 raises TypeError
    chunksize = int(1.6e7)

    seen_hashes = set()
    unique_media = []
    for m in self.media:
        h = m.get("hash")
        if not h:
            digest = calculate_hash_in_chunks(hashlib.sha256(), chunksize, m.filename)
            # the hash enricher stores hashes as "<algorithm>:<hexdigest>"; use the
            # same shape so computed and pre-stored SHA-256 hashes are comparable
            h = f"SHA-256:{digest}"
        if h in seen_hashes: continue
        seen_hashes.add(h)
        unique_media.append(m)
    self.media = unique_media
|
||||||
|
|
||||||
def get_first_image(self, default=None) -> Media:
|
def get_first_image(self, default=None) -> Media:
|
||||||
for m in self.media:
|
for m in self.media:
|
||||||
if "image" in m.mimetype: return m
|
if "image" in m.mimetype: return m
|
||||||
|
|
|
@ -34,7 +34,7 @@ class HashEnricher(Enricher):
|
||||||
if len(hd := self.calculate_hash(m.filename)):
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||||
|
|
||||||
def calculate_hash(self, filename):
|
def calculate_hash(self, filename) -> str:
|
||||||
hash = None
|
hash = None
|
||||||
if self.algorithm == "SHA-256":
|
if self.algorithm == "SHA-256":
|
||||||
hash = hashlib.sha256()
|
hash = hashlib.sha256()
|
||||||
|
|
Ładowanie…
Reference in New Issue