diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 6851cb5..819070a 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -136,7 +136,7 @@ class TelethonArchiver(Archiverv2): for i, om_url in enumerate(other_media_urls): filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') self.download_from_url(om_url, filename) - result.add_media(Media(filename=filename, id=f"{group_id}_{i}")) + result.add_media(Media(filename=filename), id=f"{group_id}_{i}") filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 5b47d0f..dec3565 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -57,7 +57,7 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})") + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 2a871d1..95b3fad 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,3 +1,4 @@ from .enricher import Enricher from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackEnricher \ No newline at end of file +from .wayback_enricher import WaybackEnricher +from .hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/enrichers/hash_enricher.py 
b/src/enrichers/hash_enricher.py
new file mode 100644
index 0000000..786c861
--- /dev/null
+++ b/src/enrichers/hash_enricher.py
@@ -0,0 +1,47 @@
+import hashlib
+from utils import Webdriver
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
+import time, requests
+
+
+class HashEnricher(Enricher):
+    """
+    Calculates hashes for Media instances.
+
+    The digest of each media file is stored in that media's "hash" property
+    as "<algorithm>:<hexdigest>", where <algorithm> is the configured name.
+    """
+    name = "hash_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        algo_choices = self.configs()["algorithm"]["choices"]
+        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        """Hash every media file of `to_enrich` with the configured algorithm."""
+        url = to_enrich.get_url()
+        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
+
+        # hashlib constructor for each name offered in configs()["algorithm"]["choices"]
+        constructors = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}
+        new_hasher = constructors.get(self.algorithm)
+        if new_hasher is None: return  # unknown algorithm: no hash is recorded
+
+        for m in to_enrich.media:
+            hasher = new_hasher()
+            with open(m.filename, "rb") as f:
+                # feed the file in chunks so large media is never fully held in memory
+                while chunk := f.read(1024 * 1024):
+                    hasher.update(chunk)
+            m.set("hash", f"{self.algorithm}:{hasher.hexdigest()}")
diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py
index b008e52..0375e3b 100644
--- a/src/enrichers/screenshot_enricher.py
+++ b/src/enrichers/screenshot_enricher.py
@@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher):
                 time.sleep(2)
                 screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
                 driver.save_screenshot(screenshot_file)
-                to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
+
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") except TimeoutException: logger.info("TimeoutException loading page for screenshot") except Exception as e: diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index f488a5f..e757cae 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -39,12 +39,29 @@ .center { text-align: center; } + + .copy:hover { + font-weight: 600; + cursor: copy; + } + + #notification { + position: fixed; + right: 20px; + top: 20px; + background: aquamarine; + box-shadow: 6px 8px 5px 0px #000000; + padding: 10px; + font-size: large; + display: none; + } +

Archived media for {{ url }}

-

title: '{{ title }}'

+

title: '{{ title }}'

content {{ media | length }} item(s)

@@ -55,21 +72,24 @@ @@ -97,7 +120,9 @@ {% for key in metadata %} - + {% endfor %}
    - {% if m.hash | length > 1 %} -
  • hash: {{ m.hash }}
  • - {% endif %} -
  • key: {{ m.key }}
  • -
  • type: {{ m.mimetype }}
  • - {% if m.id | length >0 %} -
  • id: {{ m.id }}
  • +
  • key: {{ m.key }}
  • +
  • type: {{ m.mimetype }}
  • + + {% for prop in m.properties %} + {% if m.properties[prop] | length > 1 %} +
  • {{ prop }}: {{ m.properties[prop] }}
  • {% endif %} + {% endfor %}
{% for url in m.urls %} + {% if 'http' in url %} {% if 'image' in m.mimetype %} - + + + {% elif 'video' in m.mimetype %}
{{ key }}{{ metadata[key] | urlize }} + {{ metadata[key] | urlize }} +
@@ -105,5 +130,33 @@

Made with bellingcat/auto-archiver

+
\ No newline at end of file
diff --git a/src/media.py b/src/media.py
index e50cc14..f0f91a2 100644
--- a/src/media.py
+++ b/src/media.py
@@ -8,12 +8,20 @@ import mimetypes
 
 @dataclass
 class Media:
     filename: str
     key: str = None
-    urls: List[str] = field(default_factory=list)
     _mimetype: str = None  # eg: image/jpeg
-    id: str = ""  # in case this type of media needs a special id, eg: screenshot
-    # hash: str = None  # TODO: added by enrichers
+    urls: List[str] = field(default_factory=list)
+    properties: dict = field(default_factory=dict)  # other properties eg: hash, id, exif, ...
+
+    def set(self, key: str, value: Any) -> Media:
+        """Store `value` under `key` in properties; returns self so calls can be chained."""
+        self.properties[key] = value
+        return self
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Return the property stored under `key`, or `default` when it is not set."""
+        return self.properties.get(key, default)
 
     def add_url(self, url: str) -> None:
         # url can be remote, local, ...
diff --git a/src/metadata.py b/src/metadata.py
index 7f57c3b..70984fa 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -96,13 +96,19 @@ class Metadata:
         if iso: return ts.isoformat()
         return ts
 
-    def add_media(self, media: Media) -> Metadata:
+    def add_media(self, media: Media, id: str = None) -> Media:
+        """Append `media` to this metadata, optionally storing `id` as its "id" property.
+
+        Returns the media that was added, or None (adding nothing) when `media` is None.
+        """
         if media is None: return
-        return self.media.append(media)
+        if id is not None: media.set("id", id)
+        self.media.append(media)
+        return media
 
-    def get_media_by_id(self, id:str) -> Media:
+    def get_media_by_id(self, id: str) -> Media:
         for m in self.media:
-            if m.id == id: return m
+            if m.get("id") == id: return m
         return None
 
     def set_final_media(self, final: Media) -> Metadata:
@@ -113,7 +119,7 @@ class Metadata:
         return self
 
     def get_single_media(self) -> Media:
-        #TODO: could be refactored to use a custom media.id
+        # TODO: could be refactored to use a custom media.id
         if self.final_media: return self.final_media
         return self.media[0]
 