hash enricher and media refactor

pull/72/head
msramalho 2023-01-13 02:12:08 +00:00
rodzic 6ca46417fe
commit 74e50eccf1
8 zmienionych plików z 129 dodań i 24 usunięć

Wyświetl plik

@ -136,7 +136,7 @@ class TelethonArchiver(Archiverv2):
for i, om_url in enumerate(other_media_urls):
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename)
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)

Wyświetl plik

@ -57,7 +57,7 @@ class ConfigV2:
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
config_path = f"{child.name}.{config}"
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})")
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
self.defaults[config_path] = details["default"]
if "cli_set" in details:
self.cli_ops[config_path] = details["cli_set"]

Wyświetl plik

@ -1,3 +1,4 @@
from .enricher import Enricher
from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackEnricher
from .wayback_enricher import WaybackEnricher
from .hash_enricher import HashEnricher

Wyświetl plik

@ -0,0 +1,41 @@
import hashlib
from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
from selenium.common.exceptions import TimeoutException
import time, requests
class HashEnricher(Enricher):
    """
    Calculates hashes for Media instances.

    The digest is stored on each media item under the "hash" property as
    "<algorithm>:<hexdigest>", where <algorithm> is the configured
    `algorithm` value (see `configs`).
    """
    name = "hash_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        # NOTE(review): self.algorithm is presumably populated by the parent
        # initializer from the "algorithm" config entry - confirm.
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."

    @staticmethod
    def configs() -> dict:
        """Return the config options of this enricher (default, help text and allowed choices)."""
        return {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Hash the file of every media item in `to_enrich` and store the result
        in that item's "hash" property.

        Files are read in fixed-size chunks so large media (e.g. videos) are
        never loaded into memory in one piece. If the configured algorithm is
        not a supported one, no hash is set on any media item.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        # one hashlib constructor per supported `algorithm` config value
        constructors = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}
        new_digest = constructors.get(self.algorithm)
        if new_digest is None:
            # unsupported algorithm: nothing to compute, so do not touch the files at all
            return

        chunk_size = 1024 * 1024  # bytes read per iteration
        for m in to_enrich.media:
            digest = new_digest()
            with open(m.filename, "rb") as f:
                while chunk := f.read(chunk_size):
                    digest.update(chunk)
            m.set("hash", f"{self.algorithm}:{digest.hexdigest()}")

Wyświetl plik

@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher):
time.sleep(2)
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
except Exception as e:

Wyświetl plik

@ -39,12 +39,29 @@
.center {
text-align: center;
}
.copy:hover {
font-weight: 600;
cursor: copy;
}
#notification {
position: fixed;
right: 20px;
top: 20px;
background: aquamarine;
box-shadow: 6px 8px 5px 0px #000000;
padding: 10px;
font-size: large;
display: none;
}
</style>
</head>
<body>
<div id="notification"></div>
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
<p><b>title:</b> '<span>{{ title }}</span>'</p>
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
<h2 class="center">content {{ media | length }} item(s)</h2>
<table class="content">
<tr>
@ -55,21 +72,24 @@
<tr>
<td>
<ul>
{% if m.hash | length > 1 %}
<li>hash: <span>{{ m.hash }}</span></li>
{% endif %}
<li>key: <span>{{ m.key }}</span></li>
<li>type: <span>{{ m.mimetype }}</span></li>
{% if m.id | length >0 %}
<li>id: <span>{{ m.id }}</span></li>
<li><b>key:</b> <span class="copy">{{ m.key }}</span></li>
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
{% for prop in m.properties %}
{% if m.properties[prop] | length > 1 %}
<li><b>{{ prop }}:</b> <span class="copy">{{ m.properties[prop] }}</span></li>
{% endif %}
{% endfor %}
</ul>
</td>
<td>
{% for url in m.urls %}
{% if 'http' in url %}
{% if 'image' in m.mimetype %}
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
<a href="{{ url }}">
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
</a>
{% elif 'video' in m.mimetype %}
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
@ -80,9 +100,12 @@
Your browser does not support the audio element.
</audio>
{% else %}
No preview available, please open the link.
No preview available.
{% endif %}
<li><a href="{{ url }}">{{ url}}</a></li>
{% endif %}
<br>
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a>
{% endfor %}
</td>
</tr>
@ -97,7 +120,9 @@
{% for key in metadata %}
<tr>
<td>{{ key }}</td>
<td>{{ metadata[key] | urlize }}</td>
<td>
<span class="copy">{{ metadata[key] | urlize }}</span>
</td>
</tr>
{% endfor %}
</table>
@ -105,5 +130,33 @@
<hr>
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
</body>
<script defer>
// notification logic
const notification = document.getElementById("notification");
// handle of the pending auto-hide timer, kept so that a newer notification
// can cancel it instead of being hidden early by it
let notificationHideTimer = null;

/**
 * Show `message` in the #notification element and hide it again afterwards.
 *
 * @param {string} message text to display
 * @param {number} [miliseconds] how long to keep it visible; falls back to
 *        1000 when omitted (or otherwise falsy)
 */
function showNotification(message, miliseconds) {
    notification.style.display = "block";
    notification.innerText = message;
    // restart the countdown: without this, the timer of an earlier call
    // would hide this (newer) message before its own delay has elapsed
    clearTimeout(notificationHideTimer);
    notificationHideTimer = setTimeout(() => {
        notification.style.display = "none";
        notification.innerText = "";
    }, miliseconds || 1000);
}
// copy logic: clicking any ".copy" element puts its text on the clipboard
document.querySelectorAll(".copy").forEach((el) => {
    // the click only triggers a "copy" command; the listener below decides
    // what actually ends up on the clipboard
    // NOTE(review): document.execCommand is deprecated; kept because the
    // async Clipboard API may be unavailable depending on how the page is served
    el.addEventListener("click", () => {
        document.execCommand("copy");
    });
    el.addEventListener("copy", (e) => {
        // without clipboardData we cannot override the payload, so leave the
        // browser's default copy behaviour untouched
        if (!e.clipboardData) return;
        e.preventDefault();
        e.clipboardData.setData("text/plain", el.textContent);
        showNotification("copied...");
    });
});
</script>
</html>

Wyświetl plik

@ -8,12 +8,19 @@ import mimetypes
@dataclass
class Media:
# other properties eg: hash, id, exif, ...
filename: str
key: str = None
urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
id: str = "" # in case this type of media needs a special id, eg: screenshot
# hash: str = None # TODO: added by enrichers
urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict)
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
return self
def get(self, key: str, default: Any = None) -> Any:
return self.properties.get(key, default)
def add_url(self, url: str) -> None:
# url can be remote, local, ...

Wyświetl plik

@ -96,13 +96,16 @@ class Metadata:
if iso: return ts.isoformat()
return ts
def add_media(self, media: Media) -> Metadata:
def add_media(self, media: Media, id: str = None) -> Metadata:
# adds a new media, optionally including an id
if media is None: return
return self.media.append(media)
if id is not None: media.set("id", id)
self.media.append(media)
return media
def get_media_by_id(self, id:str) -> Media:
def get_media_by_id(self, id: str) -> Media:
for m in self.media:
if m.id == id: return m
if m.get("id") == id: return m
return None
def set_final_media(self, final: Media) -> Metadata:
@ -113,7 +116,7 @@ class Metadata:
return self
def get_single_media(self) -> Media:
#TODO: could be refactored to use a custom media.id
# TODO: could be refactored to use a custom media.id
if self.final_media:
return self.final_media
return self.media[0]