kopia lustrzana https://github.com/bellingcat/auto-archiver
hash enricher and media refactor
rodzic
6ca46417fe
commit
74e50eccf1
|
@ -136,7 +136,7 @@ class TelethonArchiver(Archiverv2):
|
|||
for i, om_url in enumerate(other_media_urls):
|
||||
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
|
||||
self.download_from_url(om_url, filename)
|
||||
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
|
||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
||||
filename = self.client.download_media(mp.media, filename_dest)
|
||||
|
|
|
@ -57,7 +57,7 @@ class ConfigV2:
|
|||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
config_path = f"{child.name}.{config}"
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})")
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
self.defaults[config_path] = details["default"]
|
||||
if "cli_set" in details:
|
||||
self.cli_ops[config_path] = details["cli_set"]
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from .enricher import Enricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackEnricher
|
||||
from .hash_enricher import HashEnricher
|
|
@ -0,0 +1,41 @@
|
|||
import hashlib
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
    """
    Calculates hashes for Media instances.

    The algorithm is selected through the "algorithm" config entry (see
    `configs`); the resulting digest is stored on each media item under the
    "hash" property as "<algorithm>:<hexdigest>".
    """
    name = "hash_enricher"

    # Number of bytes read from disk per iteration while hashing, so that
    # large media files are never loaded into memory in one piece.
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        # NOTE(review): self.algorithm is presumably populated from `config`
        # by the parent initialiser - confirm against Enricher/Step.
        super().__init__(config)
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."

    @staticmethod
    def configs() -> dict:
        """Return the configuration options this enricher accepts."""
        return {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Hash every media file in `to_enrich` and store the result on the media.

        Each file is read in `_CHUNK_SIZE` pieces and fed incrementally to the
        hash object. If `self.algorithm` is not one of the supported names
        nothing is hashed and no property is set.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        # config name -> hashlib constructor; keys mirror configs()["algorithm"]["choices"]
        hash_constructors = {
            "SHA-256": hashlib.sha256,
            "SHA3-512": hashlib.sha3_512,
        }
        new_hasher = hash_constructors.get(self.algorithm)
        if new_hasher is None:
            # unsupported algorithm: leave all media untouched
            return

        for m in to_enrich.media:
            hasher = new_hasher()
            with open(m.filename, "rb") as f:
                # incremental update gives the same digest as hashing the
                # whole content at once, without holding the file in memory
                while chunk := f.read(self._CHUNK_SIZE):
                    hasher.update(chunk)
            m.set("hash", f"{self.algorithm}:{hasher.hexdigest()}")
|
|
@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher):
|
|||
time.sleep(2)
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
|
|
|
@ -39,12 +39,29 @@
|
|||
.center {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.copy:hover {
|
||||
font-weight: 600;
|
||||
cursor: copy;
|
||||
}
|
||||
|
||||
#notification {
|
||||
position: fixed;
|
||||
right: 20px;
|
||||
top: 20px;
|
||||
background: aquamarine;
|
||||
box-shadow: 6px 8px 5px 0px #000000;
|
||||
padding: 10px;
|
||||
font-size: large;
|
||||
display: none;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="notification"></div>
|
||||
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
||||
<p><b>title:</b> '<span>{{ title }}</span>'</p>
|
||||
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
|
||||
<h2 class="center">content {{ media | length }} item(s)</h2>
|
||||
<table class="content">
|
||||
<tr>
|
||||
|
@ -55,21 +72,24 @@
|
|||
<tr>
|
||||
<td>
|
||||
<ul>
|
||||
{% if m.hash | length > 1 %}
|
||||
<li>hash: <span>{{ m.hash }}</span></li>
|
||||
{% endif %}
|
||||
<li>key: <span>{{ m.key }}</span></li>
|
||||
<li>type: <span>{{ m.mimetype }}</span></li>
|
||||
{% if m.id | length >0 %}
|
||||
<li>id: <span>{{ m.id }}</span></li>
|
||||
<li><b>key:</b> <span class="copy">{{ m.key }}</span></li>
|
||||
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
|
||||
|
||||
{% for prop in m.properties %}
|
||||
{% if m.properties[prop] | length > 1 %}
|
||||
<li><b>{{ prop }}:</b> <span class="copy">{{ m.properties[prop] }}</span></li>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
</td>
|
||||
<td>
|
||||
{% for url in m.urls %}
|
||||
{% if 'http' in url %}
|
||||
{% if 'image' in m.mimetype %}
|
||||
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
|
||||
<a href="{{ url }}">
|
||||
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
|
||||
</a>
|
||||
{% elif 'video' in m.mimetype %}
|
||||
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
|
||||
Your browser does not support the video element.
|
||||
|
@ -80,9 +100,12 @@
|
|||
Your browser does not support the audio element.
|
||||
</audio>
|
||||
{% else %}
|
||||
No preview available, please open the link.
|
||||
No preview available.
|
||||
{% endif %}
|
||||
<li><a href="{{ url }}">{{ url}}</a></li>
|
||||
{% endif %}
|
||||
<br>
|
||||
<a href="{{ url }}">open</a> or
|
||||
<a href="{{ url }}" download="">download</a>
|
||||
{% endfor %}
|
||||
</td>
|
||||
</tr>
|
||||
|
@ -97,7 +120,9 @@
|
|||
{% for key in metadata %}
|
||||
<tr>
|
||||
<td>{{ key }}</td>
|
||||
<td>{{ metadata[key] | urlize }}</td>
|
||||
<td>
|
||||
<span class="copy">{{ metadata[key] | urlize }}</span>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
|
@ -105,5 +130,33 @@
|
|||
<hr>
|
||||
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
|
||||
</body>
|
||||
<script defer>
|
||||
// notification logic
|
||||
const notification = document.getElementById("notification");
|
||||
|
||||
// Handle of the pending auto-hide timer; kept so that a newer notification
// can cancel it instead of being hidden early by a stale timeout.
let notificationHideTimer = null;

/**
 * Displays `message` in the notification element and hides it again after a delay.
 *
 * @param {string} message text to show
 * @param {number} [miliseconds] how long to keep it visible; falls back to 1000 when falsy
 */
function showNotification(message, miliseconds) {
    // cancel the hide scheduled by a previous call, if any
    clearTimeout(notificationHideTimer);

    notification.style.display = "block";
    notification.innerText = message;

    notificationHideTimer = setTimeout(() => {
        notification.style.display = "none";
        notification.innerText = "";
        notificationHideTimer = null;
    }, miliseconds || 1000);
}
|
||||
|
||||
// copy logic
|
||||
// Wire up every element marked with the "copy" class: clicking it issues a
// copy command, and the element's own "copy" listener replaces the clipboard
// payload with the element's text.
Array.from(document.querySelectorAll(".copy")).forEach((copyable) => {
    // NOTE(review): presumably the copy event triggered by execCommand is
    // dispatched at (or bubbles through) the clicked element - confirm in
    // the target browsers.
    copyable.onclick = () => {
        document.execCommand("copy");
    };

    copyable.addEventListener("copy", (event) => {
        // stop the browser from writing its default selection to the clipboard
        event.preventDefault();

        const clipboard = event.clipboardData;
        if (!clipboard) {
            return;
        }

        clipboard.setData("text/plain", copyable.textContent);
        console.log(clipboard.getData("text"));
        showNotification("copied...");
    });
});
|
||||
</script>
|
||||
|
||||
</html>
|
13
src/media.py
13
src/media.py
|
@ -8,12 +8,19 @@ import mimetypes
|
|||
|
||||
@dataclass
|
||||
class Media:
|
||||
# other properties eg: hash, id, exif, ...
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
id: str = "" # in case this type of media needs a special id, eg: screenshot
|
||||
# hash: str = None # TODO: added by enrichers
|
||||
urls: List[str] = field(default_factory=list)
|
||||
properties: dict = field(default_factory=dict)
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
    """Store `value` under `key` in this media's `properties` dict.

    Returns the same instance.
    """
    self.properties.update({key: value})
    return self
|
||||
|
||||
def get(self, key: str, default: Any = None) -> Any:
    """Look up `key` in this media's `properties` dict.

    Returns the stored value, or `default` when the key is absent.
    """
    props = self.properties
    return props[key] if key in props else default
|
||||
|
||||
def add_url(self, url: str) -> None:
|
||||
# url can be remote, local, ...
|
||||
|
|
|
@ -96,13 +96,16 @@ class Metadata:
|
|||
if iso: return ts.isoformat()
|
||||
return ts
|
||||
|
||||
def add_media(self, media: Media) -> Metadata:
|
||||
def add_media(self, media: Media, id: str = None) -> Metadata:
|
||||
# adds a new media, optionally including an id
|
||||
if media is None: return
|
||||
return self.media.append(media)
|
||||
if id is not None: media.set("id", id)
|
||||
self.media.append(media)
|
||||
return media
|
||||
|
||||
def get_media_by_id(self, id:str) -> Media:
|
||||
def get_media_by_id(self, id: str) -> Media:
|
||||
for m in self.media:
|
||||
if m.id == id: return m
|
||||
if m.get("id") == id: return m
|
||||
return None
|
||||
|
||||
def set_final_media(self, final: Media) -> Metadata:
|
||||
|
@ -113,7 +116,7 @@ class Metadata:
|
|||
return self
|
||||
|
||||
def get_single_media(self) -> Media:
|
||||
#TODO: could be refactored to use a custom media.id
|
||||
# TODO: could be refactored to use a custom media.id
|
||||
if self.final_media:
|
||||
return self.final_media
|
||||
return self.media[0]
|
||||
|
|
Ładowanie…
Reference in New Issue