kopia lustrzana https://github.com/bellingcat/auto-archiver
hash enricher and media refactor
rodzic
6ca46417fe
commit
74e50eccf1
|
@ -136,7 +136,7 @@ class TelethonArchiver(Archiverv2):
|
||||||
for i, om_url in enumerate(other_media_urls):
|
for i, om_url in enumerate(other_media_urls):
|
||||||
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
|
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
|
||||||
self.download_from_url(om_url, filename)
|
self.download_from_url(om_url, filename)
|
||||||
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
|
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||||
|
|
||||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
||||||
filename = self.client.download_media(mp.media, filename_dest)
|
filename = self.client.download_media(mp.media, filename_dest)
|
||||||
|
|
|
@ -57,7 +57,7 @@ class ConfigV2:
|
||||||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||||
config_path = f"{child.name}.{config}"
|
config_path = f"{child.name}.{config}"
|
||||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})")
|
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||||
self.defaults[config_path] = details["default"]
|
self.defaults[config_path] = details["default"]
|
||||||
if "cli_set" in details:
|
if "cli_set" in details:
|
||||||
self.cli_ops[config_path] = details["cli_set"]
|
self.cli_ops[config_path] = details["cli_set"]
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
from .enricher import Enricher
|
from .enricher import Enricher
|
||||||
from .screenshot_enricher import ScreenshotEnricher
|
from .screenshot_enricher import ScreenshotEnricher
|
||||||
from .wayback_enricher import WaybackEnricher
|
from .wayback_enricher import WaybackEnricher
|
||||||
|
from .hash_enricher import HashEnricher
|
|
@ -0,0 +1,41 @@
|
||||||
|
import hashlib
|
||||||
|
from utils import Webdriver
|
||||||
|
from . import Enricher
|
||||||
|
from metadata import Metadata
|
||||||
|
from loguru import logger
|
||||||
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
import time, requests
|
||||||
|
|
||||||
|
|
||||||
|
class HashEnricher(Enricher):
    """
    Calculates hashes for Media instances.

    Each media item gets a "hash" property of the form
    "<algorithm>:<hexdigest>", e.g. "SHA-256:ab12...".
    """
    name = "hash_enricher"

    # maps every supported `algorithm` config value to its hashlib constructor
    _HASHERS = {
        "SHA-256": hashlib.sha256,
        "SHA3-512": hashlib.sha3_512,
    }
    # files are hashed in chunks of this many bytes so that large media
    # (e.g. videos) never has to be loaded into memory in one piece
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, config: dict) -> None:
        """
        Initialise the enricher and validate the selected algorithm.

        Raises AssertionError when `self.algorithm` is not one of the
        choices declared in `configs()`.
        """
        # without this STEP.__init__ is not called
        # NOTE(review): `self.algorithm` is presumably populated from `config`
        # by the base-class initialiser - confirm against the Step class.
        super().__init__(config)
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."

    @staticmethod
    def configs() -> dict:
        """Return the configuration options this enricher exposes."""
        return {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Hash every media file of `to_enrich` with the configured algorithm
        and store the result on the media item under the "hash" key.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        # resolve the constructor once instead of re-testing it per media item
        new_hasher = self._HASHERS.get(self.algorithm)
        if new_hasher is None:
            # unsupported algorithm: no media item receives a hash
            return

        for m in to_enrich.media:
            digest = new_hasher()
            with open(m.filename, "rb") as f:
                # feed the file incrementally; the digest is identical to
                # hashing the whole content at once
                while chunk := f.read(self._CHUNK_SIZE):
                    digest.update(chunk)
            m.set("hash", f"{self.algorithm}:{digest.hexdigest()}")
|
|
@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher):
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||||
driver.save_screenshot(screenshot_file)
|
driver.save_screenshot(screenshot_file)
|
||||||
to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
|
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
logger.info("TimeoutException loading page for screenshot")
|
logger.info("TimeoutException loading page for screenshot")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
@ -39,12 +39,29 @@
|
||||||
.center {
|
.center {
|
||||||
text-align: center;
|
text-align: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.copy:hover {
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
#notification {
|
||||||
|
position: fixed;
|
||||||
|
right: 20px;
|
||||||
|
top: 20px;
|
||||||
|
background: aquamarine;
|
||||||
|
box-shadow: 6px 8px 5px 0px #000000;
|
||||||
|
padding: 10px;
|
||||||
|
font-size: large;
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
|
<div id="notification"></div>
|
||||||
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
||||||
<p><b>title:</b> '<span>{{ title }}</span>'</p>
|
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
|
||||||
<h2 class="center">content {{ media | length }} item(s)</h2>
|
<h2 class="center">content {{ media | length }} item(s)</h2>
|
||||||
<table class="content">
|
<table class="content">
|
||||||
<tr>
|
<tr>
|
||||||
|
@ -55,21 +72,24 @@
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
<ul>
|
<ul>
|
||||||
{% if m.hash | length > 1 %}
|
<li><b>key:</b> <span class="copy">{{ m.key }}</span></li>
|
||||||
<li>hash: <span>{{ m.hash }}</span></li>
|
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
|
||||||
{% endif %}
|
|
||||||
<li>key: <span>{{ m.key }}</span></li>
|
{% for prop in m.properties %}
|
||||||
<li>type: <span>{{ m.mimetype }}</span></li>
|
{% if m.properties[prop] | length > 1 %}
|
||||||
{% if m.id | length >0 %}
|
<li><b>{{ prop }}:</b> <span class="copy">{{ m.properties[prop] }}</span></li>
|
||||||
<li>id: <span>{{ m.id }}</span></li>
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
{% for url in m.urls %}
|
{% for url in m.urls %}
|
||||||
|
{% if 'http' in url %}
|
||||||
{% if 'image' in m.mimetype %}
|
{% if 'image' in m.mimetype %}
|
||||||
|
<a href="{{ url }}">
|
||||||
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
|
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
|
||||||
|
</a>
|
||||||
{% elif 'video' in m.mimetype %}
|
{% elif 'video' in m.mimetype %}
|
||||||
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
|
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
|
||||||
Your browser does not support the video element.
|
Your browser does not support the video element.
|
||||||
|
@ -80,9 +100,12 @@
|
||||||
Your browser does not support the audio element.
|
Your browser does not support the audio element.
|
||||||
</audio>
|
</audio>
|
||||||
{% else %}
|
{% else %}
|
||||||
No preview available, please open the link.
|
No preview available.
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<li><a href="{{ url }}">{{ url}}</a></li>
|
{% endif %}
|
||||||
|
<br>
|
||||||
|
<a href="{{ url }}">open</a> or
|
||||||
|
<a href="{{ url }}" download="">download</a>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
@ -97,7 +120,9 @@
|
||||||
{% for key in metadata %}
|
{% for key in metadata %}
|
||||||
<tr>
|
<tr>
|
||||||
<td>{{ key }}</td>
|
<td>{{ key }}</td>
|
||||||
<td>{{ metadata[key] | urlize }}</td>
|
<td>
|
||||||
|
<span class="copy">{{ metadata[key] | urlize }}</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</table>
|
</table>
|
||||||
|
@ -105,5 +130,33 @@
|
||||||
<hr>
|
<hr>
|
||||||
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
|
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
|
||||||
</body>
|
</body>
|
||||||
|
<script defer>
|
||||||
|
// notification logic
|
||||||
|
const notification = document.getElementById("notification");
|
||||||
|
|
||||||
|
// Handle of the pending "hide" timer, kept so a newer notification can cancel it.
let notificationTimer = null;

/**
 * Show `message` in the notification element, then hide and clear it again.
 *
 * @param {string} message - text to display
 * @param {number} [miliseconds] - how long to keep it visible; falls back to 1000
 */
function showNotification(message, miliseconds) {
    // Cancel the hide scheduled by a previous call; otherwise that older
    // timer would fire first and hide this newer message too early.
    clearTimeout(notificationTimer);

    notification.style.display = "block";
    notification.innerText = message;

    notificationTimer = setTimeout(() => {
        notification.style.display = "none";
        notification.innerText = "";
        notificationTimer = null;
    }, miliseconds || 1000);
}
|
||||||
|
|
||||||
|
// copy logic
|
||||||
|
// Clicking any ".copy" element puts its full text content on the clipboard.
document.querySelectorAll(".copy").forEach((el) => {
    // A click triggers a programmatic copy, which fires the "copy" event below.
    // NOTE(review): document.execCommand is deprecated; consider migrating to
    // navigator.clipboard.writeText once a secure context can be assumed.
    // addEventListener (instead of assigning onclick) leaves other click
    // handlers on the element intact.
    el.addEventListener("click", () => {
        document.execCommand("copy");
    });

    el.addEventListener("copy", (e) => {
        // Without clipboardData there is nothing to override, so leave the
        // browser's native copy behaviour untouched.
        if (!e.clipboardData) {
            return;
        }
        e.preventDefault();
        e.clipboardData.setData("text/plain", el.textContent);
        showNotification("copied...");
    });
});
|
||||||
|
</script>
|
||||||
|
|
||||||
</html>
|
</html>
|
13
src/media.py
13
src/media.py
|
@ -8,12 +8,19 @@ import mimetypes
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Media:
|
class Media:
|
||||||
|
# other properties eg: hash, id, exif, ...
|
||||||
filename: str
|
filename: str
|
||||||
key: str = None
|
key: str = None
|
||||||
urls: List[str] = field(default_factory=list)
|
|
||||||
_mimetype: str = None # eg: image/jpeg
|
_mimetype: str = None # eg: image/jpeg
|
||||||
id: str = "" # in case this type of media needs a special id, eg: screenshot
|
urls: List[str] = field(default_factory=list)
|
||||||
# hash: str = None # TODO: added by enrichers
|
properties: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
def set(self, key: str, value: Any) -> Media:
|
||||||
|
self.properties[key] = value
|
||||||
|
return self
|
||||||
|
|
||||||
|
def get(self, key: str, default: Any = None) -> Any:
|
||||||
|
return self.properties.get(key, default)
|
||||||
|
|
||||||
def add_url(self, url: str) -> None:
|
def add_url(self, url: str) -> None:
|
||||||
# url can be remote, local, ...
|
# url can be remote, local, ...
|
||||||
|
|
|
@ -96,13 +96,16 @@ class Metadata:
|
||||||
if iso: return ts.isoformat()
|
if iso: return ts.isoformat()
|
||||||
return ts
|
return ts
|
||||||
|
|
||||||
def add_media(self, media: Media) -> Metadata:
|
def add_media(self, media: Media, id: str = None) -> Metadata:
|
||||||
|
# adds a new media, optionally including an id
|
||||||
if media is None: return
|
if media is None: return
|
||||||
return self.media.append(media)
|
if id is not None: media.set("id", id)
|
||||||
|
self.media.append(media)
|
||||||
|
return media
|
||||||
|
|
||||||
def get_media_by_id(self, id: str) -> Media:
|
def get_media_by_id(self, id: str) -> Media:
|
||||||
for m in self.media:
|
for m in self.media:
|
||||||
if m.id == id: return m
|
if m.get("id") == id: return m
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def set_final_media(self, final: Media) -> Metadata:
|
def set_final_media(self, final: Media) -> Metadata:
|
||||||
|
|
Ładowanie…
Reference in New Issue