mirror of https://github.com/bellingcat/auto-archiver

wayback enricher ready

parent d4825196f1
commit 0cb593fd21
@@ -121,7 +121,7 @@ class TelethonArchiver(Archiverv2):
        media_posts = self._get_media_posts_in_group(chat, post)
        logger.debug(f'got {len(media_posts)=} for {url=}')

-        tmp_dir = item.get("tmp_dir")
+        tmp_dir = item.get_tmp_dir()

        group_id = post.grouped_id if post.grouped_id is not None else post.id
        title = post.message
@@ -68,13 +68,15 @@ class GsheetsDb(Database):
        batch_if_valid('title', item.get_title())
        batch_if_valid('text', item.get("content", "")[:500])
        batch_if_valid('timestamp', item.get_timestamp())
+        if (screenshot := item.get_media_by_id("screenshot")):
+            batch_if_valid('screenshot', screenshot.cdn_url)
        # batch_if_valid('status', item.status)

        # TODO: AFTER ENRICHMENTS
        # batch_if_valid('hash', media.hash)
        # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
        # batch_if_valid('thumbnail_index', result.thumbnail_index)
        # batch_if_valid('duration', result.duration, str(result.duration))
        # batch_if_valid('screenshot', result.screenshot)
        # if result.wacz is not None:
        #     batch_if_valid('wacz', result.wacz)
        #     batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
@@ -91,5 +93,5 @@ class GsheetsDb(Database):
    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        gw: GWorksheet = item.get("gsheet").get("worksheet")
        row: int = item.get("gsheet").get("row")
-        #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
        return gw, row
@@ -1,2 +1,3 @@
from .enricher import Enricher
from .screenshot_enricher import ScreenshotEnricher
+from .wayback_enricher import WaybackEnricher
@@ -18,4 +18,4 @@ class Enricher(Step, ABC):
        return Step.init(name, config, Enricher)

    @abstractmethod
-    def enrich(self, item: Metadata) -> Metadata: pass
+    def enrich(self, to_enrich: Metadata) -> None: pass
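The tightened signature above (enrich(self, to_enrich: Metadata) -> None) means enrichers now mutate the Metadata they receive instead of returning a new instance. A minimal sketch of a conforming subclass, with a hypothetical name and behavior, using only the interface shown in this commit:

from metadata import Metadata
from . import Enricher


class NoopEnricher(Enricher):
    name = "noop_enricher"

    @staticmethod
    def configs() -> dict:
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        # mutate in place; the orchestrator now calls e.enrich(result) without merging
        to_enrich.set("noop", True)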
@@ -1,13 +1,14 @@
+from media import Media
from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
+import time, uuid, os
from selenium.common.exceptions import TimeoutException
-import time


class ScreenshotEnricher(Enricher):
-    name = "screenshot"
+    name = "screenshot_enricher"

    @staticmethod
    def configs() -> dict:
@@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher):
            "timeout": {"default": 60, "help": "timeout for taking the screenshot"}
        }

-    def enrich(self, item: Metadata) -> Metadata:
-        url = self.get_url(item)
-        print(f"enriching {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        logger.debug(f"Enriching screenshot for {url=}")
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
            try:
                driver.get(url)
                time.sleep(2)
+                screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
+                driver.save_screenshot(screenshot_file)
+                to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
-
-            #TODO: return saved object
-            driver.save_screenshot("TODO-HASH_OR_UUID.png")
-            return None
+            except Exception as e:
+                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+        # return None
@@ -0,0 +1,68 @@
+from utils import Webdriver
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
+import time, requests
+
+
+class WaybackEnricher(Enricher):
+    """
+    Submits the current URL to the webarchive and returns a job_id or completed archive
+    """
+    name = "wayback_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        assert type(self.key) == str and len(self.key) > 0, "please provide a value for the wayback_enricher API key"
+        assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+            "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
+            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+        logger.debug(f"Enriching wayback for {url=}")
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": f"LOW {self.key}:{self.secret}"
+        }
+        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
+            to_enrich.set("wayback", em)
+            return
+
+        # check job status
+        job_id = r.json()['job_id']
+
+        # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
+        start_time = time.time()
+        wayback_url = False
+        attempt = 1
+        while not wayback_url and time.time() - start_time <= self.timeout:
+            try:
+                logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
+                r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
+                r_json = r_status.json()
+                if r_status.status_code == 200 and r_json['status'] == 'success':
+                    wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
+            except Exception as e:
+                logger.warning(f"error fetching status for {url=} due to: {e}")
+            if not wayback_url:
+                attempt += 1
+                time.sleep(1)  # TODO: can be improved with exponential backoff
+
+        if wayback_url:
+            to_enrich.set("wayback", wayback_url)
+        else:
+            to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
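For reference, the save-and-poll flow the new enricher implements can be exercised on its own. This is a minimal sketch against the same web.archive.org Save Page Now endpoints used above; the function name and standalone structure are illustrative, not part of the codebase:

import time
import requests


def wayback_snapshot(url: str, key: str, secret: str, timeout: int = 5):
    """Submit url to Save Page Now, then poll the status endpoint until
    the capture succeeds or `timeout` seconds elapse."""
    headers = {"Accept": "application/json", "Authorization": f"LOW {key}:{secret}"}
    r = requests.post("https://web.archive.org/save/", headers=headers, data={"url": url})
    if r.status_code != 200:
        return {"error": f"{r.status_code}: {r.json()}"}
    job_id = r.json()["job_id"]
    start = time.time()
    while time.time() - start <= timeout:
        status = requests.get(f"https://web.archive.org/save/status/{job_id}", headers=headers).json()
        if status.get("status") == "success":
            return f"https://web.archive.org/web/{status['timestamp']}/{status['original_url']}"
        time.sleep(1)  # the enricher marks exponential backoff as a TODO
    # capture still pending: hand back the job so callers can check later
    return {"job_id": job_id, "check_status": f"https://web.archive.org/save/status/{job_id}"}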
@@ -30,7 +30,7 @@ class HtmlFormatter(Formatter):
            media=item.media,
            metadata=item.get_clean_metadata()
        )
-        html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html")
+        html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        return Media(filename=html_path)
@@ -60,6 +60,9 @@
                {% endif %}
                <li>key: <span>{{ m.key }}</span></li>
                <li>type: <span>{{ m.mimetype }}</span></li>
+                {% if m.id | length > 0 %}
+                <li>id: <span>{{ m.id }}</span></li>
+                {% endif %}
            </ul>

        </td>
@@ -91,11 +94,13 @@
        {% for key in metadata %}
        <tr>
            <td>{{ key }}</td>
-            <td>{{ metadata[key] }}</td>
+            <td>{{ metadata[key] | urlize }}</td>
        </tr>
        {% endfor %}
    </table>

+    <hr>
+    <p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
</body>

</html>
@@ -12,10 +12,13 @@ class Media:
    key: str = None
    cdn_url: str = None
    mimetype: str = None  # eg: image/jpeg
-    # id: str = None
+    id: str = None  # in case this type of media needs a special id, eg: screenshot
+    # hash: str = None  # TODO: added by enrichers

    def set_mimetype(self) -> Media:
        if not self.mimetype:
            self.mimetype = mimetypes.guess_type(self.filename)[0]
        return self
+
+    def is_video(self) -> bool:
+        return self.mimetype.startswith("video")
@@ -28,9 +28,12 @@ class Metadata:
        """
        merges two Metadata instances, will overwrite according to overwrite_left flag
        """
        if right is None: return self
        if overwrite_left:
-            self.status = right.status
+            if right.status and len(right.status):
+                self.status = right.status
+            self.rearchivable |= right.rearchivable
+            self.tmp_keys |= right.tmp_keys
            for k, v in right.metadata.items():
                assert k not in self.metadata or type(v) == type(self.get(k))
                if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -76,6 +79,12 @@ class Metadata:
    def get_title(self) -> str:
        return self.get("title")

+    def set_tmp_dir(self, tmp_dir: str) -> Metadata:
+        return self.set("tmp_dir", tmp_dir, True)
+
+    def get_tmp_dir(self) -> str:
+        return self.get("tmp_dir")
+
    def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
        assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
        return self.set("timestamp", timestamp)
@@ -88,9 +97,15 @@ class Metadata:
        return ts

    def add_media(self, media: Media) -> Metadata:
+        if media is None: return
+        media.set_mimetype()
        return self.media.append(media)

+    def get_media_by_id(self, id: str) -> Media:
+        for m in self.media:
+            if m.id == id: return m
+        return None
+
    def set_final_media(self, final: Media) -> Metadata:
        if final:
            if self.final_media:
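These two additions are the hand-off used elsewhere in this commit: the screenshot enricher attaches a Media with id="screenshot", and GsheetsDb retrieves it via get_media_by_id. A sketch of that round trip, noting that the bare Metadata() construction is an assumption not shown in this diff:

from media import Media
from metadata import Metadata

item = Metadata()  # bare construction assumed; not shown in this diff
# enricher side: attach an identified media file (add_media also sets its mimetype)
item.add_media(Media(filename="screenshot_ab12cd34.png", id="screenshot"))
# database side: look it up by id, as GsheetsDb does in this commit
if (screenshot := item.get_media_by_id("screenshot")):
    print(screenshot.mimetype)  # eg: image/png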
@@ -100,6 +115,7 @@ class Metadata:
        return self

    def get_single_media(self) -> Media:
+        #TODO: could be refactored to use a custom media.id
        if self.final_media:
            return self.final_media
        return self.media[0]
@@ -52,74 +52,6 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""

-# @dataclass
-# class Metadata:
-#     # does not handle files, only primitives
-#     # the only piece of logic to handle files is the archiver, enricher, and storage
-#     status: str
-#     # title: str
-#     # url: str
-#     # hash: str
-#     main_file: Metadata
-#     metadata: Dict[str, Metadata]
-
-#     @staticmethod
-#     def merge(left, right: Metadata, overwrite_left=True) -> Metadata:
-#         # should return a merged version of the Metadata
-#         # will work for archived() and enriched()
-#         # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
-#         pass
-
-#     def get(self, key) -> Union[Metadata, str]:
-#         # goes through metadata and returns the Metadata available
-#         pass
-
-#     def as_json(self) -> str:
-#         # converts all metadata and data into JSON
-#         pass
-
-
-"""
-@dataclass
-class ArchiveResult:
-    # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
-    status: str
-    url: str
-    metadata: Metadata
-    # title, url, hash, other={}
-    # cdn_url: str = None
-    # thumbnail: str = None
-    # thumbnail_index: str = None
-    # duration: float = None
-    # title: str = None
-    # timestamp: datetime.datetime = None
-    # screenshot: str = None
-    # wacz: str = None
-    # hash: str = None
-    # media: list = field(default_factory=list)
-
-    def __init__(self) -> None: pass
-
-    def update(self, metadata) -> None:
-        # receive a Metadata instance and update itself with it!
-        pass
-
-    def as_json(self) -> str:
-        # converts all metadata and data into JSON
-        pass
-"""
-
-"""
-There is a Superclass for:
-* Database (should_process)
-
-How can GSheets work? it needs to feed from a READER (GSheets Feeder)
-
-Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
-The context metadata should include a temporary folder (maybe a LocalStorage instance?)
-"""
-

class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@@ -128,7 +60,7 @@ class ArchivingOrchestrator:

        # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
        # where does that update/processing happen? in config.py
-        # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
+        # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
        # self.archivers = [
        #     Archiver.init(a, config)
        #     for a in config.archivers
@@ -166,7 +98,7 @@ class ArchivingOrchestrator:
        print("ARCHIVING", item)
        try:
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                item.set("tmp_dir", tmp_dir, True)
+                item.set_tmp_dir(tmp_dir)
                result = self.archive(item)
                print(result)
        except KeyboardInterrupt:
@@ -226,6 +158,7 @@ class ArchivingOrchestrator:
                # do they need to be refreshed with every execution?
                # this is where the Hashes come from, the place with access to all content
                # the archiver does not have access to storage
+                # a.download(result) # TODO: refactor so there's not merge here
                result.merge(a.download(result))
                # TODO: fix logic
                if True or result.is_success(): break
@@ -237,7 +170,7 @@ class ArchivingOrchestrator:
        # maybe as a PDF? or a Markdown file
        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
        for e in self.enrichers:
-            result.merge(e.enrich(result))
+            e.enrich(result)

        # store media
        unstored_media = result.media[::]