kopia lustrzana https://github.com/bellingcat/auto-archiver
adds browsertrix to all archivers flows
rodzic
20ca50dc90
commit
dc0ca8bdd6
|
@ -86,4 +86,4 @@ class TelegramArchiver(Archiver):
|
|||
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
|
||||
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
|
||||
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
|
||||
|
|
|
@ -74,6 +74,7 @@ class TelethonArchiver(Archiver):
|
|||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
wacz = self.get_wacz(url)
|
||||
|
||||
if len(media_posts) > 0:
|
||||
key = self.get_html_key(url)
|
||||
|
@ -81,7 +82,7 @@ class TelethonArchiver(Archiver):
|
|||
if check_if_exists and self.storage.exists(key):
|
||||
# only s3 storage supports storage.exists as not implemented on gd
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
|
||||
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
|
||||
|
||||
key_thumb, thumb_index = None, None
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
|
@ -120,7 +121,7 @@ class TelethonArchiver(Archiver):
|
|||
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
|
||||
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
|
||||
|
|
|
@ -70,5 +70,6 @@ class TwitterApiArchiver(TwitterArchiver):
|
|||
}, ensure_ascii=False, indent=4)
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
wacz = self.get_wacz(url)
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
|
||||
|
|
|
@ -85,8 +85,9 @@ class TwitterArchiver(Archiver):
|
|||
|
||||
timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
screenshot = self.get_screenshot(url)
|
||||
wacz = self.get_wacz(url)
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
|
||||
|
||||
def choose_variant(self, variants):
|
||||
# choosing the highest quality possible
|
||||
|
|
|
@ -70,4 +70,5 @@ class VkArchiver(Archiver):
|
|||
page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
|
||||
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
|
||||
screenshot = self.get_screenshot(url)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)
|
||||
wacz = self.get_wacz(url)
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
|
||||
|
|
|
@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
|
|||
retries += 1
|
||||
|
||||
if status_r.status_code != 200:
|
||||
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
|
||||
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
|
||||
|
||||
status_json = status_r.json()
|
||||
if status_json['status'] != 'success':
|
||||
|
|
Ładowanie…
Reference in New Issue