kopia lustrzana https://github.com/bellingcat/auto-archiver
wayback should re-archive even if old version exists
rodzic
2be539d39e
commit
64c083b37b
|
@@ -9,6 +9,10 @@ from configs import WaybackConfig
|
||||||
|
|
||||||
|
|
||||||
class WaybackArchiver(Archiver):
|
class WaybackArchiver(Archiver):
|
||||||
|
"""
|
||||||
|
This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}"
|
||||||
|
but that might not be desirable since the webpage might have been archived a long time ago and thus have changed
|
||||||
|
"""
|
||||||
name = "wayback"
|
name = "wayback"
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
||||||
|
@@ -21,12 +25,6 @@ class WaybackArchiver(Archiver):
|
||||||
if check_if_exists:
|
if check_if_exists:
|
||||||
if url in self.seen_urls: return self.seen_urls[url]
|
if url in self.seen_urls: return self.seen_urls[url]
|
||||||
|
|
||||||
logger.debug(f"checking if {url=} already on archive.org")
|
|
||||||
archive_url = f"https://web.archive.org/web/{url}"
|
|
||||||
req = requests.get(archive_url)
|
|
||||||
if req.status_code == 200:
|
|
||||||
return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived')
|
|
||||||
|
|
||||||
screenshot = self.get_screenshot(url)
|
screenshot = self.get_screenshot(url)
|
||||||
logger.debug(f"POSTing {url=} to web.archive.org")
|
logger.debug(f"POSTing {url=} to web.archive.org")
|
||||||
ia_headers = {
|
ia_headers = {
|
||||||
|
@@ -66,11 +64,8 @@ class WaybackArchiver(Archiver):
|
||||||
return self.custom_retry(status_json, screenshot=screenshot)
|
return self.custom_retry(status_json, screenshot=screenshot)
|
||||||
|
|
||||||
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
|
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
|
||||||
return self.if_archived_return_with_screenshot(url, archive_url)
|
|
||||||
|
|
||||||
def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'):
|
|
||||||
try:
|
try:
|
||||||
if req is None:
|
|
||||||
req = requests.get(archive_url)
|
req = requests.get(archive_url)
|
||||||
parsed = BeautifulSoup(req.content, 'html.parser')
|
parsed = BeautifulSoup(req.content, 'html.parser')
|
||||||
title = parsed.find_all('title')[0].text
|
title = parsed.find_all('title')[0].text
|
||||||
|
@@ -78,8 +73,8 @@ class WaybackArchiver(Archiver):
|
||||||
title = 'Could not get title'
|
title = 'Could not get title'
|
||||||
except:
|
except:
|
||||||
title = "Could not get title"
|
title = "Could not get title"
|
||||||
screenshot = screenshot or self.get_screenshot(url)
|
screenshot = self.get_screenshot(url)
|
||||||
self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot)
|
self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||||
return self.seen_urls[url]
|
return self.seen_urls[url]
|
||||||
|
|
||||||
def custom_retry(self, json_data, **kwargs):
|
def custom_retry(self, json_data, **kwargs):
|
||||||
|
|
Ładowanie…
Reference in New Issue