Mirror of https://github.com/bellingcat/auto-archiver

parent ecd8f7d8b9
commit 22c1fb09fd
@@ -2,6 +2,7 @@
     "version": "0.2.0",
     "configurations": [
         {
             "name": "Test Hashing",
             "type": "python",
@@ -9,7 +10,7 @@
             "program": "auto_archive.py",
             "console": "integratedTerminal",
             "justMyCode": true,
-            "args": ["--sheet","Test Hashing"]
+            "args": ["--sheet","Test Hashing","--use-filenumber-as-directory=True"]
         },
         {
             "name": "Python: auto_archive CIR --sheet",
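Launching the "Test Hashing" configuration above is equivalent to running the script directly (a sketch of the command line, with the sheet name taken from the args):

    python auto_archive.py --sheet "Test Hashing" --use-filenumber-as-directory=True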
@@ -42,7 +42,9 @@ class Archiver(ABC):
         return self.__class__.__name__

     @abstractmethod
-    def download(self, url, check_if_exists=False): pass
+    # def download(self, url, check_if_exists=False): pass
+    # DM add feature flag
+    def download(self, url, check_if_exists=False, filenumber=None): pass

     def get_netloc(self, url):
         return urlparse(url).netloc
@@ -50,9 +52,8 @@ class Archiver(ABC):
     def get_html_key(self, url):
         return self.get_key(urlparse(url).path.replace("/", "_") + ".html")

-    # DM added UTF
-    # https://github.com/bellingcat/auto-archiver/pull/21/commits/576f1a8f687199cf38864f7271b9a63e65de8692
-    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
+    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
+    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
         page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
     <body>
         <h2>Archived media from {self.name}</h2>
@@ -66,18 +67,31 @@ class Archiver(ABC):

         page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
         page_filename = 'tmp/' + page_key
-        page_cdn = self.storage.get_cdn_url(page_key)
+
+        # DM feature flag
+        # page_cdn gets written to the spreadsheet
+        if filenumber is None:
+            page_cdn = self.storage.get_cdn_url(page_key)
+        else:
+            page_cdn = self.storage.get_cdn_url(filenumber + "/" + page_key)

         with open(page_filename, "w") as f:
             f.write(page)

         page_hash = self.get_hash(page_filename)

+        # DM feature flag
+        if filenumber != "":
+            logger.debug(f'filenumber for directory is {filenumber}')
+            page_key = filenumber + "/" + page_key
+
         self.storage.upload(page_filename, page_key, extra_args={
             'ACL': 'public-read', 'ContentType': 'text/html'})
         return (page_cdn, page_hash, thumbnail)

-    def generate_media_page(self, urls, url, object):
+    # def generate_media_page(self, urls, url, object):
+    # eg images in a tweet save to cloud storage
+    def generate_media_page(self, urls, url, object, filenumber=None):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
         }
@@ -92,19 +106,32 @@ class Archiver(ABC):

             filename = 'tmp/' + key

+            # eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
             d = requests.get(media_url, headers=headers)
             with open(filename, 'wb') as f:
                 f.write(d.content)

+            # DM feature flag
+            if filenumber is not None:
+                logger.debug(f'filenumber for directory is {filenumber}')
+                key = filenumber + "/" + key
+
+            # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
+            # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
+            # or key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
             self.storage.upload(filename, key)

+            # file will be in storage now as: twitter__media_FM7-ggCUYAQHKWW.jpg
+
             hash = self.get_hash(filename)
+            # eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
             cdn_url = self.storage.get_cdn_url(key)

             if thumbnail is None:
                 thumbnail = cdn_url
             uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})

-        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
+        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)

     def get_key(self, filename):
         """
@@ -130,7 +157,8 @@ class Archiver(ABC):
         f.close()
         return hash.hexdigest()

-    def get_screenshot(self, url):
+    # eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
+    def get_screenshot(self, url, filenumber):
         key = self.get_key(urlparse(url).path.replace(
             "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
         filename = 'tmp/' + key
@@ -157,14 +185,15 @@ class Archiver(ABC):

         self.driver.save_screenshot(filename)

         # want to reset so that the next call to selenium doesn't have cookies?
         # function needs to be idempotent
+        if filenumber is not None:
+            logger.debug(f'filenumber for directory is {filenumber}')
+            key = filenumber + "/" + key
+
         self.storage.upload(filename, key, extra_args={
             'ACL': 'public-read', 'ContentType': 'image/png'})
         return self.storage.get_cdn_url(key)

-    def get_thumbnails(self, filename, key, duration=None):
+    def get_thumbnails(self, filename, key, duration=None, filenumber=None):
         thumbnails_folder = filename.split('.')[0] + '/'
         key_folder = key.split('.')[0] + '/'
@@ -192,6 +221,10 @@ class Archiver(ABC):
             thumbnail_filename = thumbnails_folder + fname
             key = key_folder + fname

+            # DM feature flag
+            # if filenumber is not None:
+            #     key = filenumber + "/" + key
+
             cdn_url = self.storage.get_cdn_url(key)

             self.storage.upload(thumbnail_filename, key)
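Note: the same guard recurs in every upload path in this commit - when a filenumber is present it is prepended to the storage key, so each object lands in a per-file subfolder. A minimal sketch of the pattern (a hypothetical helper, not part of this commit):

    def prefixed_key(key, filenumber=None):
        # eg ('twitter__media_X.jpg', 'SM3013') -> 'SM3013/twitter__media_X.jpg'
        return filenumber + "/" + key if filenumber else key

Note the mixed sentinels, though: most call sites test filenumber is not None, generate_media_page_html tests filenumber != "", and some download() signatures below default the parameter to "" rather than None; the "todo fix" comment in TwitterArchiver appears to acknowledge this.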
@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TelegramArchiver(Archiver):
     name = "telegram"

-    def download(self, url, check_if_exists=False):
+    def download(self, url, check_if_exists=False, filenumber=""):
         # detect URLs that we definitely cannot handle
         if 't.me' != self.get_netloc(url):
             return False
@@ -41,7 +41,7 @@ class TelethonArchiver(Archiver):
             media.append(post)
         return media

-    def download(self, url, check_if_exists=False):
+    def download(self, url, check_if_exists=False, filenumber=None):
         # detect URLs that we definitely cannot handle
         matches = self.link_pattern.findall(url)
         if not len(matches):
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
 class TiktokArchiver(Archiver):
     name = "tiktok"

-    def download(self, url, check_if_exists=False):
+    def download(self, url, check_if_exists=False, filenumber=""):
         if 'tiktok.com' not in url:
             return False
@@ -11,7 +11,11 @@ import traceback
 class TwitterArchiver(Archiver):
     name = "twitter"

-    def download(self, url, check_if_exists=False):
+    # DM added filenumber param - todo fix ""
+    def download(self, url, check_if_exists=False, filenumber=None):
+        if filenumber is not None:
+            logger.debug(f'filenumber is {filenumber}')
+
         if 'twitter.com' != self.get_netloc(url):
             return False
@@ -29,8 +33,7 @@ class TwitterArchiver(Archiver):
         # except:
         except Exception as e:
-            # logger.warning('wah wah')
-            # DM can happen if a media sensitive tweet
-            # logger.warning(f'Exception in twitter_archiver - traceback: {traceback.format_exc()}')
+            # DM
+            logger.warning(f'TwitterArchiver cant get tweet for url {url} - can happen if a media sensitive tweet: \n{traceback.format_exc()}')
             return False
@@ -59,8 +62,9 @@ class TwitterArchiver(Archiver):
         else:
             logger.warning(f"Could not get media URL of {media}")

-        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
+        # page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
+        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)

-        screenshot = self.get_screenshot(url)
+        screenshot = self.get_screenshot(url, filenumber)

         return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
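Note: TwitterArchiver is the most fully wired example of the feature in this commit - filenumber arrives in download(), is logged, then flows into generate_media_page() (and from there into generate_media_page_html()) and get_screenshot(), so every artefact for the tweet shares the same subfolder.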
@@ -14,7 +14,7 @@ class WaybackArchiver(Archiver):
         super(WaybackArchiver, self).__init__(storage, driver)
         self.seen_urls = {}

-    def download(self, url, check_if_exists=False):
+    def download(self, url, check_if_exists=False, filenumber=None):
         if check_if_exists and url in self.seen_urls:
             return self.seen_urls[url]
@@ -75,7 +75,7 @@ class WaybackArchiver(Archiver):
         except:
             title = "Could not get title"

-        screenshot = self.get_screenshot(url)
+        screenshot = self.get_screenshot(url, filenumber)
         result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
         self.seen_urls[url] = result
         return result
@@ -16,7 +16,7 @@ class YoutubeDLArchiver(Archiver):
         super().__init__(storage, driver)
         self.fb_cookie = fb_cookie

-    def download(self, url, check_if_exists=False):
+    def download(self, url, check_if_exists=False, filenumber=None):
         netloc = self.get_netloc(url)
         # DM to set env variable: export FB_COOKIE="paste"
         # this gets blanked at the end of each session ie when vs code closes
@@ -69,6 +69,10 @@ class YoutubeDLArchiver(Archiver):

         key = self.get_key(filename)

+        # DM feature flag
+        if filenumber is not None:
+            key = filenumber + "/" + key
+
         if self.storage.exists(key):
             status = 'already archived'
             cdn_url = self.storage.get_cdn_url(key)
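Note: the ordering here matters - the filenumber prefix is applied to the key before the self.storage.exists(key) check, so "already archived" detection looks in the per-file subfolder rather than at the bucket root.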
@@ -92,19 +96,24 @@ class YoutubeDLArchiver(Archiver):

         if status != 'already archived':
             key = self.get_key(filename)

+            # DM feature flag
+            if filenumber is not None:
+                key = filenumber + "/" + key
+
             cdn_url = self.storage.get_cdn_url(key)

             self.storage.upload(filename, key)

         hash = self.get_hash(filename)
-        screenshot = self.get_screenshot(url)
+        screenshot = self.get_screenshot(url, filenumber)

         # get duration
         duration = info.get('duration')

         # get thumbnails
         try:
-            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
+            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration, filenumber=filenumber)
         except:
             key_thumb = ''
             thumb_index = 'Could not generate thumbnails'
@@ -67,7 +67,7 @@ def expand_url(url):
     return url


-def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
+def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES, usefilenumber=False):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
@@ -117,6 +117,18 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):

             url = expand_url(url)

+            # DM feature flag
+            if usefilenumber:
+                filenumber = gw.get_cell(row, 'filenumber')
+                logger.debug(f'filenumber is {filenumber}')
+                if filenumber == "":
+                    logger.warning(f'Logic error - the feature flag for usefilenumber is True, yet cant find a corresponding filenumber')
+                    gw.set_cell(row, 'status', 'Missing filenumber')
+                    continue
+            else:
+                # We use this throughout the app to decide where to save
+                filenumber = None
+
             # DM make a new driver every row so idempotent
             # otherwise cookies will be remembered
             options = webdriver.FirefoxOptions()
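Note: filenumber therefore carries two meanings through the rest of the run - a non-empty string means "save this row's files under that subfolder", and None means the feature is off and every archiver falls back to its old flat layout.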
@@ -142,10 +154,10 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
                 logger.debug(f'Trying {archiver} on row {row}')

                 try:
-                    result = archiver.download(url, check_if_exists=True)
+                    # DM: filenumber="" if not to be used
+                    result = archiver.download(url, check_if_exists=True, filenumber=filenumber)
                 except Exception as e:
                     result = False
+                    # DM loguru writes tracebacks to file, so this traceback may be superfluous
                     logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')

                 if result:
@@ -190,6 +202,7 @@ def main():
     parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
     parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
     parser.add_argument('--private', action='store_true', help='Store content without public access permission')
+    parser.add_argument('--use-filenumber-as-directory', action='store', dest='usefilenumber', default=False, type=bool, help='defaults to False; when True, files are saved to a subfolder on cloud storage named with the File Number, eg SM3012')

     for k, v in GWorksheet.COLUMN_NAMES.items():
         parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})')
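Note: argparse's type=bool is a well-known trap - bool() on any non-empty string is True, so even --use-filenumber-as-directory=False enables the flag. A minimal demonstration of the behaviour:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--use-filenumber-as-directory', dest='usefilenumber',
                        default=False, type=bool)

    # bool("False") is True, so the flag is enabled either way
    print(parser.parse_args(['--use-filenumber-as-directory=True']).usefilenumber)   # True
    print(parser.parse_args(['--use-filenumber-as-directory=False']).usefilenumber)  # True
    print(parser.parse_args([]).usefilenumber)                                       # False (the default)

This is consistent with the launch.json above passing --use-filenumber-as-directory=True: the value string is effectively ignored, and omitting the argument is the only way to keep the feature off.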
@@ -197,10 +210,11 @@ def main():
     args = parser.parse_args()
     config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}

-    logger.info(f'Opening document {args.sheet} for header {args.header}')
+    logger.info(f'Opening document {args.sheet} for header {args.header} using filenumber: {args.usefilenumber}')

     mkdir_if_not_exists('tmp')
-    process_sheet(args.sheet, header=args.header, columns=config_columns)
+    # DM added a feature flag for usefilenumber
+    process_sheet(args.sheet, header=args.header, columns=config_columns, usefilenumber=args.usefilenumber)
     shutil.rmtree('tmp')
@@ -8,7 +8,9 @@ class GWorksheet:
     should always include the offset of the header.
     eg: if header=4, row 5 will be the first with data.
     """

     COLUMN_NAMES = {
+        'filenumber': 'file number',
         'url': 'link',
         'archive': 'archive location',
         'date': 'archive date',
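Note: the new 'filenumber' entry maps the spreadsheet column headed "file number" to the key that process_sheet reads via gw.get_cell(row, 'filenumber'). Like every other COLUMN_NAMES entry it gets an auto-generated override flag, so a sheet with a differently named column could be handled with something like (hypothetical column name):

    python auto_archive.py --sheet "Test Hashing" --use-filenumber-as-directory=True --col-filenumber "case number"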