kopia lustrzana https://github.com/bellingcat/auto-archiver
simplify download_from_url method
rodzic
3af1a79645
commit
cd8b4cfec4
|
@ -41,15 +41,14 @@ class Archiver(Step):
|
|||
return mime.split("/")[0]
|
||||
return ""
|
||||
|
||||
def download_from_url(self, url: str, to_filename: str = None, item: Metadata = None) -> str:
|
||||
def download_from_url(self, url: str, to_filename: str = None) -> str:
|
||||
"""
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename, if item is present will use its tmp_dir
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
"""
|
||||
if not to_filename:
|
||||
to_filename = url.split('/')[-1].split('?')[0]
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
if item:
|
||||
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
|
|
|
@ -53,10 +53,10 @@ class TelegramArchiver(Archiver):
|
|||
|
||||
if not len(image_urls): return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url, item=item)))
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
else:
|
||||
video_url = video.get('src')
|
||||
m_video = Media(self.download_from_url(video_url, item=item))
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
|
|
|
@ -152,7 +152,7 @@ class TelethonArchiver(Archiver):
|
|||
if len(other_media_urls):
|
||||
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
|
||||
for i, om_url in enumerate(other_media_urls):
|
||||
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}', item)
|
||||
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
|
||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
||||
|
|
|
@ -90,7 +90,7 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
|
|||
continue
|
||||
logger.info(f"Found media {media}")
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
||||
result.add_media(media)
|
||||
|
||||
result.set_content(json.dumps({
|
||||
|
|
|
@ -80,7 +80,7 @@ class TwitterArchiver(Archiver):
|
|||
logger.warning(f"Could not get media URL of {tweet_media}")
|
||||
continue
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
||||
result.add_media(media)
|
||||
|
||||
return result.success("twitter-snscrape")
|
||||
|
@ -120,7 +120,7 @@ class TwitterArchiver(Archiver):
|
|||
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
||||
ext = mimetypes.guess_extension(mtype)
|
||||
|
||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
|
||||
result.add_media(media)
|
||||
|
||||
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
|
||||
|
|
|
@ -191,7 +191,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||
# if a link with better quality exists, try to download that
|
||||
if record_url_best_qual != record_url:
|
||||
try:
|
||||
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
|
||||
m.filename = self.download_from_url(record_url_best_qual, warc_fn)
|
||||
m.set("src", record_url_best_qual)
|
||||
m.set("src_alternative", record_url)
|
||||
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
|
||||
|
|
Ładowanie…
Reference in New Issue