simplify download_from_url method

pull/129/head
msramalho 2024-02-15 11:30:49 +00:00
rodzic 3af1a79645
commit cd8b4cfec4
6 zmienionych plików z 10 dodań i 11 usunięć

Wyświetl plik

@@ -41,16 +41,15 @@ class Archiver(Step):
return mime.split("/")[0]
return ""
def download_from_url(self, url: str, to_filename: str = None, item: Metadata = None) -> str:
def download_from_url(self, url: str, to_filename: str = None) -> str:
"""
downloads a URL to provided filename, or inferred from URL, returns local filename, if item is present will use its tmp_dir
downloads a URL to provided filename, or inferred from URL, returns local filename
"""
if not to_filename:
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
to_filename = to_filename[-64:]
if item:
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}

Wyświetl plik

@@ -53,10 +53,10 @@ class TelegramArchiver(Archiver):
if not len(image_urls): return False
for img_url in image_urls:
result.add_media(Media(self.download_from_url(img_url, item=item)))
result.add_media(Media(self.download_from_url(img_url)))
else:
video_url = video.get('src')
m_video = Media(self.download_from_url(video_url, item=item))
m_video = Media(self.download_from_url(video_url))
# extract duration from HTML
try:
duration = s.find_all('time')[0].contents[0]

Wyświetl plik

@@ -152,7 +152,7 @@ class TelethonArchiver(Archiver):
if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
for i, om_url in enumerate(other_media_urls):
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}', item)
filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))

Wyświetl plik

@@ -90,7 +90,7 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
continue
logger.info(f"Found media {media}")
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
result.add_media(media)
result.set_content(json.dumps({

Wyświetl plik

@@ -80,7 +80,7 @@ class TwitterArchiver(Archiver):
logger.warning(f"Could not get media URL of {tweet_media}")
continue
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
result.add_media(media)
return result.success("twitter-snscrape")
@@ -120,7 +120,7 @@ class TwitterArchiver(Archiver):
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))

Wyświetl plik

@@ -191,7 +191,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
# if a link with better quality exists, try to download that
if record_url_best_qual != record_url:
try:
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
m.filename = self.download_from_url(record_url_best_qual, warc_fn)
m.set("src", record_url_best_qual)
m.set("src_alternative", record_url)
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")