simplify download_from_url method: drop the `item` parameter and always save into the ArchivingContext tmp dir

2024-02-15 11:30:49 +00:00 · 2024-02-15 11:30:49 +00:00 · cd8b4cfec4
commit cd8b4cfec4
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@ -41,15 +41,14 @@ class Archiver(Step):
            return mime.split("/")[0]
        return ""

-    def download_from_url(self, url: str, to_filename: str = None, item: Metadata = None) -> str:
+    def download_from_url(self, url: str, to_filename: str = None) -> str:
        """
-        downloads a URL to provided filename, or inferred from URL, returns local filename, if item is present will use its tmp_dir
+        downloads a URL to provided filename, or inferred from URL, returns local filename
        """
        if not to_filename:
            to_filename = url.split('/')[-1].split('?')[0]
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
-        if item:
        to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
--- a/src/auto_archiver/archivers/telegram_archiver.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@ -53,10 +53,10 @@ class TelegramArchiver(Archiver):

            if not len(image_urls): return False
            for img_url in image_urls:
-                result.add_media(Media(self.download_from_url(img_url, item=item)))
+                result.add_media(Media(self.download_from_url(img_url)))
        else:
            video_url = video.get('src')
-            m_video = Media(self.download_from_url(video_url, item=item))
+            m_video = Media(self.download_from_url(video_url))
            # extract duration from HTML
            try:
                duration = s.find_all('time')[0].contents[0]
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@ -152,7 +152,7 @@ class TelethonArchiver(Archiver):
                    if len(other_media_urls):
                        logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                    for i, om_url in enumerate(other_media_urls):
-                        filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}', item)
+                        filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
                        result.add_media(Media(filename=filename), id=f"{group_id}_{i}")

                filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
--- a/src/auto_archiver/archivers/twitter_api_archiver.py
+++ b/src/auto_archiver/archivers/twitter_api_archiver.py
@ -90,7 +90,7 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
                    continue
                logger.info(f"Found media {media}")
                ext = mimetypes.guess_extension(mimetype)
-                media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+                media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
                result.add_media(media)

        result.set_content(json.dumps({
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@ -80,7 +80,7 @@ class TwitterArchiver(Archiver):
                logger.warning(f"Could not get media URL of {tweet_media}")
                continue
            ext = mimetypes.guess_extension(mimetype)
-            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
            result.add_media(media)

        return result.success("twitter-snscrape")
@ -120,7 +120,7 @@ class TwitterArchiver(Archiver):
            if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
                ext = mimetypes.guess_extension(mtype)

-            media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
+            media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
            result.add_media(media)

        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@ -191,7 +191,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
                # if a link with better quality exists, try to download that
                if record_url_best_qual != record_url:
                    try:
-                        m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
+                        m.filename = self.download_from_url(record_url_best_qual, warc_fn)
                        m.set("src", record_url_best_qual)
                        m.set("src_alternative", record_url)
                    except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")