Adds configurable download limits to the Instagram and YouTube archivers

pull/137/head v0.9.11
msramalho 2024-02-25 15:14:17 +00:00
rodzic 7de317d1b5
commit ccf5f857ef
3 zmienionych plików z 30 dodań i 12 usunięć

Wyświetl plik

@ -22,6 +22,7 @@ class InstagramAPIArchiver(Archiver):
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
@ -33,6 +34,7 @@ class InstagramAPIArchiver(Archiver):
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
"full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
"minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
}
@ -117,16 +119,7 @@ class InstagramAPIArchiver(Archiver):
# download all highlights
try:
count_highlights = 0
highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
for h in highlights:
try:
h_info = self._download_highlights_reusable(result, h.get("pk"))
count_highlights += len(h_info.get("items", []))
except Exception as e:
result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
result.set("#highlights", count_highlights)
self.download_all_highlights(result, username, user_id)
except Exception as e:
result.append("errors", f"Error downloading highlights for {username}")
logger.error(f"Error downloading highlights for {username}: {e}")
@ -135,6 +128,21 @@ class InstagramAPIArchiver(Archiver):
result.set_url(url) # reset as scrape_item modifies it
return result.success("insta profile")
def download_all_highlights(self, result, username, user_id):
    """
    Download all of a user's highlight reels into `result`.

    Honors `self.full_profile_max_posts` as a SOFT limit (0 means unlimited):
    because highlight items arrive in batches, the loop only checks the count
    after each highlight is processed, so the final count may overshoot.

    :param result: Metadata object collecting downloaded media and errors.
    :param username: username, used only for human-readable error messages.
    :param user_id: numeric user id passed to the API.
    """
    count_highlights = 0
    # no placeholders in the path, so a plain string literal suffices (was a pointless f-string)
    highlights = self.call_api("v1/user/highlights", {"user_id": user_id})
    for h in highlights:
        try:
            h_info = self._download_highlights_reusable(result, h.get("pk"))
            count_highlights += len(h_info.get("items", []))
        except Exception as e:
            # record the failure on the result but keep downloading the remaining highlights
            result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
            logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
        # soft limit: evaluated after each highlight, mirroring the posts/tagged loops
        if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
            logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
            break
    result.set("#highlights", count_highlights)
def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
if id:
post = self.call_api(f"v1/media/by/id", {"id": id})
@ -211,6 +219,9 @@ class InstagramAPIArchiver(Archiver):
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
pbar.update(1)
post_count+=1
if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
break
result.set("#posts", post_count)
def download_all_tagged(self, result: Metadata, user_id: str):
@ -233,6 +244,9 @@ class InstagramAPIArchiver(Archiver):
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
pbar.update(1)
tagged_count+=1
if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
break
result.set("#tagged", tagged_count)

Wyświetl plik

@ -15,6 +15,8 @@ class YoutubeDLArchiver(Archiver):
self.livestreams = bool(self.livestreams)
self.live_from_start = bool(self.live_from_start)
self.end_means_success = bool(self.end_means_success)
self.allow_playlist = bool(self.allow_playlist)
self.max_downloads = self.max_downloads
@staticmethod
def configs() -> dict:
@ -26,6 +28,8 @@ class YoutubeDLArchiver(Archiver):
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
}
def download(self, item: Metadata) -> Metadata:
@ -35,7 +39,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
try:

Wyświetl plik

@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "9"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "10"
_PATCH = "11"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""