diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index 05cae19..c9b479a 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -10,25 +10,30 @@ "requires_setup": True, "configs": { "username": {"required": True, - "help": "a valid Instagram username"}, + "help": "A valid Instagram username."}, "password": { "required": True, - "help": "the corresponding Instagram account password", + "help": "The corresponding Instagram account password.", }, "download_folder": { "default": "instaloader", - "help": "name of a folder to temporarily download content to", + "help": "Name of a folder to temporarily download content to.", }, "session_file": { "default": "secrets/instaloader.session", - "help": "path to the instagram session which saves session credentials", + "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.", }, # TODO: fine-grain # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, }, "description": """ - Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts - and user profiles, downloading as much information as possible, including images, videos, text, stories, + Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. + + > ⚠️ **Warning** + > This module is not actively maintained due to known issues with blocking. + > Prioritise usage of the `instagram_tbot_extractor` and `instagram_api_extractor`. + + This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories, highlights, and tagged posts. 
Authentication is required via username/password or a session file. diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 7ae3b01..7e195ad 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -4,8 +4,6 @@ """ import re, os, shutil -from sys import exc_info - import instaloader from loguru import logger @@ -17,10 +15,9 @@ class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ + # NB: post regex should be tested before profile - valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/") - # https://regex101.com/r/MGPquX/1 post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url)) # https://regex101.com/r/6Wbsxa/1 @@ -38,19 +35,14 @@ class InstagramExtractor(Extractor): ) try: self.insta.load_session_from_file(self.username, self.session_file) - except FileNotFoundError: - logger.info("No existing session file found - Attempting login with use and password.") + except Exception: try: + logger.opt(exception=True).debug("Session file failed") + logger.info("No valid session file found - Attempting login with username and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) except Exception as e: - logger.error(f"Failed to log in with Instaloader: {e}") - # TODO raise exception? - # raise Exception(f"Failed to log in with Instaloader: {e}") - except Exception as e: - logger.error(f"Error loading session file: {e}") - # TODO raise exception? - # raise Exception(f"Error loading session file: {e}") + logger.error(f"Failed to setup Instagram Extractor with Instaloader. 
{e}") def download(self, item: Metadata) -> Metadata: diff --git a/tests/extractors/test_instagram_extractor.py b/tests/extractors/test_instagram_extractor.py index 97549b8..647cab4 100644 --- a/tests/extractors/test_instagram_extractor.py +++ b/tests/extractors/test_instagram_extractor.py @@ -1,11 +1,10 @@ import pytest from auto_archiver.modules.instagram_extractor import InstagramExtractor -from .test_extractor_base import TestExtractorBase @pytest.fixture -def intsagram_extractor(setup_module): +def instagram_extractor(setup_module, mocker): extractor_module: str = 'instagram_extractor' config: dict = { @@ -14,11 +13,14 @@ def intsagram_extractor(setup_module): "download_folder": "instaloader", "session_file": "secrets/instaloader.session", } + fake_loader = mocker.MagicMock() + fake_loader.load_session_from_file.return_value = None + fake_loader.login.return_value = None + fake_loader.save_session_to_file.return_value = None + mocker.patch("instaloader.Instaloader", return_value=fake_loader,) return setup_module(extractor_module, config) - - @pytest.mark.parametrize("url", [ "https://www.instagram.com/p/", "https://www.instagram.com/p/1234567890/", @@ -27,6 +29,8 @@ def intsagram_extractor(setup_module): "https://www.instagram.com/username/stories/", "https://www.instagram.com/username/highlights/", ]) -def test_regex_matches(url, instagram_extractor): - # post - assert instagram_extractor.valid_url.match(url) +def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None: + """ + Ensure that the valid_url regex matches all provided Instagram URLs. + """ + assert instagram_extractor.valid_url.match(url) \ No newline at end of file