Merge pull request #235 from bellingcat/instagram_extractor_bugfix

Instagram extractor bugfix:
- Fix typo from config changes
- Add a warning to the documentation noting that the module is not actively maintained.
pull/238/head
Erin Clark 2025-03-07 15:02:05 +00:00 zatwierdzone przez GitHub
commit 8ae3d9c031
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
3 zmienionych plików z 52 dodań i 30 usunięć

Wyświetl plik

@ -10,25 +10,30 @@
"requires_setup": True, "requires_setup": True,
"configs": { "configs": {
"username": {"required": True, "username": {"required": True,
"help": "a valid Instagram username"}, "help": "A valid Instagram username."},
"password": { "password": {
"required": True, "required": True,
"help": "the corresponding Instagram account password", "help": "The corresponding Instagram account password.",
}, },
"download_folder": { "download_folder": {
"default": "instaloader", "default": "instaloader",
"help": "name of a folder to temporarily download content to", "help": "Name of a folder to temporarily download content to.",
}, },
"session_file": { "session_file": {
"default": "secrets/instaloader.session", "default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials", "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
}, },
# TODO: fine-grain # TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
}, },
"description": """ "description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
and user profiles, downloading as much information as possible, including images, videos, text, stories,
> **Warning**
> This module is not actively maintained due to known issues with blocking.
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts. highlights, and tagged posts.
Authentication is required via username/password or a session file. Authentication is required via username/password or a session file.

Wyświetl plik

@ -3,7 +3,7 @@
highlights, and tagged posts. Authentication is required via username/password or a session file. highlights, and tagged posts. Authentication is required via username/password or a session file.
""" """
import re, os, shutil, traceback import re, os, shutil
import instaloader import instaloader
from loguru import logger from loguru import logger
@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
""" """
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
""" """
# NB: post regex should be tested before profile # NB: post regex should be tested before profile
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/") valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
# https://regex101.com/r/MGPquX/1 # https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url)) post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1 # https://regex101.com/r/6Wbsxa/1
@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
def setup(self) -> None: def setup(self) -> None:
self.insta = instaloader.Instaloader( self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" download_geotags=True,
download_comments=True,
compress_json=False,
dirname_pattern=self.download_folder,
filename_pattern="{date_utc}_UTC_{target}__{typename}"
) )
try: try:
self.insta.load_session_from_file(self.username, self.session_file) self.insta.load_session_from_file(self.username, self.session_file)
except Exception as e: except Exception as e:
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
try: try:
self.insta.login(self.username, config.instagram_self.password) logger.debug(f"Session file failed", exc_info=True)
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 logger.info("No valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file) self.insta.save_session_to_file(self.session_file)
except Exception as e2: except Exception as e:
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:

Wyświetl plik

@ -1,12 +1,25 @@
import pytest import pytest
from auto_archiver.modules.instagram_extractor import InstagramExtractor from auto_archiver.modules.instagram_extractor import InstagramExtractor
from .test_extractor_base import TestExtractorBase
class TestInstagramExtractor(TestExtractorBase):
@pytest.fixture
def instagram_extractor(setup_module, mocker):
extractor_module: str = 'instagram_extractor' extractor_module: str = 'instagram_extractor'
config: dict = {} config: dict = {
"username": "user_name",
"password": "password123",
"download_folder": "instaloader",
"session_file": "secrets/instaloader.session",
}
fake_loader = mocker.MagicMock()
fake_loader.load_session_from_file.return_value = None
fake_loader.login.return_value = None
fake_loader.save_session_to_file.return_value = None
mocker.patch("instaloader.Instaloader", return_value=fake_loader,)
return setup_module(extractor_module, config)
@pytest.mark.parametrize("url", [ @pytest.mark.parametrize("url", [
"https://www.instagram.com/p/", "https://www.instagram.com/p/",
@ -16,6 +29,8 @@ class TestInstagramExtractor(TestExtractorBase):
"https://www.instagram.com/username/stories/", "https://www.instagram.com/username/stories/",
"https://www.instagram.com/username/highlights/", "https://www.instagram.com/username/highlights/",
]) ])
def test_regex_matches(self, url): def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
# post """
assert InstagramExtractor.valid_url.match(url) Ensure that the valid_url regex matches all provided Instagram URLs.
"""
assert instagram_extractor.valid_url.match(url)