kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge pull request #235 from bellingcat/instagram_extractor_bugfix
Instagram extractor bugfix: - Fix typo from config changes - Add warning message to documentation to alert users that the module is not being maintained. pull/238/head
commit
8ae3d9c031
|
@ -10,25 +10,30 @@
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"configs": {
|
"configs": {
|
||||||
"username": {"required": True,
|
"username": {"required": True,
|
||||||
"help": "a valid Instagram username"},
|
"help": "A valid Instagram username."},
|
||||||
"password": {
|
"password": {
|
||||||
"required": True,
|
"required": True,
|
||||||
"help": "the corresponding Instagram account password",
|
"help": "The corresponding Instagram account password.",
|
||||||
},
|
},
|
||||||
"download_folder": {
|
"download_folder": {
|
||||||
"default": "instaloader",
|
"default": "instaloader",
|
||||||
"help": "name of a folder to temporarily download content to",
|
"help": "Name of a folder to temporarily download content to.",
|
||||||
},
|
},
|
||||||
"session_file": {
|
"session_file": {
|
||||||
"default": "secrets/instaloader.session",
|
"default": "secrets/instaloader.session",
|
||||||
"help": "path to the instagram session which saves session credentials",
|
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
|
||||||
},
|
},
|
||||||
# TODO: fine-grain
|
# TODO: fine-grain
|
||||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
|
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
|
||||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
|
||||||
|
> ⚠️ **Warning**
|
||||||
|
> This module is not actively maintained due to known issues with blocking.
|
||||||
|
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
|
||||||
|
|
||||||
|
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||||
highlights, and tagged posts.
|
highlights, and tagged posts.
|
||||||
Authentication is required via username/password or a session file.
|
Authentication is required via username/password or a session file.
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import re, os, shutil, traceback
|
import re, os, shutil
|
||||||
import instaloader
|
import instaloader
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
|
||||||
"""
|
"""
|
||||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# NB: post regex should be tested before profile
|
# NB: post regex should be tested before profile
|
||||||
|
|
||||||
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
||||||
|
|
||||||
# https://regex101.com/r/MGPquX/1
|
# https://regex101.com/r/MGPquX/1
|
||||||
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
||||||
# https://regex101.com/r/6Wbsxa/1
|
# https://regex101.com/r/6Wbsxa/1
|
||||||
|
@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
|
||||||
self.insta = instaloader.Instaloader(
|
self.insta = instaloader.Instaloader(
|
||||||
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
download_geotags=True,
|
||||||
|
download_comments=True,
|
||||||
|
compress_json=False,
|
||||||
|
dirname_pattern=self.download_folder,
|
||||||
|
filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
self.insta.load_session_from_file(self.username, self.session_file)
|
self.insta.load_session_from_file(self.username, self.session_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
|
|
||||||
try:
|
try:
|
||||||
self.insta.login(self.username, config.instagram_self.password)
|
logger.debug(f"Session file failed", exc_info=True)
|
||||||
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
|
logger.info("No valid session file found - Attempting login with use and password.")
|
||||||
|
self.insta.login(self.username, self.password)
|
||||||
self.insta.save_session_to_file(self.session_file)
|
self.insta.save_session_to_file(self.session_file)
|
||||||
except Exception as e2:
|
except Exception as e:
|
||||||
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
|
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
|
|
|
@ -1,12 +1,25 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from auto_archiver.modules.instagram_extractor import InstagramExtractor
|
from auto_archiver.modules.instagram_extractor import InstagramExtractor
|
||||||
from .test_extractor_base import TestExtractorBase
|
|
||||||
|
|
||||||
class TestInstagramExtractor(TestExtractorBase):
|
|
||||||
|
@pytest.fixture
|
||||||
|
def instagram_extractor(setup_module, mocker):
|
||||||
|
|
||||||
extractor_module: str = 'instagram_extractor'
|
extractor_module: str = 'instagram_extractor'
|
||||||
config: dict = {}
|
config: dict = {
|
||||||
|
"username": "user_name",
|
||||||
|
"password": "password123",
|
||||||
|
"download_folder": "instaloader",
|
||||||
|
"session_file": "secrets/instaloader.session",
|
||||||
|
}
|
||||||
|
fake_loader = mocker.MagicMock()
|
||||||
|
fake_loader.load_session_from_file.return_value = None
|
||||||
|
fake_loader.login.return_value = None
|
||||||
|
fake_loader.save_session_to_file.return_value = None
|
||||||
|
mocker.patch("instaloader.Instaloader", return_value=fake_loader,)
|
||||||
|
return setup_module(extractor_module, config)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("url", [
|
@pytest.mark.parametrize("url", [
|
||||||
"https://www.instagram.com/p/",
|
"https://www.instagram.com/p/",
|
||||||
|
@ -16,6 +29,8 @@ class TestInstagramExtractor(TestExtractorBase):
|
||||||
"https://www.instagram.com/username/stories/",
|
"https://www.instagram.com/username/stories/",
|
||||||
"https://www.instagram.com/username/highlights/",
|
"https://www.instagram.com/username/highlights/",
|
||||||
])
|
])
|
||||||
def test_regex_matches(self, url):
|
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
|
||||||
# post
|
"""
|
||||||
assert InstagramExtractor.valid_url.match(url)
|
Ensure that the valid_url regex matches all provided Instagram URLs.
|
||||||
|
"""
|
||||||
|
assert instagram_extractor.valid_url.match(url)
|
Ładowanie…
Reference in New Issue