diff --git a/pyproject.toml b/pyproject.toml
index 89bd4eb..6720ce8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.7"
+version = "0.13.8"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 requires-python = ">=3.10,<3.13"
 
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index 642b8ee..bcaa59b 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -71,7 +71,16 @@ class BaseModule(ABC):
         :param site: the domain of the site to get authentication information for
         :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        :returns: authdict dict of login information for the given site
+        :returns: authdict dict -> {
+            "username": str,
+            "password": str,
+            "api_key": str,
+            "api_secret": str,
+            "cookie": str,
+            "cookies_file": str,
+            "cookies_from_browser": str,
+            "cookies_jar": CookieJar
+        }
 
         **Global options:**\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
@@ -85,6 +94,7 @@ class BaseModule(ABC):
         * cookie: str - a cookie string to use for login (specific to this site)\n
         * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (specitic for this site)\n
+
         """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
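
For quick reference, a minimal sketch (not part of the diff) of how a module can consume the authdict documented in the docstring above. The field names come from the new docstring; the helper name and the fallback order are illustrative assumptions.

def pick_login_method(module, url: str) -> str:
    # auth_for_site() returns a dict with the fields documented above; empty when nothing is configured for this site
    auth = module.auth_for_site(url)
    if auth.get("api_key") and auth.get("api_secret"):
        return "api"
    if auth.get("cookie") or auth.get("cookies_file") or auth.get("cookies_from_browser") or auth.get("cookies_jar"):
        return "cookies"
    if auth.get("username") and auth.get("password"):
        return "credentials"
    return "anonymous"
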
diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
index 294b4e7..d559c47 100644
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
     # TODO: links to stories
 
     def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
         self.insta = instaloader.Instaloader(
             download_geotags=True,
             download_comments=True,
diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
index 491bd51..4e01357 100644
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
 
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
         logger.debug(f"Enriching screenshot for {url=}")
         auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                        Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
         with self.webdriver_factory(
             self.width,
             self.height,
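
Below is a minimal sketch (not part of the diff) of the cookie-only gating that enrich() now applies, pulled out as a standalone predicate; the function name and the sample asserts are illustrative.

def can_screenshot(url_is_auth_walled: bool, auth: dict) -> bool:
    # selenium can only log in by injecting cookies, so username/password or API keys don't help here
    has_cookie_auth = bool(auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
    return not url_is_auth_walled or has_cookie_auth


assert can_screenshot(False, {}) is True
assert can_screenshot(True, {}) is False
assert can_screenshot(True, {"username": "user", "password": "pass"}) is False
assert can_screenshot(True, {"cookie": "sessionid=abc"}) is True
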
diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py
index 169ed87..368d93c 100644
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -4,8 +4,8 @@ from ipaddress import ip_address
 
 
 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]
 
 
@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
     """
     clean_url = remove_get_parameters(url)
 
-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]
 
-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]
 
-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False
 
-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    for parts in IRRELEVANT_URLS:
+        if all(part in clean_url for part in parts):
+            return False
 
     return True
 
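
A minimal sketch (not part of the diff) of the tuple-matching rule that replaces the chain of ifs above: a URL is treated as irrelevant when every element of any tuple is a substring of the cleaned URL. The two tuples are copied from IRRELEVANT_URLS; the helper name is illustrative.

IRRELEVANT_URLS = [
    ("favicon",),                             # single-part tuple: one substring match is enough
    ("twimg.com", "default_profile_images"),  # multi-part tuple: all parts must appear
]


def matches_any(clean_url: str) -> bool:
    return any(all(part in clean_url for part in parts) for parts in IRRELEVANT_URLS)


assert matches_any("https://example.com/favicon.ico")
assert matches_any("https://twimg.com/x/default_profile_images/y.png")
assert not matches_any("https://twimg.com/media/photo.jpg")  # only one part of the second tuple matches
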
diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py
index 43e7817..bd47f9d 100644
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -22,35 +22,35 @@ from loguru import logger
 
 class CookieSettingDriver(webdriver.Firefox):
     facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar
 
-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
         if os.environ.get("RUNNING_IN_DOCKER"):
             # Selenium doesn't support linux-aarch64 driver, we need to set this manually
             kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
 
         super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
         self.facebook_accept_cookies = facebook_accept_cookies
 
     def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
            super(CookieSettingDriver, self).get(robots_url)
 
-        if self.cookies:
+        if self.cookie:
             # an explicit cookie is set for this site, use that first
-            for cookie in self.cookies.split(";"):
-                for name, value in cookie.split("="):
-                    self.driver.add_cookie({"name": name, "value": value})
-        elif self.cookiejar:
-            domain = urlparse(url).netloc
+            for cookie in self.cookie.split(";"):
+                name, value = cookie.split("=", 1)
+                self.add_cookie({"name": name.strip(), "value": value.strip()})
+        elif self.cookie_jar:
+            domain = urlparse(url).netloc.removeprefix("www.")
             regex = re.compile(f"(www)?.?{domain}$")
-            for cookie in self.cookiejar:
+            for cookie in self.cookie_jar:
                 if regex.match(cookie.domain):
                     try:
                         self.add_cookie(
@@ -145,8 +145,8 @@ class Webdriver:
 
         try:
             self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                 facebook_accept_cookies=self.facebook_accept_cookies,
                 options=options,
             )
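
A minimal sketch (not part of the diff) of the header-style cookie parsing CookieSettingDriver performs above: a "name=value; name2=value2" string becomes one add_cookie() dict per pair, splitting each pair only on the first '=' so values may themselves contain '='. The helper name is illustrative.

def parse_cookie_string(cookie: str) -> list[dict]:
    parsed = []
    for pair in cookie.split(";"):
        name, value = pair.split("=", 1)  # split on the first '=' only
        parsed.append({"name": name.strip(), "value": value.strip()})
    return parsed


assert parse_cookie_string("sessionid=abc; csrftoken=x=y") == [
    {"name": "sessionid", "value": "abc"},
    {"name": "csrftoken", "value": "x=y"},
]
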
diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py
index b86bb17..ec56345 100644
--- a/tests/enrichers/test_screenshot_enricher.py
+++ b/tests/enrichers/test_screenshot_enricher.py
@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
     screenshot_enricher.enrich(metadata_with_video)
     mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
         facebook_accept_cookies=False,
         options=mock_options_instance,
     )
@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
     assert metadata_with_video.media[1].properties.get("id") == "screenshot"
 
 
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
+        assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+
+    # patch the authentication dict:
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    mock_driver, mock_driver_class, _ = mock_selenium_env
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
 def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
 
diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py
new file mode 100644
index 0000000..7871847
--- /dev/null
+++ b/tests/utils/test_urls.py
@@ -0,0 +1,113 @@
+import pytest
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if raises:
+        with pytest.raises(ValueError):
+            check_url_or_raise(url)
+    else:
+        assert check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality
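
The touched tests can be run on their own; a usage sketch (not part of the diff), assuming it is invoked from the repository root:

# run only the tests affected by this change
import pytest

pytest.main(["-q", "tests/enrichers/test_screenshot_enricher.py", "tests/utils/test_urls.py"])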