Mirror of https://github.com/bellingcat/auto-archiver
commit 01516724d3
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.7"
+version = "0.13.8"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"

@@ -71,7 +71,16 @@ class BaseModule(ABC):
         :param site: the domain of the site to get authentication information for
         :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        :returns: authdict dict of login information for the given site
+        :returns: authdict dict -> {
+            "username": str,
+            "password": str,
+            "api_key": str,
+            "api_secret": str,
+            "cookie": str,
+            "cookies_file": str,
+            "cookies_from_browser": str,
+            "cookies_jar": CookieJar
+        }
+
+        **Global options:**\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox') - uses ytdlp under the hood to extract\n
@@ -85,6 +94,7 @@ class BaseModule(ABC):
         * cookie: str - a cookie string to use for login (specific to this site)\n
         * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
+        * cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
 
         """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

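For context, a minimal sketch of how a caller might consume the authdict documented above. Here `module`, `session`, and `login` are hypothetical stand-ins, not names from the library:

auth = module.auth_for_site("example.com", extract_cookies=True)

if auth.get("cookies_jar"):
    session.cookies = auth["cookies_jar"]        # a CookieJar, ready to attach to an HTTP session
elif auth.get("cookie"):
    session.headers["Cookie"] = auth["cookie"]   # raw cookie string scoped to this site
elif auth.get("username") and auth.get("password"):
    login(auth["username"], auth["password"])    # hypothetical credential-based login
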
@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
     # TODO: links to stories
 
     def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
         self.insta = instaloader.Instaloader(
             download_geotags=True,
             download_comments=True,

@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
 
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
         logger.debug(f"Enriching screenshot for {url=}")
         auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                    Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
 
         with self.webdriver_factory(
             self.width,
             self.height,

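Since the gate above only passes cookie-type credentials, a user who wants screenshots behind an auth wall needs one of the cookie keys in their authentication config. A minimal sketch of such a dict with placeholder values (the on-disk config format may differ; the site-to-dict shape follows the tests further down):

authentication = {
    "instagram.com": {
        "cookies_file": "secrets/instagram_cookies.txt",  # Netscape-format cookies file
    },
    "x.com,twitter.com": {
        "cookies_from_browser": "firefox",  # extract this browser's cookies via ytdlp
    },
}
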
@@ -4,8 +4,8 @@ from ipaddress import ip_address
 
 
 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]

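The two `?` quantifiers broaden the patterns to plain-http t.me links and to instagram.com without the www. prefix. A standalone check of the new patterns (the helper below is an illustrative re-implementation, not the library's own is_auth_wall):

import re

AUTHWALL_URLS = [
    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
]

def behind_auth_wall(url: str) -> bool:
    return any(pattern.match(url) for pattern in AUTHWALL_URLS)

assert behind_auth_wall("https://instagram.com")        # now matches without "www."
assert behind_auth_wall("http://t.me/c/channel/123")    # now matches plain http
assert not behind_auth_wall("https://t.me/public/123")  # public t.me links still pass
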
@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
     """
     clean_url = remove_get_parameters(url)
 
-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]
 
-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]
 
-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False
 
-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    for parts in IRRELEVANT_URLS:
+        if all(part in clean_url for part in parts):
+            return False
 
     return True

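The refactor turns the if-chain into data: each tuple in IRRELEVANT_URLS is an AND-group, so a URL is dropped when every part of some tuple appears in it. A self-contained sketch of just that matching rule, trimmed to two entries for brevity:

IRRELEVANT_URLS = [
    ("twimg.com", "default_profile_images"),  # both substrings must appear
    ("favicon",),                             # single-element tuple: one substring suffices
]

def is_relevant(url: str) -> bool:
    # the real is_relevant_url also strips GET parameters and checks extensions
    return not any(all(part in url for part in parts) for parts in IRRELEVANT_URLS)

assert not is_relevant("https://twimg.com/x/default_profile_images/y.png")
assert is_relevant("https://twimg.com/media/photo.jpg")  # only one part of the pair matches
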
@@ -22,35 +22,35 @@ from loguru import logger
 
 class CookieSettingDriver(webdriver.Firefox):
     facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar
 
-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
         if os.environ.get("RUNNING_IN_DOCKER"):
             # Selenium doesn't support linux-aarch64 driver, we need to set this manually
             kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
 
         super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
         self.facebook_accept_cookies = facebook_accept_cookies
 
     def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
             # set up the driver to make it not 'cookie averse' (needs a context/URL)
             # get the 'robots.txt' file which should be quick and easy
             robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
             super(CookieSettingDriver, self).get(robots_url)
 
-            if self.cookies:
+            if self.cookie:
                 # an explicit cookie is set for this site, use that first
                 for cookie in self.cookies.split(";"):
                     for name, value in cookie.split("="):
                         self.driver.add_cookie({"name": name, "value": value})
-            elif self.cookiejar:
-                domain = urlparse(url).netloc
+            elif self.cookie_jar:
+                domain = urlparse(url).netloc.removeprefix("www.")
                 regex = re.compile(f"(www)?.?{domain}$")
-                for cookie in self.cookiejar:
+                for cookie in self.cookie_jar:
                     if regex.match(cookie.domain):
                         try:
                             self.add_cookie(

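The removeprefix("www.") change, paired with the existing (www)? in the regex, makes domain matching tolerant of cookie-jar entries stored with or without the www. prefix. A standalone stdlib check of that matching logic:

import re
from urllib.parse import urlparse

url = "https://www.example.com/post/1"
domain = urlparse(url).netloc.removeprefix("www.")  # "example.com"
regex = re.compile(f"(www)?.?{domain}$")

# cookie domains as they commonly appear in a MozillaCookieJar
assert regex.match("www.example.com")
assert regex.match(".example.com")
assert regex.match("example.com")
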
@@ -145,8 +145,8 @@ class Webdriver:
 
         try:
             self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                 facebook_accept_cookies=self.facebook_accept_cookies,
                 options=options,
             )

@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
     screenshot_enricher.enrich(metadata_with_video)
     mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
         facebook_accept_cookies=False,
         options=mock_options_instance,
     )

@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
     assert metadata_with_video.media[1].properties.get("id") == "screenshot"
 
 
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
+        assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+
+    # patch the authentication dict:
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    mock_driver, mock_driver_class, _ = mock_selenium_env
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
 def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env

@@ -0,0 +1,113 @@
+import pytest
+
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if raises:
+        with pytest.raises(ValueError):
+            check_url_or_raise(url)
+    else:
+        assert check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality