Set up screenshot enricher to use authentication/cookies

pull/189/head
Patrick Robertson 2025-02-03 17:25:59 +01:00
parent 7ec328ab40
commit c574b694ed
11 changed files with 153 additions and 96 deletions

View file

@@ -4,6 +4,7 @@ from typing import Mapping, Any
 from abc import ABC
 from copy import deepcopy, copy
 from tempfile import TemporaryDirectory
+from auto_archiver.utils import url as UrlUtil
 from loguru import logger
@@ -98,8 +99,7 @@ class BaseModule(ABC):
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
-        # SECURITY: parse the domain using urllib
-        site = urlparse(site).netloc
+        site = UrlUtil.domain_for_url(site)
         # add the 'www' version of the site to the list of sites to check
         authdict = {}
@@ -117,11 +117,10 @@ class BaseModule(ABC):
                 did find information for '{key}' which is close, is this what you meant? \
                 If so, edit your authentication settings to make sure it exactly matches.")

         def get_ytdlp_cookiejar(args):
-            import yt_dlp
             from yt_dlp import parse_options
             logger.debug(f"Extracting cookies from settings: {args[1]}")
             # parse_options returns a named tuple; we only need the ydl_opts part
             # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
             ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
@@ -130,10 +129,12 @@ class BaseModule(ABC):
         # get the cookies jar, preferring browser cookies over the file
         if 'cookies_from_browser' in self.authentication:
             authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
-            authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
         elif 'cookies_file' in self.authentication:
             authdict['cookies_file'] = self.authentication['cookies_file']
-            authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
         return authdict
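
With this change `auth_for_site` only builds the yt-dlp cookie jar when the caller asks for it. A minimal sketch of how a calling module might consume the returned dict; the URL is illustrative, the key names come from the hunk above:

    # hypothetical caller of auth_for_site (keys per the hunk above)
    auth = self.auth_for_site("https://example.com/post/1", extract_cookies=True)
    if jar := auth.get('cookies_jar'):
        ...  # a cookie jar parsed by yt-dlp, ready for a webdriver or requests session
    elif cookies_file := auth.get('cookies_file'):
        ...  # raw file path, e.g. to pass straight through to yt-dlp as --cookies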

View file

@@ -174,7 +174,7 @@ class ArchivingOrchestrator:
             default={},
             action=AuthenticationJsonParseAction)
         # logging arguments
-        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
+        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
         parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
         parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
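
The `type=str.upper` addition makes the level flag case-insensitive: argparse applies `type` before validating against `choices`, so `--logging.level debug` is normalised to `DEBUG` and passes. A standalone check (hypothetical script, not part of this commit):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'],
                        default='INFO', type=str.upper)
    args = parser.parse_args(['--logging.level', 'debug'])
    print(getattr(args, 'logging.level'))  # -> DEBUG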

View file

@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException

 from auto_archiver.core import Enricher
-from ..utils import Webdriver, UrlUtil, random_str
+from ..utils import Webdriver, url as UrlUtil, random_str
 from ..core import Media, Metadata

 class ScreenshotEnricher(Enricher):

View file

@@ -274,7 +274,7 @@ class GenericExtractor(Extractor):
             "max_downloads": self.max_downloads, "playlistend": self.max_downloads}

         # set up auth
-        auth = self.auth_for_site(url)
+        auth = self.auth_for_site(url, extract_cookies=False)
         # order of importance: username/password -> api_key -> cookie -> cookie_from_browser -> cookies_file
         if auth:
             if 'username' in auth and 'password' in auth:
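
Here `extract_cookies=False` avoids building a cookie jar that yt-dlp would only rebuild itself; the extractor hands the raw cookie settings over instead. A hedged sketch of how the precedence comment might translate into the extractor's yt-dlp options dict (`ydl_options`); the assignments are illustrative, not the file's actual body:

    # illustrative mapping, following the precedence comment above
    if 'username' in auth and 'password' in auth:
        ydl_options['username'], ydl_options['password'] = auth['username'], auth['password']
    elif 'cookies_from_browser' in auth:
        ydl_options['cookiesfrombrowser'] = (auth['cookies_from_browser'],)  # yt-dlp expects a tuple
    elif 'cookies_file' in auth:
        ydl_options['cookiefile'] = auth['cookies_file']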

View file

@@ -5,7 +5,7 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.utils import UrlUtil
+from auto_archiver.utils import url as UrlUtil
 from auto_archiver.core.extractor import Extractor
 from .dropin import GenericDropin, InfoExtractor

View file

@@ -6,7 +6,7 @@ from selenium.common.exceptions import TimeoutException

 from auto_archiver.core import Enricher
-from auto_archiver.utils import Webdriver, UrlUtil, random_str
+from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
 from auto_archiver.core import Media, Metadata

 class ScreenshotEnricher(Enricher):

@@ -19,7 +19,9 @@ class ScreenshotEnricher(Enricher):
             return
         logger.debug(f"Enriching screenshot for {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
+        auth = self.auth_for_site(url)
+        with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
+                       http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
             try:
                 driver.get(url)
                 time.sleep(int(self.sleep_before_screenshot))

View file

@@ -7,7 +7,7 @@ from warcio.archiveiterator import ArchiveIterator

 from auto_archiver.core import Media, Metadata
 from auto_archiver.core import Extractor, Enricher
-from auto_archiver.utils import UrlUtil, random_str
+from auto_archiver.utils import url as UrlUtil, random_str

 class WaczExtractorEnricher(Enricher, Extractor):

View file

@@ -3,7 +3,7 @@ from loguru import logger
 import time, requests

 from auto_archiver.core import Extractor, Enricher
-from auto_archiver.utils import UrlUtil
+from auto_archiver.utils import url as UrlUtil
 from auto_archiver.core import Metadata

 class WaybackExtractorEnricher(Enricher, Extractor):

View file

@@ -2,7 +2,6 @@
 # we need to explicitly expose the available imports here
 from .misc import *
 from .webdriver import Webdriver
-from .url import UrlUtil
 from .atlos import get_atlos_config_options

 # handy utils from ytdlp

View file

@@ -1,83 +1,84 @@
 import re
 from urllib.parse import urlparse, urlunparse

-class UrlUtil:
-    AUTHWALL_URLS = [
-        re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-        re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
-    ]
+AUTHWALL_URLS = [
+    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+]

-    @staticmethod
-    def clean(url: str) -> str: return url
+def domain_for_url(url: str) -> str:
+    """
+    SECURITY: parse the domain using urllib to avoid any potential security issues
+    """
+    return urlparse(url).netloc

-    @staticmethod
-    def is_auth_wall(url: str) -> bool:
-        """
-        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
-        """
-        for regex in UrlUtil.AUTHWALL_URLS:
-            if regex.match(url):
-                return True
+def clean(url: str) -> str:
+    return url

-        return False
+def is_auth_wall(url: str) -> bool:
+    """
+    checks if a URL is behind an authentication wall, meaning steps like wayback, wacz, ... may not work
+    """
+    for regex in AUTHWALL_URLS:
+        if regex.match(url):
+            return True

-    @staticmethod
-    def remove_get_parameters(url: str) -> str:
-        # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
-        # useful for mimetypes to work
-        parsed_url = urlparse(url)
-        new_url = urlunparse(parsed_url._replace(query=''))
-        return new_url
+    return False

-    @staticmethod
-    def is_relevant_url(url: str) -> bool:
-        """
-        Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
-        """
-        clean_url = UrlUtil.remove_get_parameters(url)
+def remove_get_parameters(url: str) -> str:
+    # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
+    # useful for mimetypes to work
+    parsed_url = urlparse(url)
+    new_url = urlunparse(parsed_url._replace(query=''))
+    return new_url

-        # favicons
-        if "favicon" in url: return False
-        # ignore icons
-        if clean_url.endswith(".ico"): return False
-        # ignore SVGs
-        if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
+def is_relevant_url(url: str) -> bool:
+    """
+    Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
+    """
+    clean_url = remove_get_parameters(url)

-        # twitter profile pictures
-        if "twimg.com/profile_images" in url: return False
-        if "twimg.com" in url and "/default_profile_images" in url: return False
+    # favicons
+    if "favicon" in url: return False
+    # ignore icons
+    if clean_url.endswith(".ico"): return False
+    # ignore SVGs
+    if remove_get_parameters(url).endswith(".svg"): return False

-        # instagram profile pictures
-        if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
-        # instagram recurring images
-        if "https://static.cdninstagram.com/rsrc.php/" in url: return False
+    # twitter profile pictures
+    if "twimg.com/profile_images" in url: return False
+    if "twimg.com" in url and "/default_profile_images" in url: return False

-        # telegram
-        if "https://telegram.org/img/emoji/" in url: return False
+    # instagram profile pictures
+    if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
+    # instagram recurring images
+    if "https://static.cdninstagram.com/rsrc.php/" in url: return False

-        # youtube
-        if "https://www.youtube.com/s/gaming/emoji/" in url: return False
-        if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
-        if "https://www.youtube.com/s/search/audio/" in url: return False
+    # telegram
+    if "https://telegram.org/img/emoji/" in url: return False

-        # ok
-        if "https://ok.ru/res/i/" in url: return False
+    # youtube
+    if "https://www.youtube.com/s/gaming/emoji/" in url: return False
+    if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
+    if "https://www.youtube.com/s/search/audio/" in url: return False

-        # vk
-        if "https://vk.com/emoji/" in url: return False
-        if "vk.com/images/" in url: return False
-        if "vk.com/images/reaction/" in url: return False
+    # ok
+    if "https://ok.ru/res/i/" in url: return False

-        # wikipedia
-        if "wikipedia.org/static" in url: return False
+    # vk
+    if "https://vk.com/emoji/" in url: return False
+    if "vk.com/images/" in url: return False
+    if "vk.com/images/reaction/" in url: return False

-        return True
+    # wikipedia
+    if "wikipedia.org/static" in url: return False

-    @staticmethod
-    def twitter_best_quality_url(url: str) -> str:
-        """
-        some twitter image URLs point to a less-than best quality
-        this returns the URL pointing to the highest (original) quality
-        """
-        return re.sub(r"name=(\w+)", "name=orig", url, 1)
+    return True

+def twitter_best_quality_url(url: str) -> str:
+    """
+    some twitter image URLs point to a less-than best quality
+    this returns the URL pointing to the highest (original) quality
+    """
+    return re.sub(r"name=(\w+)", "name=orig", url, 1)
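
The refactor turns the `UrlUtil` static-method class into plain module functions; thanks to the aliased import (`from auto_archiver.utils import url as UrlUtil`) every call site keeps working unchanged. For example:

    from auto_archiver.utils import url as UrlUtil

    UrlUtil.domain_for_url("https://www.youtube.com/watch?v=abc")     # 'www.youtube.com'
    UrlUtil.is_auth_wall("https://www.instagram.com/p/xyz/")          # True
    UrlUtil.remove_get_parameters("http://example.com/file.mp4?t=1")  # 'http://example.com/file.mp4'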

View file

@@ -9,12 +9,72 @@ from loguru import logger
 from selenium.webdriver.common.by import By
 import time

+from urllib.parse import urlparse, urlunparse
+from http.cookiejar import MozillaCookieJar
+
+
+class CookieSettingDriver(webdriver.Firefox):
+    facebook_accept_cookies: bool
+    cookies: str
+    cookiejar: MozillaCookieJar
+
+    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+        super(CookieSettingDriver, self).__init__(*args, **kwargs)
+        self.cookies = cookies
+        self.cookiejar = cookiejar
+        self.facebook_accept_cookies = facebook_accept_cookies
+
+    def get(self, url: str):
+        if self.cookies or self.cookiejar:
+            # set up the driver to make it not 'cookie averse' (needs a context/URL)
+            # get the 'robots.txt' file which should be quick and easy
+            robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
+            super(CookieSettingDriver, self).get(robots_url)
+
+            if self.cookies:
+                # an explicit cookie is set for this site, use that first
+                for cookie in self.cookies.split(";"):
+                    name, value = cookie.split("=", 1)
+                    self.add_cookie({'name': name.strip(), 'value': value.strip()})
+            elif self.cookiejar:
+                domain = urlparse(url).netloc.removeprefix("www.")
+                for cookie in self.cookiejar:
+                    if domain in cookie.domain:
+                        try:
+                            self.add_cookie({
+                                'name': cookie.name,
+                                'value': cookie.value,
+                                'path': cookie.path,
+                                'domain': cookie.domain,
+                                'secure': bool(cookie.secure),
+                                'expiry': cookie.expires
+                            })
+                        except Exception as e:
+                            logger.warning(f"Failed to add cookie to webdriver: {e}")
+
+        if self.facebook_accept_cookies:
+            try:
+                logger.debug('Trying fb click accept cookie popup.')
+                super(CookieSettingDriver, self).get("http://www.facebook.com")
+                essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
+                essential_only.click()
+                logger.debug('fb click worked')
+                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+                time.sleep(2)
+            except Exception as e:
+                logger.warning(f'Failed on fb accept cookies: {e}')
+
+        # now get the actual URL
+        super(CookieSettingDriver, self).get(url)
+
+
 class Webdriver:
-    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
+    def __init__(self, width: int, height: int, timeout_seconds: int,
+                 facebook_accept_cookies: bool = False, http_proxy: str = "",
+                 print_options: dict = {}, auth: dict = {}) -> webdriver:
         self.width = width
         self.height = height
         self.timeout_seconds = timeout_seconds
+        self.auth = auth
         self.facebook_accept_cookies = facebook_accept_cookies
         self.http_proxy = http_proxy
         # create and set print options
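
In `CookieSettingDriver`, `cookiejar` is the `cookies_jar` built by `auth_for_site`; an equivalent jar can be loaded from a Netscape-format cookies.txt, which is the same format yt-dlp parses. A minimal sketch (the file path is illustrative):

    from http.cookiejar import MozillaCookieJar

    cookiejar = MozillaCookieJar('cookies.txt')                # illustrative path
    cookiejar.load(ignore_discard=True, ignore_expires=True)   # keep session cookies too
    # each cookie exposes .name/.value/.path/.domain/.secure/.expires,
    # the fields CookieSettingDriver copies into add_cookie above
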
@@ -23,30 +83,24 @@ class Webdriver:
             setattr(self.print_options, k, v)

     def __enter__(self) -> webdriver:
         options = webdriver.FirefoxOptions()
-        options.add_argument("--headless")
+        # options.add_argument("--headless")
         options.add_argument(f'--proxy-server={self.http_proxy}')
         options.set_preference('network.protocol-handler.external.tg', False)
+        # if the facebook cookie popup is present, force the browser to English since it's then easier to click the 'Decline optional cookies' option
         if self.facebook_accept_cookies:
             options.add_argument('--lang=en')

         try:
-            self.driver = webdriver.Firefox(options=options)
+            self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
+                                              facebook_accept_cookies=self.facebook_accept_cookies, options=options)
             self.driver.set_window_size(self.width, self.height)
             self.driver.set_page_load_timeout(self.timeout_seconds)
             self.driver.print_options = self.print_options
         except TimeoutException as e:
             logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

-        if self.facebook_accept_cookies:
-            try:
-                logger.debug('Trying fb click accept cookie popup.')
-                self.driver.get("http://www.facebook.com")
-                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
-                foo.click()
-                logger.debug('fb click worked')
-                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
-                time.sleep(2)
-            except:
-                logger.warning('Failed on fb accept cookies.')
-
         return self.driver

     def __exit__(self, exc_type, exc_val, exc_tb):
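
For reference, this is how the screenshot enricher now threads authentication through the context manager (condensed from the enricher hunk above; the window size and timeout values are illustrative):

    auth = self.auth_for_site(url)
    with Webdriver(1280, 720, 30, facebook_accept_cookies='facebook.com' in url,
                   http_proxy="", print_options={}, auth=auth) as driver:
        driver.get(url)  # CookieSettingDriver.get installs cookies before loading the page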