v1.0.1 dependency updates, generic extractor improvements (#307)

* wacz: allow exceptional cases where more than one resource image is available

* improves generic extractor edge-case handling and updates yt-dlp

* REMOVES vk_extractor until further notice

* bumps browsertrix-crawler in the Docker image

* npm version bumps in scripts/settings

* poetry updates

* changes the log level of the gsheet_feeder_db started message from warning to info (#301)

* closes #305 and further fixes locating local downloads from uncommon yt-dlp extractors

* use ffmpeg -bitexact so re-downloads of the same video hash identically, reducing duplicate content in storage

* formatting

* adds the yt-dlp curl-cffi extra

* version bump

* linting

---------

Co-authored-by: Dave Mateer <davemateer@gmail.com>
pull/308/head v1.0.1
Miguel Sozinho Ramalho 2025-06-02 20:57:12 +01:00 committed by GitHub
parent 48be13fb2a
commit 6735fa890b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1196 additions and 1140 deletions


@@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.5.8 AS base
FROM webrecorder/browsertrix-crawler:1.6.1 AS base
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \


@@ -71,7 +71,6 @@ The names of the actual modules have also changed, so for any extractor modules
- `telethon_archiver` → `telethon_extractor`
- `wacz_archiver_enricher` → `wacz_extractor_enricher`
- `wayback_archiver_enricher` → `wayback_extractor_enricher`
- `vk_archiver` → `vk_extractor`
#### c) Module Renaming


@@ -11,7 +11,6 @@ are available on the [extractors](../modules/extractor.md) page. Some sites supp
* Twitter
* Instagram
* Telegram
* VKontact
* Tiktok
* Bluesky

poetry.lock: generated file, 1266 changed lines (diff too large to display)


@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "1.0.0"
version = "1.0.1"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"
@@ -43,9 +43,7 @@ dependencies = [
"jinja2 (>=0.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (>=2025.3.21,<2026.0.0)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
"warcio (>=0.0.0)",
"jsonlines (>=0.0.0)",
@@ -56,7 +54,9 @@ dependencies = [
"rfc3161-client (>=1.0.1,<2.0.0)",
"cryptography (>44.0.1,<45.0.0)",
"opentimestamps (>=0.4.5,<0.5.0)",
"bgutil-ytdlp-pot-provider (>=0.7.3,<0.8.0)",
"bgutil-ytdlp-pot-provider (>=1.0.0)",
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
"secretstorage (>=3.3.3,<4.0.0)",
]
[tool.poetry.group.dev.dependencies]
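For anyone upgrading an existing environment, a small sanity-check sketch (not part of this commit; the lower bounds are copied from the dependency list above):

from importlib.metadata import version

# Bumped lower bounds from pyproject.toml above; prints installed versions for comparison.
for pkg, minimum in [("yt-dlp", "2025.5.22"), ("bgutil-ytdlp-pot-provider", "1.0.0")]:
    print(f"{pkg}: installed {version(pkg)}, required >= {minimum}")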

Diff too large to display.


@@ -1,3 +1,4 @@
import mimetypes
import shutil
import sys
import datetime
@@ -11,6 +12,7 @@ from urllib.request import urlretrieve
import yt_dlp
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import MaxDownloadsReached
import pysubs2
from loguru import logger
@@ -156,7 +158,7 @@ class GenericExtractor(Extractor):
logger.error("generate_once.js not found after transpilation.")
return
self.extractor_args.setdefault("youtube", {})["getpot_bgutil_script"] = script_path
self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
logger.info(f"PO Token script configured at: {script_path}")
except Exception as e:
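For context, a minimal standalone sketch (assumed, not from this repo) of passing the same bgutil PO Token option through yt-dlp's Python API. The provider namespace and option name mirror the diff above; the script path is a placeholder, and the exact value shape (string vs. list) may vary between yt-dlp versions.

import yt_dlp

# Assumed CLI equivalent: yt-dlp --extractor-args "youtubepot-bgutilscript:script_path=/path/to/generate_once.js" <URL>
ydl_opts = {
    "extractor_args": {"youtubepot-bgutilscript": {"script_path": ["/path/to/generate_once.js"]}},
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    pass  # extract_info()/download() calls on this instance would use the script-based provider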
@@ -301,7 +303,7 @@ class GenericExtractor(Extractor):
result.set_url(url)
if "description" in video_data and not result.get("content"):
result.set_content(video_data["description"])
result.set_content(video_data.pop("description"))
# extract comments if enabled
if self.comments and video_data.get("comments", []) is not None:
result.set(
@@ -362,7 +364,12 @@ class GenericExtractor(Extractor):
# this time download
ydl.params["getcomments"] = self.comments
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
try:
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
except MaxDownloadsReached: # proceed as normal once MaxDownloadsReached is raised
pass
logger.success(data)
if "entries" in data:
entries = data.get("entries", [])
if not len(entries):
@@ -370,14 +377,33 @@ class GenericExtractor(Extractor):
return False
else:
entries = [data]
result = Metadata()
def _helper_get_filename(entry: dict) -> str:
entry_url = entry.get("url")
filename = ydl.prepare_filename(entry)
base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext'
directory = os.path.dirname(base_filename) # '/get/path/to'
basename = os.path.basename(base_filename) # 'file'
for f in os.listdir(directory):
if (
f.startswith(basename)
or (entry_url and os.path.splitext(f)[0] in entry_url)
and "video/" in (mimetypes.guess_type(f)[0] or "")
):
return os.path.join(directory, f)
return False
for entry in entries:
try:
filename = ydl.prepare_filename(entry)
if not os.path.exists(filename):
filename = filename.split(".")[0] + ".mkv"
filename = _helper_get_filename(entry)
if not filename or not os.path.exists(filename):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -396,6 +422,9 @@ class GenericExtractor(Extractor):
result.add_media(new_media)
except Exception as e:
logger.error(f"Error processing entry {entry}: {e}")
if not len(result.media):
logger.warning(f"No media found for entry {entry}, skipping.")
return False
return self.add_metadata(data, info_extractor, url, result)
@@ -454,6 +483,13 @@ class GenericExtractor(Extractor):
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
if data.get("is_live", False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video, that the youtubdedl can download out of the box
return self.get_metadata_for_video(data, info_extractor, url, ydl)
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
@@ -461,11 +497,12 @@ class GenericExtractor(Extractor):
# don't download since it can be a live stream
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
if data.get("is_live", False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video, that the youtubdedl can download out of the box
result = self.get_metadata_for_video(data, info_extractor, url, ydl)
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except MaxDownloadsReached:
# yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except Exception as e:
if info_extractor.IE_NAME == "generic":
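Outside the extractor class, the MaxDownloadsReached handling above looks roughly like this minimal sketch (assumed; the cap and URL are placeholders): yt-dlp signals the download limit with an exception, and the files fetched before the limit are still treated as a success.

import yt_dlp
from yt_dlp.utils import MaxDownloadsReached

ydl_opts = {"max_downloads": 5}  # placeholder cap
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    try:
        info = ydl.extract_info("https://example.com/some-playlist", download=True)
    except MaxDownloadsReached:
        info = None  # the cap was hit; whatever was already downloaded is kept and processed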
@@ -519,6 +556,8 @@ class GenericExtractor(Extractor):
"--write-subs" if self.subtitles else "--no-write-subs",
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
"--live-from-start" if self.live_from_start else "--no-live-from-start",
"--postprocessor-args",
"ffmpeg:-bitexact", # ensure bitexact output to avoid mismatching hashes for same video
]
# proxy handling
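For readers embedding yt-dlp via its Python API rather than CLI-style arguments, a minimal equivalent of the -bitexact change above (a sketch under assumptions, not from this repo; the URL is a placeholder):

import yt_dlp

ydl_opts = {
    # Ask ffmpeg postprocessing steps for bit-exact output, so re-downloading the
    # same video yields byte-identical files and therefore matching hashes.
    "postprocessor_args": {"ffmpeg": ["-bitexact"]},
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://example.com/some-video"])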


@@ -98,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
return missing
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
logger.info(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", "Archive in progress")


@@ -1 +0,0 @@
from .vk_extractor import VkExtractor


@@ -1,37 +0,0 @@
{
"name": "VKontakte Extractor",
"type": ["extractor"],
"requires_setup": True,
"depends": ["core", "utils"],
"dependencies": {
"python": ["loguru", "vk_url_scraper"],
},
"configs": {
"username": {"required": True, "help": "valid VKontakte username"},
"password": {"required": True, "help": "valid VKontakte password"},
"session_file": {
"default": "secrets/vk_config.v2.json",
"help": "valid VKontakte password",
},
},
"description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.
### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
""",
}


@@ -1,43 +0,0 @@
from loguru import logger
from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
""" "
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
def setup(self) -> None:
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if "vk.com" not in item.netloc:
return False
# some urls can contain multiple wall/photo/... parts and all will be fetched
vk_scrapes = self.vks.scrape(url)
if not len(vk_scrapes):
return False
logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
result = Metadata()
for scrape in vk_scrapes:
if not result.get_title():
result.set_title(scrape["text"])
if not result.get_timestamp():
result.set_timestamp(scrape["datetime"])
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))
return result.success("vk")


@@ -194,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
shutil.copyfileobj(infile, outfile)
# get media out of .warc
counter = 0
counter_warc_files = 0
counter_screenshots = 0
seen_urls = set()
with open(warc_filename, "rb") as warc_stream:
@@ -203,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
if (
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
): # screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
with open(fn, "wb") as outf:
outf.write(record.raw_stream.read())
m = Media(filename=fn)
to_enrich.add_media(m, "browsertrix-screenshot")
counter += 1
to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
counter_screenshots += 1
if not self.extract_media:
continue
@@ -231,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
# create local file and add media
ext = mimetypes.guess_extension(content_type)
warc_fn = f"warc-file-{counter}{ext}"
warc_fn = f"warc-file-{counter_screenshots}{ext}"
fn = os.path.join(tmp_dir, warc_fn)
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
@@ -256,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
continue
to_enrich.add_media(m, warc_fn)
counter += 1
counter_warc_files += 1
seen_urls.add(record_url)
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
logger.info(
f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
)
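For context, a minimal standalone sketch (assumed, not part of this commit) of the record-scanning pattern above: every image/png resource record in the WARC becomes its own screenshot file, which is why screenshot media keys are now suffixed with a per-screenshot counter.

import os
from warcio.archiveiterator import ArchiveIterator

def extract_screenshots(warc_filename: str, out_dir: str) -> list:
    # Collect each browsertrix screenshot (image/png 'resource' record) into its own file.
    paths = []
    with open(warc_filename, "rb") as warc_stream:
        for record in ArchiveIterator(warc_stream):
            if record.rec_type == "resource" and record.content_type == "image/png":
                fn = os.path.join(out_dir, f"warc-file-{len(paths)}.png")
                with open(fn, "wb") as outf:
                    outf.write(record.raw_stream.read())
                paths.append(fn)
    return paths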


@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
metadata.add_media(Media("something.wacz"), "browsertrix")
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
assert len(metadata.media) == 2
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"


@@ -1,77 +0,0 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.modules.vk_extractor import VkExtractor
@pytest.fixture
def mock_vk_scraper(mocker):
"""Fixture to mock VkScraper."""
return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
"""Fixture to initialize VkExtractor with mocked VkScraper."""
extractor_module = "vk_extractor"
configs = {
"username": "name",
"password": "password123",
"session_file": "secrets/vk_config.v2.json",
}
vk = setup_module(extractor_module, configs)
vk.vks = mock_vk_scraper.return_value
return vk
def test_netloc(vk_extractor, metadata):
# metadata url set as: "https://example.com/"
assert vk_extractor.download(metadata) is False
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
metadata.set_url("https://vk.com/valid-wall")
vk_extractor.vks.scrape.return_value = []
assert vk_extractor.download(metadata) is False
assert metadata.netloc == "vk.com"
vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
mock_scrapes = [
{"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
{"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
]
mock_filenames = ["image1.jpg", "image2.png"]
vk_extractor.vks.scrape.return_value = mock_scrapes
vk_extractor.vks.download_media.return_value = mock_filenames
metadata.set_url("https://vk.com/valid-wall")
result = vk_extractor.download(metadata)
# Test metadata
assert result.is_success()
assert result.status == "vk: success"
assert result.get_title() == "Post Title"
assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
assert "Another Post" in result.metadata["content"]
# Test Media objects
assert len(result.media) == 2
assert result.media[0].filename == "image1.jpg"
assert result.media[1].filename == "image2.png"
vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
def test_adds_first_title_and_timestamp(vk_extractor):
metadata = Metadata().set_url("https://vk.com/no-metadata")
metadata.set_url("https://vk.com/no-metadata")
mock_scrapes = [
{"text": "value", "datetime": "2023-01-01T00:00:00"},
{"text": "value2", "datetime": "2023-01-02T00:00:00"},
]
vk_extractor.vks.scrape.return_value = mock_scrapes
vk_extractor.vks.download_media.return_value = []
result = vk_extractor.download(metadata)
assert result.get_title() == "value"
# formatted timestamp
assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
assert result.is_success()