Mirror of https://github.com/bellingcat/auto-archiver
v1.0.1 dependency updates, generic extractor improvements (#307)
* wacz: allow exceptional cases where more than one resource image is available
* improves generic extractor edge cases and yt-dlp updates
* REMOVES vk_extractor until further notice
* bumps browsertrix in the Docker image
* npm version bump on scripts/settings
* poetry updates
* changes log level of gsheet_feeder_db `started` from warning to info (#301)
* closes #305 and further fixes finding local downloads from uncommon yt-dlp extractors
* uses ffmpeg -bitexact to reduce storing duplicate content
* formatting
* adds the yt-dlp curl-cffi extra
* version bump
* linting

---------

Co-authored-by: Dave Mateer <davemateer@gmail.com>

Refs: pull/308/head · v1.0.1
parent 48be13fb2a · commit 6735fa890b

@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.8 AS base
+FROM webrecorder/browsertrix-crawler:1.6.1 AS base
 
 ENV RUNNING_IN_DOCKER=1 \
     LANG=C.UTF-8 \

@@ -71,7 +71,6 @@ The names of the actual modules have also changed, so for any extractor modules
 - `telethon_archiver` → `telethon_extractor`
 - `wacz_archiver_enricher` → `wacz_extractor_enricher`
 - `wayback_archiver_enricher` → `wayback_extractor_enricher`
-- `vk_archiver` → `vk_extractor`
 
 
 #### c) Module Renaming

@@ -11,7 +11,6 @@ are available on the [extractors](../modules/extractor.md) page. Some sites supp
 * Twitter
 * Instagram
 * Telegram
-* VKontact
 * Tiktok
 * Bluesky
 

[Diff too large to display]

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "1.0.0"
+version = "1.0.1"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"

@@ -43,9 +43,7 @@ dependencies = [
     "jinja2 (>=0.0.0)",
     "boto3 (>=1.28.0,<2.0.0)",
     "dataclasses-json (>=0.0.0)",
-    "yt-dlp (>=2025.3.21,<2026.0.0)",
     "numpy (==2.1.3)",
-    "vk-url-scraper (>=0.0.0)",
     "requests[socks] (>=0.0.0)",
     "warcio (>=0.0.0)",
     "jsonlines (>=0.0.0)",

@@ -56,7 +54,9 @@ dependencies = [
     "rfc3161-client (>=1.0.1,<2.0.0)",
     "cryptography (>44.0.1,<45.0.0)",
     "opentimestamps (>=0.4.5,<0.5.0)",
-    "bgutil-ytdlp-pot-provider (>=0.7.3,<0.8.0)",
+    "bgutil-ytdlp-pot-provider (>=1.0.0)",
+    "yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
     "secretstorage (>=3.3.3,<4.0.0)",
 ]
 
 [tool.poetry.group.dev.dependencies]

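The dependency changes above replace the plain yt-dlp pin with the curl-cffi extra and raise the PO-token provider floor. A quick way to confirm the new packages landed in an environment — a hypothetical snippet, not part of the repo — is to query the installed distribution versions:

# Hypothetical check, not repo code: confirm the distributions declared in
# pyproject.toml (yt-dlp, its curl_cffi extra, the bumped PO-token provider)
# are importable/installed in the current environment.
from importlib.metadata import PackageNotFoundError, version

for dist in ("yt-dlp", "curl_cffi", "bgutil-ytdlp-pot-provider"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
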
[Diff too large to display]

@@ -1,3 +1,4 @@
+import mimetypes
 import shutil
 import sys
 import datetime

@@ -11,6 +12,7 @@ from urllib.request import urlretrieve
 
 import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+from yt_dlp.utils import MaxDownloadsReached
 import pysubs2
 
 from loguru import logger

@@ -156,7 +158,7 @@ class GenericExtractor(Extractor):
                 logger.error("generate_once.js not found after transpilation.")
                 return
 
-            self.extractor_args.setdefault("youtube", {})["getpot_bgutil_script"] = script_path
+            self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
             logger.info(f"PO Token script configured at: {script_path}")
 
         except Exception as e:

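The key rename above tracks bgutil-ytdlp-pot-provider 1.x, which reads the script path from the `youtubepot-bgutilscript` extractor-args namespace rather than a `youtube` sub-key. As a rough sketch only — the path is a placeholder and the exact dict shape is an assumption, not taken from this repo — the equivalent when driving yt-dlp directly looks like:

# Sketch: programmatic counterpart of
# `--extractor-args "youtubepot-bgutilscript:script_path=..."` (path is a placeholder).
import yt_dlp

ydl_opts = {
    "extractor_args": {
        "youtubepot-bgutilscript": {"script_path": ["/path/to/generate_once.js"]},
    },
}
ydl = yt_dlp.YoutubeDL(ydl_opts)  # the PO-token plugin reads the script path from here
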
@@ -301,7 +303,7 @@ class GenericExtractor(Extractor):
         result.set_url(url)
 
         if "description" in video_data and not result.get("content"):
-            result.set_content(video_data["description"])
+            result.set_content(video_data.pop("description"))
         # extract comments if enabled
         if self.comments and video_data.get("comments", []) is not None:
             result.set(

@@ -362,7 +364,12 @@ class GenericExtractor(Extractor):
         # this time download
         ydl.params["getcomments"] = self.comments
         # TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        try:
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        except MaxDownloadsReached:  # proceed as normal once MaxDownloadsReached is raised
+            pass
         logger.success(data)
 
         if "entries" in data:
             entries = data.get("entries", [])
             if not len(entries):

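The new try/except exists because yt-dlp signals a `max_downloads` cut-off by raising `MaxDownloadsReached`, even though everything fetched up to that point is valid. A minimal standalone illustration (the playlist URL is a placeholder, not from the repo):

# Minimal illustration, not repo code: with max_downloads set, extract_info raises
# MaxDownloadsReached once the limit is hit, and that should be treated as success.
import yt_dlp
from yt_dlp.utils import MaxDownloadsReached

opts = {"max_downloads": 1, "outtmpl": "%(id)s.%(ext)s"}
with yt_dlp.YoutubeDL(opts) as ydl:
    try:
        ydl.extract_info("https://www.youtube.com/playlist?list=PLACEHOLDER", download=True)
    except MaxDownloadsReached:
        pass  # the allowed number of downloads completed; carry on with what was saved
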
@@ -370,14 +377,33 @@ class GenericExtractor(Extractor):
                 return False
         else:
             entries = [data]
 
         result = Metadata()
 
+        def _helper_get_filename(entry: dict) -> str:
+            entry_url = entry.get("url")
+
+            filename = ydl.prepare_filename(entry)
+            base_filename, _ = os.path.splitext(filename)  # '/get/path/to/file' ignore '.ext'
+            directory = os.path.dirname(base_filename)  # '/get/path/to'
+            basename = os.path.basename(base_filename)  # 'file'
+            for f in os.listdir(directory):
+                if (
+                    f.startswith(basename)
+                    or (entry_url and os.path.splitext(f)[0] in entry_url)
+                    and "video/" in (mimetypes.guess_type(f)[0] or "")
+                ):
+                    return os.path.join(directory, f)
+            return False
+
         for entry in entries:
             try:
-                filename = ydl.prepare_filename(entry)
-                if not os.path.exists(filename):
-                    filename = filename.split(".")[0] + ".mkv"
+                filename = _helper_get_filename(entry)
+
+                if not filename or not os.path.exists(filename):
+                    # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
+                    continue
+
+                logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
+
                 new_media = Media(filename)
                 for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:

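The helper added above works around the fact that `ydl.prepare_filename()` only predicts an output name; merging or remuxing by uncommon extractors can change the extension, so the code scans the output directory for a video file that still starts with the predicted base name. A stripped-down version of the same idea (standalone sketch, not the repo's implementation):

# Standalone sketch of the filename-reconciliation idea used above.
import mimetypes
import os

def find_actual_download(predicted_path: str) -> str | None:
    directory = os.path.dirname(predicted_path) or "."
    base = os.path.splitext(os.path.basename(predicted_path))[0]
    for candidate in os.listdir(directory):
        mime = mimetypes.guess_type(candidate)[0] or ""
        if candidate.startswith(base) and mime.startswith("video/"):
            return os.path.join(directory, candidate)
    return None  # nothing on disk matches; the entry was probably not downloaded
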
@@ -396,6 +422,9 @@ class GenericExtractor(Extractor):
                 result.add_media(new_media)
             except Exception as e:
                 logger.error(f"Error processing entry {entry}: {e}")
+        if not len(result.media):
+            logger.warning(f"No media found for entry {entry}, skipping.")
+            return False
 
         return self.add_metadata(data, info_extractor, url, result)
 

@@ -454,6 +483,13 @@ class GenericExtractor(Extractor):
 
         dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
 
+        def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
+            if data.get("is_live", False) and not self.livestreams:
+                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+                return False
+            # it's a valid video, that the youtubdedl can download out of the box
+            return self.get_metadata_for_video(data, info_extractor, url, ydl)
+
         try:
             if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
                 logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")

@@ -461,11 +497,12 @@ class GenericExtractor(Extractor):
 
                 # don't download since it can be a live stream
                 data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-                if data.get("is_live", False) and not self.livestreams:
-                    logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
-                    return False
-                # it's a valid video, that the youtubdedl can download out of the box
-                result = self.get_metadata_for_video(data, info_extractor, url, ydl)
+
+                result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
+
+        except MaxDownloadsReached:
+            # yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
+            result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
 
         except Exception as e:
             if info_extractor.IE_NAME == "generic":

@@ -519,6 +556,8 @@ class GenericExtractor(Extractor):
             "--write-subs" if self.subtitles else "--no-write-subs",
             "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
             "--live-from-start" if self.live_from_start else "--no-live-from-start",
+            "--postprocessor-args",
+            "ffmpeg:-bitexact",  # ensure bitexact output to avoid mismatching hashes for same video
         ]
 
         # proxy handling

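The `-bitexact` flag stops ffmpeg from writing run-specific metadata (encoder version tag, timestamps) into the remuxed container, so the same source video produces byte-identical files — and therefore identical hashes — across archive runs. When embedding yt-dlp rather than passing CLI arguments, the rough equivalent would be the following sketch (an assumption about the params shape, not repo code):

# Sketch: programmatic counterpart of `--postprocessor-args ffmpeg:-bitexact`.
import yt_dlp

ydl_opts = {
    "postprocessor_args": {"ffmpeg": ["-bitexact"]},  # keep remuxed output deterministic
}
ydl = yt_dlp.YoutubeDL(ydl_opts)
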
@@ -98,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
         return missing
 
     def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")
         gw, row = self._retrieve_gsheet(item)
         gw.set_cell(row, "status", "Archive in progress")
 

@@ -1 +0,0 @@
-from .vk_extractor import VkExtractor

@@ -1,37 +0,0 @@
-{
-    "name": "VKontakte Extractor",
-    "type": ["extractor"],
-    "requires_setup": True,
-    "depends": ["core", "utils"],
-    "dependencies": {
-        "python": ["loguru", "vk_url_scraper"],
-    },
-    "configs": {
-        "username": {"required": True, "help": "valid VKontakte username"},
-        "password": {"required": True, "help": "valid VKontakte password"},
-        "session_file": {
-            "default": "secrets/vk_config.v2.json",
-            "help": "valid VKontakte password",
-        },
-    },
-    "description": """
-The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
-This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
-and download content. Note that VK videos are handled separately by the `YTDownloader`.
-
-### Features
-- Extracts text, timestamps, and metadata from VK `/wall` posts.
-- Downloads associated images and attaches them to the resulting `Metadata` object.
-- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
-- Outputs structured metadata and media using `Metadata` and `Media` objects.
-
-### Setup
-To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
-- **Username**: A valid VKontakte account username.
-- **Password**: The corresponding password for the VKontakte account.
-- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
-
-Credentials can be set in the configuration file or directly via environment variables. Ensure you
-have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
-""",
-}

@@ -1,43 +0,0 @@
-from loguru import logger
-from vk_url_scraper import VkScraper
-
-from auto_archiver.utils.misc import dump_payload
-from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media
-
-
-class VkExtractor(Extractor):
-    """ "
-    VK videos are handled by YTDownloader, this archiver gets posts text and images.
-    Currently only works for /wall posts
-    """
-
-    def setup(self) -> None:
-        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-
-        if "vk.com" not in item.netloc:
-            return False
-
-        # some urls can contain multiple wall/photo/... parts and all will be fetched
-        vk_scrapes = self.vks.scrape(url)
-        if not len(vk_scrapes):
-            return False
-        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
-
-        result = Metadata()
-        for scrape in vk_scrapes:
-            if not result.get_title():
-                result.set_title(scrape["text"])
-            if not result.get_timestamp():
-                result.set_timestamp(scrape["datetime"])
-
-        result.set_content(dump_payload(vk_scrapes))
-
-        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
-        for filename in filenames:
-            result.add_media(Media(filename))
-
-        return result.success("vk")

@@ -194,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                     shutil.copyfileobj(infile, outfile)
 
         # get media out of .warc
-        counter = 0
+        counter_warc_files = 0
+        counter_screenshots = 0
         seen_urls = set()
 
         with open(warc_filename, "rb") as warc_stream:

@@ -203,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
             if (
                 record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
             ):  # screenshots
-                fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
+                fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
                 with open(fn, "wb") as outf:
                     outf.write(record.raw_stream.read())
                 m = Media(filename=fn)
-                to_enrich.add_media(m, "browsertrix-screenshot")
-                counter += 1
+                to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
+                counter_screenshots += 1
             if not self.extract_media:
                 continue

@@ -231,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
 
             # create local file and add media
             ext = mimetypes.guess_extension(content_type)
-            warc_fn = f"warc-file-{counter}{ext}"
+            warc_fn = f"warc-file-{counter_screenshots}{ext}"
             fn = os.path.join(tmp_dir, warc_fn)
 
             record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)

@@ -256,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                 continue
 
             to_enrich.add_media(m, warc_fn)
-            counter += 1
+            counter_warc_files += 1
             seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
+        logger.info(
+            f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
+        )

@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
     metadata.add_media(Media("something.wacz"), "browsertrix")
     wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
     assert len(metadata.media) == 2
-    assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
+    assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"

@@ -1,77 +0,0 @@
-import pytest
-
-from auto_archiver.core import Metadata
-from auto_archiver.modules.vk_extractor import VkExtractor
-
-
-@pytest.fixture
-def mock_vk_scraper(mocker):
-    """Fixture to mock VkScraper."""
-    return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
-
-
-@pytest.fixture
-def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
-    """Fixture to initialize VkExtractor with mocked VkScraper."""
-    extractor_module = "vk_extractor"
-    configs = {
-        "username": "name",
-        "password": "password123",
-        "session_file": "secrets/vk_config.v2.json",
-    }
-    vk = setup_module(extractor_module, configs)
-    vk.vks = mock_vk_scraper.return_value
-    return vk
-
-
-def test_netloc(vk_extractor, metadata):
-    # metadata url set as: "https://example.com/"
-    assert vk_extractor.download(metadata) is False
-
-
-def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
-    metadata.set_url("https://vk.com/valid-wall")
-    vk_extractor.vks.scrape.return_value = []
-    assert vk_extractor.download(metadata) is False
-    assert metadata.netloc == "vk.com"
-    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
-
-
-def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
-    mock_scrapes = [
-        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
-        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
-    ]
-    mock_filenames = ["image1.jpg", "image2.png"]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = mock_filenames
-    metadata.set_url("https://vk.com/valid-wall")
-    result = vk_extractor.download(metadata)
-    # Test metadata
-    assert result.is_success()
-    assert result.status == "vk: success"
-    assert result.get_title() == "Post Title"
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert "Another Post" in result.metadata["content"]
-    # Test Media objects
-    assert len(result.media) == 2
-    assert result.media[0].filename == "image1.jpg"
-    assert result.media[1].filename == "image2.png"
-    vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
-
-
-def test_adds_first_title_and_timestamp(vk_extractor):
-    metadata = Metadata().set_url("https://vk.com/no-metadata")
-    metadata.set_url("https://vk.com/no-metadata")
-    mock_scrapes = [
-        {"text": "value", "datetime": "2023-01-01T00:00:00"},
-        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
-    ]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = []
-    result = vk_extractor.download(metadata)
-
-    assert result.get_title() == "value"
-    # formatted timestamp
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert result.is_success()