REMOVES vk_extractor until further notice

2025-06-02 12:06:02 +01:00 · 2025-06-02 12:06:02 +01:00 · 2d7206f99d
commit 2d7206f99d
--- a/docs/source/how_to/new_config_format.md
+++ b/docs/source/how_to/new_config_format.md
@@ -71,7 +71,6 @@ The names of the actual modules have also changed, so for any extractor modules
 - `telethon_archiver` → `telethon_extractor`
 - `wacz_archiver_enricher` → `wacz_extractor_enricher`
 - `wayback_archiver_enricher` → `wayback_extractor_enricher`
-- `vk_archiver` → `vk_extractor`


 #### c) Module Renaming
--- a/docs/source/installation/faq.md
+++ b/docs/source/installation/faq.md
@@ -11,7 +11,6 @@ are available on the [extractors](../modules/extractor.md) page. Some sites supp
 * Twitter
 * Instagram
 * Telegram
-* VKontact
 * Tiktok
 * Bluesky

--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,9 +43,7 @@ dependencies = [
    "jinja2 (>=0.0.0)",
    "boto3 (>=1.28.0,<2.0.0)",
    "dataclasses-json (>=0.0.0)",
-    "yt-dlp (>=2025.3.21,<2026.0.0)",
    "numpy (==2.1.3)",
-    "vk-url-scraper (>=0.0.0)",
    "requests[socks] (>=0.0.0)",
    "warcio (>=0.0.0)",
    "jsonlines (>=0.0.0)",
@@ -56,7 +54,9 @@ dependencies = [
    "rfc3161-client (>=1.0.1,<2.0.0)",
    "cryptography (>44.0.1,<45.0.0)",
    "opentimestamps (>=0.4.5,<0.5.0)",
-    "bgutil-ytdlp-pot-provider (>=0.7.3,<0.8.0)",
+    "bgutil-ytdlp-pot-provider (>=1.0.0)",
+    "yt-dlp (>=2025.5.22,<2026.0.0)",
+    "secretstorage (>=3.3.3,<4.0.0)",
 ]

 [tool.poetry.group.dev.dependencies]
--- a/src/auto_archiver/modules/vk_extractor/__init__.py
+++ b/src/auto_archiver/modules/vk_extractor/__init__.py
@@ -1 +0,0 @@
-from .vk_extractor import VkExtractor
--- a/src/auto_archiver/modules/vk_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py
@@ -1,37 +0,0 @@
-{
-    "name": "VKontakte Extractor",
-    "type": ["extractor"],
-    "requires_setup": True,
-    "depends": ["core", "utils"],
-    "dependencies": {
-        "python": ["loguru", "vk_url_scraper"],
-    },
-    "configs": {
-        "username": {"required": True, "help": "valid VKontakte username"},
-        "password": {"required": True, "help": "valid VKontakte password"},
-        "session_file": {
-            "default": "secrets/vk_config.v2.json",
-            "help": "valid VKontakte password",
-        },
-    },
-    "description": """
-The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. 
-This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract 
-and download content. Note that VK videos are handled separately by the `YTDownloader`.
-
-### Features
-- Extracts text, timestamps, and metadata from VK `/wall` posts.
-- Downloads associated images and attaches them to the resulting `Metadata` object.
-- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
-
-### Setup
-To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
-- **Username**: A valid VKontakte account username.
-- **Password**: The corresponding password for the VKontakte account.
-- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
-
-Credentials can be set in the configuration file or directly via environment variables. Ensure you 
-have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
-""",
-}
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -1,43 +0,0 @@
-from loguru import logger
-from vk_url_scraper import VkScraper
-
-from auto_archiver.utils.misc import dump_payload
-from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media
-
-
-class VkExtractor(Extractor):
-    """ "
-    VK videos are handled by YTDownloader, this archiver gets posts text and images.
-    Currently only works for /wall posts
-    """
-
-    def setup(self) -> None:
-        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-
-        if "vk.com" not in item.netloc:
-            return False
-
-        # some urls can contain multiple wall/photo/... parts and all will be fetched
-        vk_scrapes = self.vks.scrape(url)
-        if not len(vk_scrapes):
-            return False
-        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
-
-        result = Metadata()
-        for scrape in vk_scrapes:
-            if not result.get_title():
-                result.set_title(scrape["text"])
-            if not result.get_timestamp():
-                result.set_timestamp(scrape["datetime"])
-
-        result.set_content(dump_payload(vk_scrapes))
-
-        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
-        for filename in filenames:
-            result.add_media(Media(filename))
-
-        return result.success("vk")
--- a/tests/extractors/test_vk_extractor.py
+++ b/tests/extractors/test_vk_extractor.py
@@ -1,77 +0,0 @@
-import pytest
-
-from auto_archiver.core import Metadata
-from auto_archiver.modules.vk_extractor import VkExtractor
-
-
-@pytest.fixture
-def mock_vk_scraper(mocker):
-    """Fixture to mock VkScraper."""
-    return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
-
-
-@pytest.fixture
-def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
-    """Fixture to initialize VkExtractor with mocked VkScraper."""
-    extractor_module = "vk_extractor"
-    configs = {
-        "username": "name",
-        "password": "password123",
-        "session_file": "secrets/vk_config.v2.json",
-    }
-    vk = setup_module(extractor_module, configs)
-    vk.vks = mock_vk_scraper.return_value
-    return vk
-
-
-def test_netloc(vk_extractor, metadata):
-    # metadata url set as: "https://example.com/"
-    assert vk_extractor.download(metadata) is False
-
-
-def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
-    metadata.set_url("https://vk.com/valid-wall")
-    vk_extractor.vks.scrape.return_value = []
-    assert vk_extractor.download(metadata) is False
-    assert metadata.netloc == "vk.com"
-    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
-
-
-def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
-    mock_scrapes = [
-        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
-        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
-    ]
-    mock_filenames = ["image1.jpg", "image2.png"]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = mock_filenames
-    metadata.set_url("https://vk.com/valid-wall")
-    result = vk_extractor.download(metadata)
-    # Test metadata
-    assert result.is_success()
-    assert result.status == "vk: success"
-    assert result.get_title() == "Post Title"
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert "Another Post" in result.metadata["content"]
-    # Test Media objects
-    assert len(result.media) == 2
-    assert result.media[0].filename == "image1.jpg"
-    assert result.media[1].filename == "image2.png"
-    vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
-
-
-def test_adds_first_title_and_timestamp(vk_extractor):
-    metadata = Metadata().set_url("https://vk.com/no-metadata")
-    metadata.set_url("https://vk.com/no-metadata")
-    mock_scrapes = [
-        {"text": "value", "datetime": "2023-01-01T00:00:00"},
-        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
-    ]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = []
-    result = vk_extractor.download(metadata)
-
-    assert result.get_title() == "value"
-    # formatted timestamp
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert result.is_success()