diff --git a/poetry.lock b/poetry.lock index 83b2860..2855bb5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "authlib" -version = "1.4.1" +version = "1.5.0" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "Authlib-1.4.1-py2.py3-none-any.whl", hash = "sha256:edc29c3f6a3e72cd9e9f45fff67fc663a2c364022eb0371c003f22d5405915c1"}, - {file = "authlib-1.4.1.tar.gz", hash = "sha256:30ead9ea4993cdbab821dc6e01e818362f92da290c04c7f6a1940f86507a790d"}, + {file = "Authlib-1.5.0-py2.py3-none-any.whl", hash = "sha256:b3cc5ccfc19cf87678046b6e7cb19d402d8a631a33c40e36385232203227953a"}, + {file = "authlib-1.5.0.tar.gz", hash = "sha256:8fd8bd8f806485a532ac39a17b579982cf54688f956174f995cc938a91725423"}, ] [package.dependencies] @@ -172,18 +172,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.22" +version = "1.37.0" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.22-py3-none-any.whl", hash = "sha256:39957eabdce009353d72d131046489fbbfa15891865d5f069f1e8bfa414e6b81"}, - {file = "boto3-1.36.22.tar.gz", hash = "sha256:768c8a4d4a6227fe2258105efa086f1424cba5ca915a5eb2305b2cd979306ad1"}, + {file = "boto3-1.37.0-py3-none-any.whl", hash = "sha256:03bd8c93b226f07d944fd6b022e11a307bff94ab6a21d51675d7e3ea81ee8424"}, + {file = "boto3-1.37.0.tar.gz", hash = "sha256:01015b38017876d79efd7273f35d9a4adfba505237159621365bed21b9b65eca"}, ] [package.dependencies] -botocore = ">=1.36.22,<1.37.0" +botocore = ">=1.37.0,<1.38.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.22" +version = "1.37.0" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.22-py3-none-any.whl", hash = "sha256:75d6b34acb0686ee4d54ff6eb285e78ccfe318407428769d1e3e13351714d890"}, - {file = "botocore-1.36.22.tar.gz", hash = "sha256:59520247d5a479731724f97c995d5a1c2aae3b303b324f39d99efcfad1d3019e"}, + {file = "botocore-1.37.0-py3-none-any.whl", hash = "sha256:d01661f38c0edac87424344cdf4169f3ab9bc1bf1b677c8b230d025eb66c54a3"}, + {file = "botocore-1.37.0.tar.gz", hash = "sha256:b129d091a8360b4152ab65327186bf4e250de827c4a9b7ddf40a72b1acf1f3c1"}, ] [package.dependencies] @@ -363,14 +363,14 @@ beautifulsoup4 = "*" [[package]] name = "cachetools" -version = "5.5.1" +version = "5.5.2" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"}, - {file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"}, + {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, + {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, ] [[package]] @@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"] [[package]] name = "googleapis-common-protos" -version = "1.67.0" +version = "1.68.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "googleapis_common_protos-1.67.0-py2.py3-none-any.whl", hash = "sha256:579de760800d13616f51cf8be00c876f00a9f146d3e6510e19d1f4111758b741"}, - {file = "googleapis_common_protos-1.67.0.tar.gz", hash = "sha256:21398025365f138be356d5923e9168737d94d46a72aefee4a6110a1f23463c86"}, + {file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"}, + {file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"}, ] [package.dependencies] @@ -1674,14 +1674,14 @@ files = [ [[package]] name = "pydata-sphinx-theme" -version = "0.16.1" +version = "0.15.4" description = "Bootstrap-based Sphinx theme from the PyData community" optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde"}, - {file = "pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7"}, + {file = "pydata_sphinx_theme-0.15.4-py3-none-any.whl", hash = "sha256:2136ad0e9500d0949f96167e63f3e298620040aea8f9c74621959eda5d4cf8e6"}, + {file = "pydata_sphinx_theme-0.15.4.tar.gz", hash = "sha256:7762ec0ac59df3acecf49fd2f889e1b4565dbce8b88b2e29ee06fdd90645a06d"}, ] [package.dependencies] @@ -1689,8 +1689,9 @@ accessible-pygments = "*" Babel = "*" beautifulsoup4 = "*" docutils = "!=0.17.0" +packaging = "*" pygments = ">=2.7" -sphinx = ">=6.1" +sphinx = ">=5" typing-extensions = "*" [package.extras] @@ -2265,14 +2266,14 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] [[package]] name = "selenium" -version = "4.28.1" +version = "4.29.0" description = "Official Python bindings for Selenium WebDriver" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "selenium-4.28.1-py3-none-any.whl", hash = "sha256:4238847e45e24e4472cfcf3554427512c7aab9443396435b1623ef406fff1cc1"}, - {file = "selenium-4.28.1.tar.gz", hash = "sha256:0072d08670d7ec32db901bd0107695a330cecac9f196e3afb3fa8163026e022a"}, + {file = "selenium-4.29.0-py3-none-any.whl", hash = "sha256:ce5d26f1ddc1111641113653af33694c13947dd36c2df09cdd33f554351d372e"}, + {file = "selenium-4.29.0.tar.gz", hash = "sha256:3a62f7ec33e669364a6c0562a701deb69745b569c50d55f1a912bf8eb33358ba"}, ] [package.dependencies] @@ -2425,19 +2426,19 @@ test = ["httpx", "pytest (>=6)"] [[package]] name = "sphinx-book-theme" -version = "1.1.3" +version = "1.1.4" description = "A clean book theme for scientific explanations and documentation with Sphinx" optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "sphinx_book_theme-1.1.3-py3-none-any.whl", hash = "sha256:a554a9a7ac3881979a87a2b10f633aa2a5706e72218a10f71be38b3c9e831ae9"}, - {file = "sphinx_book_theme-1.1.3.tar.gz", hash = "sha256:1f25483b1846cb3d353a6bc61b3b45b031f4acf845665d7da90e01ae0aef5b4d"}, + {file = "sphinx_book_theme-1.1.4-py3-none-any.whl", hash = "sha256:843b3f5c8684640f4a2d01abd298beb66452d1b2394cd9ef5be5ebd5640ea0e1"}, + {file = "sphinx_book_theme-1.1.4.tar.gz", hash = "sha256:73efe28af871d0a89bd05856d300e61edce0d5b2fbb7984e84454be0fedfe9ed"}, ] [package.dependencies] -pydata-sphinx-theme = ">=0.15.2" -sphinx = ">=5" +pydata-sphinx-theme = "0.15.4" +sphinx = ">=6.1" [package.extras] code-style = ["pre-commit"] @@ -2584,14 +2585,14 @@ test = ["pytest"] [[package]] name = "starlette" -version = "0.45.3" +version = "0.46.0" description = "The little ASGI library that shines." optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"}, - {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"}, + {file = "starlette-0.46.0-py3-none-any.whl", hash = "sha256:913f0798bd90ba90a9156383bcf1350a17d6259451d0d8ee27fc0cf2db609038"}, + {file = "starlette-0.46.0.tar.gz", hash = "sha256:b359e4567456b28d473d0193f34c0de0ed49710d75ef183a74a5ce0499324f50"}, ] [package.dependencies] @@ -2602,14 +2603,14 @@ full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart [[package]] name = "telethon" -version = "1.38.1" +version = "1.39.0" description = "Full-featured Telegram client library for Python 3" optional = false python-versions = ">=3.5" groups = ["main"] files = [ - {file = "Telethon-1.38.1-py3-none-any.whl", hash = "sha256:30c187017501bfb982b8af5659f864dda4108f77ea49cfce61e8f6fdb8a18d6e"}, - {file = "Telethon-1.38.1.tar.gz", hash = "sha256:f9866c1e37197a0894e0c02aa56a6359bffb14a585e88e18e3e819df4fda399a"}, + {file = "Telethon-1.39.0-py3-none-any.whl", hash = "sha256:aa9f394b94be144799a6f6a93ab463867bc7c63503ede9631751940a98f6c703"}, + {file = "telethon-1.39.0.tar.gz", hash = "sha256:35d4795d8c91deac515fb0bcb3723866b924de1c724e1d5c230460e96f284a63"}, ] [package.dependencies] @@ -2719,14 +2720,14 @@ sortedcontainers = "*" [[package]] name = "trio-websocket" -version = "0.12.1" +version = "0.12.2" description = "WebSocket library for Trio" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "trio_websocket-0.12.1-py3-none-any.whl", hash = "sha256:608ec746bb287e5d5a66baf483e41194193c5cf05ffaad6240e7d1fcd80d1e6f"}, - {file = "trio_websocket-0.12.1.tar.gz", hash = "sha256:d55ccd4d3eae27c494f3fdae14823317839bdcb8214d1173eacc4d42c69fc91b"}, + {file = "trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6"}, + {file = "trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae"}, ] [package.dependencies] @@ -3161,14 +3162,14 @@ h11 = ">=0.9.0,<1" [[package]] name = "yt-dlp" -version = "2025.1.26" +version = "2025.2.19" description = "A feature-rich command-line audio/video downloader" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"}, - {file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"}, + {file = "yt_dlp-2025.2.19-py3-none-any.whl", hash = "sha256:3ed218eaeece55e9d715afd41abc450dc406ee63bf79355169dfde312d38fdb8"}, + {file = "yt_dlp-2025.2.19.tar.gz", hash = "sha256:f33ca76df2e4db31880f2fe408d44f5058d9f135015b13e50610dfbe78245bea"}, ] [package.extras] diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index f18ad13..8d520d1 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -50,7 +50,6 @@ class BaseModule(ABC): def config_setup(self, config: dict): - authentication = config.get('authentication', {}) # this is important. Each instance is given its own deepcopied config, so modules cannot # change values to affect other modules config = deepcopy(config) @@ -117,7 +116,7 @@ class BaseModule(ABC): # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts')) ytdlp_opts = getattr(parse_options(args), 'ydl_opts') return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar - + get_cookiejar_options = None # order of priority: diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py index 0fb81fb..a49884f 100644 --- a/src/auto_archiver/core/consts.py +++ b/src/auto_archiver/core/consts.py @@ -14,7 +14,7 @@ DEFAULT_MANIFEST = { 'name': '', # the display name of the module 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! 'type': [], # the type of the module, can be one or more of MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare + 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software 'description': '', # a description of the module 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 08d2af7..47c03f6 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -86,7 +86,7 @@ class GenericExtractor(Extractor): # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist result.set_title(video_data.pop('title', video_data.pop('fulltitle', ""))) result.set_url(url) - + if "description" in video_data: result.set_content(video_data["description"]) # extract comments if enabled if self.comments: result.set("comments", [{ diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index ea724e7..2026804 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -64,7 +64,7 @@ class GsheetsFeeder(Feeder): yield m def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: - # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) if gw.get_cell_or_default(row, 'folder', "") is None: diff --git a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py index 3044780..ba2d691 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py +++ b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py @@ -17,6 +17,7 @@ class GWorksheet: 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', + 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index e46a93d..49ef0b5 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -1,9 +1,11 @@ -import os +import hashlib import json +import os import uuid from datetime import datetime, timezone +from dateutil.parser import parse as parse_dt + import requests -import hashlib from loguru import logger @@ -68,26 +70,34 @@ def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: in hash.update(buf) return hash.hexdigest() -def get_current_datetime_iso() -> str: - return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat() +def get_datetime_from_str(dt_str: str, fmt: str | None = None, dayfirst=True) -> datetime | None: + """ parse a datetime string with option of passing a specific format -def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None: - # parse a datetime string with option of passing a specific format + Args: + dt_str: the datetime string to parse + fmt: the python date format of the datetime string, if None, dateutil.parser.parse is used + dayfirst: Use this to signify between date formats which put the day first, vs the month first: + e.g. DD/MM/YYYY vs MM/DD/YYYY + """ try: - return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str) + return datetime.strptime(dt_str, fmt) if fmt else parse_dt(dt_str, dayfirst=dayfirst) except ValueError as e: logger.error(f"Unable to parse datestring {dt_str}: {e}") return None -def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None: - # Consistent parsing of timestamps - # If utc=True, the timezone is set to UTC, - # if iso=True, the output is an iso string +def get_timestamp(ts, utc=True, iso=True, dayfirst=True) -> str | datetime | None: + """ Consistent parsing of timestamps. + Args: + If utc=True, the timezone is set to UTC, + if iso=True, the output is an iso string + Use dayfirst to signify between date formats which put the date vs month first: + e.g. DD/MM/YYYY vs MM/DD/YYYY + """ if not ts: return try: - if isinstance(ts, str): ts = datetime.fromisoformat(ts) + if isinstance(ts, str): ts = parse_dt(ts, dayfirst=dayfirst) if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts) if utc: ts = ts.replace(tzinfo=timezone.utc) if iso: return ts.isoformat() @@ -96,5 +106,6 @@ def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None: logger.error(f"Unable to parse timestamp {ts}: {e}") return None + def get_current_timestamp() -> str: - return get_timestamp(datetime.now()) \ No newline at end of file + return get_timestamp(datetime.now()) diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 42a21b2..8b49e5a 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -32,9 +32,8 @@ def mock_metadata(mocker): @pytest.fixture def metadata(): metadata = Metadata() - metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) - metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) - metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) + metadata.add_media(Media(filename="screenshot.png", urls=["http://example.com/screenshot.png"]).set("id", "screenshot")) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]).set("id", "browsertrix")) metadata.set_url("http://example.com") metadata.set_title("Example Title") metadata.set_content("Example Content") @@ -53,7 +52,7 @@ def mock_media(mocker): return mock_media @pytest.fixture -def gsheets_db(mock_gworksheet, setup_module, mocker): +def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsDb: db = setup_module("gsheet_db", { "allow_worksheets": "set()", "block_worksheets": "set()", @@ -80,10 +79,10 @@ def expected_calls(mock_media, fixed_timestamp): (1, 'text', 'Example Content'), (1, 'timestamp', '2025-01-01T00:00:00+00:00'), (1, 'hash', 'not-calculated'), - # (1, 'screenshot', 'http://example.com/screenshot.png'), - # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), - # (1, 'wacz', 'http://example.com/browsertrix.wacz'), - # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') + (1, 'screenshot', 'http://example.com/screenshot.png'), + (1, 'thumbnail', '=IMAGE("http://example.com/screenshot.png")'), + (1, 'wacz', 'http://example.com/browsertrix.wacz'), + (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A//example.com/browsertrix.wacz#view=pages&url=http%3A//example.com') ] def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index 9653734..a8470fb 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -20,17 +20,15 @@ def metadata_with_images(): def test_successful_enrich(metadata_with_images, mocker): - with ( - mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), - mocker.patch("PIL.Image.open"), - mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image, - ): - enricher = PdqHashEnricher() - enricher.enrich(metadata_with_images) + mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)) + mocker.patch("PIL.Image.open") + mocker.patch.object(Media, "is_image", return_value=True) + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) - # Ensure the hash is set for image media - for media in metadata_with_images.media: - assert media.get("pdq_hash") is not None + # Ensure the hash is set for image media + for media in metadata_with_images.media: + assert media.get("pdq_hash") is not None def test_enrich_skip_non_image(metadata_with_images, mocker): diff --git a/tests/enrichers/test_wayback_enricher.py b/tests/enrichers/test_wayback_enricher.py index 88f4662..5406e39 100644 --- a/tests/enrichers/test_wayback_enricher.py +++ b/tests/enrichers/test_wayback_enricher.py @@ -16,7 +16,7 @@ def mock_is_auth_wall(mocker): def mock_post_success(mocker): """Fixture to mock POST requests with a successful response.""" def _mock_post(json_data: dict = None, status_code: int = 200): - json_data = json_data or {"job_id": "job123"} + json_data = {"job_id": "job123"} if json_data is None else json_data resp = mocker.Mock(status_code=status_code) resp.json.return_value = json_data return mocker.patch("requests.post", return_value=resp) diff --git a/tests/extractors/test_vk_extractor.py b/tests/extractors/test_vk_extractor.py new file mode 100644 index 0000000..80eb9dd --- /dev/null +++ b/tests/extractors/test_vk_extractor.py @@ -0,0 +1,76 @@ +import pytest + +from auto_archiver.core import Metadata +from auto_archiver.modules.vk_extractor import VkExtractor + + +@pytest.fixture +def mock_vk_scraper(mocker): + """Fixture to mock VkScraper.""" + return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper") + +@pytest.fixture +def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor: + """Fixture to initialize VkExtractor with mocked VkScraper.""" + extractor_module = "vk_extractor" + configs = { + "username": "name", + "password": "password123", + "session_file": "secrets/vk_config.v2.json", + } + vk = setup_module(extractor_module, configs) + vk.vks = mock_vk_scraper.return_value + return vk + + +def test_netloc(vk_extractor, metadata): + # metadata url set as: "https://example.com/" + assert vk_extractor.download(metadata) is False + + +def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata): + metadata.set_url("https://vk.com/valid-wall") + vk_extractor.vks.scrape.return_value = [] + assert vk_extractor.download(metadata) is False + assert metadata.netloc == "vk.com" + vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url()) + + +def test_successful_scrape_and_download(vk_extractor, metadata, mocker): + mock_scrapes = [ + {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1}, + {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2} + ] + mock_filenames = ["image1.jpg", "image2.png"] + vk_extractor.vks.scrape.return_value = mock_scrapes + vk_extractor.vks.download_media.return_value = mock_filenames + metadata.set_url("https://vk.com/valid-wall") + result = vk_extractor.download(metadata) + # Test metadata + assert result.is_success() + assert result.status == "vk: success" + assert result.get_title() == "Post Title" + assert result.get_timestamp() == "2023-01-01T00:00:00+00:00" + assert "Another Post" in result.metadata["content"] + # Test Media objects + assert len(result.media) == 2 + assert result.media[0].filename == "image1.jpg" + assert result.media[1].filename == "image2.png" + vk_extractor.vks.download_media.assert_called_once_with( + mock_scrapes, vk_extractor.tmp_dir + ) + + +def test_adds_first_title_and_timestamp(vk_extractor): + metadata = Metadata().set_url("https://vk.com/no-metadata") + metadata.set_url("https://vk.com/no-metadata") + mock_scrapes = [{"text": "value", "datetime": "2023-01-01T00:00:00"}, + {"text": "value2", "datetime": "2023-01-02T00:00:00"}] + vk_extractor.vks.scrape.return_value = mock_scrapes + vk_extractor.vks.download_media.return_value = [] + result = vk_extractor.download(metadata) + + assert result.get_title() == "value" + # formatted timestamp + assert result.get_timestamp() == "2023-01-01T00:00:00+00:00" + assert result.is_success() \ No newline at end of file diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 85f97c6..7617867 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -9,11 +9,12 @@ from auto_archiver.modules.local_storage import LocalStorage @pytest.fixture -def local_storage(setup_module) -> LocalStorage: +def local_storage(setup_module, tmp_path) -> LocalStorage: + save_to = tmp_path / "local_archive" configs: dict = { "path_generator": "flat", "filename_generator": "static", - "save_to": "./local_archive", + "save_to": str(save_to), "save_absolute": False, } return setup_module("local_storage", configs)