2025-01-17 10:56:08 +00:00
from pathlib import Path
2025-01-21 15:58:18 +00:00
import datetime
2025-01-17 16:29:13 +00:00
import os
2025-01-17 10:56:08 +00:00
2025-01-21 15:58:18 +00:00
from os . path import dirname
import pytest
2025-01-17 10:56:08 +00:00
2025-01-28 13:40:12 +00:00
from auto_archiver . modules . generic_extractor . generic_extractor import GenericExtractor
from . test_extractor_base import TestExtractorBase
2025-01-17 10:56:08 +00:00
2025-03-10 18:44:54 +00:00
# True when the suite runs on GitHub Actions (the GITHUB_ACTIONS variable is "true").
CI = os.getenv("GITHUB_ACTIONS", "") == "true"

# Opt-in switch for the Truth Social download tests: they run only when the
# TEST_TRUTH_SOCIAL environment variable is exactly "true".
TEST_TRUTH_SOCIAL = os.getenv("TEST_TRUTH_SOCIAL", "") == "true"
2025-03-10 18:44:54 +00:00
2025-01-28 13:40:12 +00:00
class TestGenericExtractor(TestExtractorBase):
    """Tests Generic Extractor.

    Tests marked ``download`` call ``self.extractor.download`` on real URLs;
    some are additionally skipped on CI or unless explicitly enabled through
    the environment (see the ``skipif`` markers on each test).
    """

    # NOTE(review): presumably read by TestExtractorBase to build
    # ``self.extractor`` - confirm against the base class.
    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    # Extractor settings used for every test in this class.
    config = {
        "subtitles": False,
        "comments": False,
        "livestreams": False,
        "live_from_start": False,
        "end_means_success": True,
        "allow_playlist": False,
        "max_downloads": "inf",
        "proxy": None,
        "cookies_from_browser": False,
        "cookie_file": None,
        "pot_provider": False,
    }

    def test_load_dropin(self):
        """Dropins can be loaded both by package name and from an extra path."""
        # test loading dropins that are in the generic_archiver package
        package = "auto_archiver.modules.generic_extractor"
        assert self.extractor.dropin_for_name("bluesky", package=package)

        # test loading dropins via filepath
        path = os.path.join(dirname(dirname(__file__)), "data/")
        assert self.extractor.dropin_for_name("dropin", additional_paths=[path])

    @pytest.mark.parametrize(
        "url, suitable_extractors",
        [
            ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
            ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
            ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
        ],
    )
    def test_suitable_extractors(self, url, suitable_extractors):
        """The extractor reports the site-specific extractor followed by the generic one."""
        expected = suitable_extractors + ["generic"]  # the generic is valid for all
        found = [e.ie_key().lower() for e in self.extractor.suitable_extractors(url)]
        # Comparing the lists also covers the length check and yields a useful diff.
        assert found == expected

    @pytest.mark.parametrize(
        "url, is_suitable",
        [
            ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
            ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
            ("https://www.twitch.tv/videos/1167226570", True),
            (
                "https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
                True,
            ),
            ("https://google.com", True),
        ],
    )
    def test_suitable_urls(self, url, is_suitable):
        """
        Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
        This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
        and then if and only if all archivers fail, does it fall back to the generic archiver)
        """
        assert self.extractor.suitable(url) == is_suitable

    @pytest.mark.download
    def test_download_tiktok(self, make_item):
        """Downloading a TikTok video keeps the item's URL unchanged."""
        url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
        result = self.extractor.download(make_item(url))
        assert result.get_url() == url

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url",
        [
            "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
            "twitter.com/bellingcat/status/123",
            "https://www.youtube.com/watch?v=1",
        ],
    )
    def test_download_nonexistent_media(self, make_item, url):
        """
        Test to make sure that the extractor doesn't break on non-existent posts/media
        It should return 'False'
        """
        item = make_item(url)
        result = self.extractor.download(item)
        assert not result

    @pytest.mark.skipif(
        CI,
        reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
    )
    @pytest.mark.download
    def test_youtube_download(self, make_item):
        """A YouTube download yields title, description, the video and its thumbnail."""
        url = "https://www.youtube.com/watch?v=J---aiyznGQ"
        result = self.extractor.download(make_item(url))

        assert result.get_url() == url
        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
        assert (
            result.get("description")
            == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
        )
        # Two media entries are expected: the video first, then the thumbnail.
        assert len(result.media) == 2
        assert "J---aiyznGQ" in Path(result.media[0].filename).name
        assert Path(result.media[1].filename).name == "hqdefault.jpg"

    @pytest.mark.download
    def test_bluesky_download_multiple_images(self, make_item):
        """A Bluesky post with several images downloads without failing."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lffjoxcu7k2w")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_single_image(self, make_item):
        """A Bluesky post with one image downloads without failing."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_no_media(self, make_item):
        """A Bluesky post without media downloads without failing."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_video(self, make_item):
        """A Bluesky post with a video downloads without failing."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_video(self, make_item):
        """A Truth Social video post yields exactly one media entry."""
        item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
        result = self.extractor.download(item)
        # Check for failure first so a failed download is reported as an
        # assertion failure rather than an AttributeError on ``False.media``.
        assert result is not False
        assert len(result.media) == 1

    @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_no_media(self, make_item):
        """A Truth Social post without media downloads without failing."""
        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_poll(self, make_item):
        """A Truth Social poll post downloads without failing."""
        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_single_image(self, make_item):
        """A Truth Social post with one image yields exactly one media entry."""
        item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
        result = self.extractor.download(item)
        # Failure check comes before touching ``result.media`` (see video test).
        assert result is not False
        assert len(result.media) == 1

    @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_multiple_images(self, make_item):
        """A Truth Social post with several images yields three media entries."""
        item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
        result = self.extractor.download(item)
        # Same guard as the sibling Truth Social tests.
        assert result is not False
        assert len(result.media) == 3

    @pytest.mark.download
    def test_twitter_download_nonexistend_tweet(self, make_item):
        """Downloading a tweet that does not exist returns a falsy result."""
        # this tweet does not exist
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.extractor.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_malformed_tweetid(self, make_item):
        """Downloading a URL with a malformed tweet id returns a falsy result."""
        # the status id contains a letter, so it is not a valid tweet id
        url = "https://x.com/Bellingcat/status/1719702a586071100058"
        response = self.extractor.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_tweet_no_media(self, make_item):
        """A text-only tweet yields the expected title, timestamp, status and content."""
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

        self.assertValidResponseMetadata(
            post,
            "Cookie Monster - Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
            "yt-dlp_Twitter: success",
        )
        assert post.get("content") == "Onion rings are just vegetable donuts."

    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
        """A tweet with a video yields the expected title and timestamp."""
        url = "https://x.com/bellingcat/status/1871552600346415571"
        post = self.extractor.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai",
            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

    @pytest.mark.xfail(
        reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
    )
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url, title, timestamp, image_hash",
        [
            (
                "https://x.com/SozinhoRamalho/status/1876710769913450647",
                "ignore tweet, testing sensitivity warning nudity",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876710875475681357",
                "ignore tweet, testing sensitivity warning violence",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876711053813227618",
                "ignore tweet, testing sensitivity warning sensitive",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876711141314801937",
                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
        ],
    )
    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
        """Download tweets with sensitive media"""
        post = self.extractor.download(make_item(url))
        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
        assert post.media[0].hash == image_hash

    @pytest.mark.download
    def test_download_facebook_video(self, make_item):
        """A Facebook video post yields the mp4 plus a jpeg thumbnail."""
        post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
        assert len(post.media) == 2
        assert post.media[0].filename.endswith("588371253839133.mp4")
        assert post.media[0].mimetype == "video/mp4"

        assert post.media[1].filename.endswith(".jpg")
        assert post.media[1].mimetype == "image/jpeg"

        assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()

    @pytest.mark.skip(reason="Newer yt-dlp versions don't support image download.")
    @pytest.mark.download
    def test_download_facebook_image(self, make_item):
        """A Facebook photo post yields a single png and the expected title."""
        post = self.extractor.download(
            make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
        )
        assert len(post.media) == 1
        assert post.media[0].filename.endswith(".png")
        assert post.get_title() == "Byline Festival - BylineFest Partner"

    @pytest.mark.download
    def test_download_facebook_text_only(self, make_item):
        """A text-only Facebook post yields its content and the page title."""
        url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
        post = self.extractor.download(make_item(url))
        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
        assert post.get_title() == "Bellingcat"
2025-03-28 10:37:22 +00:00
class TestGenericExtractorPoToken:
    """Tests for when ``setup_po_tokens`` invokes the token generation script.

    Each test sets ``bguils_po_token_method`` (and either ``in_docker`` or the
    ``RUNNING_IN_DOCKER`` environment variable), calls ``setup_po_tokens`` and
    checks whether the mocked ``setup_token_generation_script`` was called.
    """

    @pytest.fixture
    def extractor(self, mocker):
        """Return a GenericExtractor whose token-generation setup is replaced by a mock."""
        extractor = GenericExtractor()
        extractor.extractor_args = {}
        # Replaced with a mock so the tests can assert on whether it was called.
        extractor.setup_token_generation_script = mocker.Mock()
        return extractor

    def test_po_token_disabled_does_not_call_setup(self, extractor):
        """Method "disabled" never runs the setup script, even in Docker."""
        extractor.bguils_po_token_method = "disabled"
        extractor.in_docker = True
        extractor.setup_po_tokens()
        extractor.setup_token_generation_script.assert_not_called()

    def test_po_token_default_in_docker_calls_setup(self, extractor, mocker):
        """Method "auto" runs the setup script when RUNNING_IN_DOCKER is set."""
        extractor.bguils_po_token_method = "auto"
        mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"})
        extractor.setup_po_tokens()
        extractor.setup_token_generation_script.assert_called_once()

    def test_po_token_default_local_does_not_call_setup(self, extractor, caplog, mocker):
        """Method "auto" outside Docker skips the setup script and logs a notice."""
        extractor.bguils_po_token_method = "auto"
        # clears env vars for this test
        mocker.patch.dict(os.environ, {}, clear=True)
        extractor.setup_po_tokens()
        extractor.setup_token_generation_script.assert_not_called()
        assert "Proof of Origin Token method not explicitly set" in caplog.text

    def test_po_token_script_always_calls_setup(self, extractor):
        """Method "script" runs the setup script regardless of ``in_docker``."""
        extractor.bguils_po_token_method = "script"

        extractor.in_docker = False
        extractor.setup_po_tokens()
        extractor.setup_token_generation_script.assert_called_once()

        # Reset so the second call is counted on its own.
        extractor.setup_token_generation_script.reset_mock()

        extractor.in_docker = True
        extractor.setup_po_tokens()
        extractor.setup_token_generation_script.assert_called_once()