2025-01-17 10:56:08 +00:00
from pathlib import Path
2025-01-21 15:58:18 +00:00
import datetime
2025-01-17 16:29:13 +00:00
import os
2025-01-17 10:56:08 +00:00
2025-01-21 15:58:18 +00:00
from os . path import dirname
import pytest
2025-01-17 10:56:08 +00:00
2025-01-28 13:40:12 +00:00
from auto_archiver . modules . generic_extractor . generic_extractor import GenericExtractor
from . test_extractor_base import TestExtractorBase
2025-01-17 10:56:08 +00:00
2025-03-10 18:44:54 +00:00
# True when the suite runs on GitHub Actions, which sets GITHUB_ACTIONS to the
# literal string "true". Used by the skipif markers below for tests that
# cannot run on CI.
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
2025-01-28 13:40:12 +00:00
class TestGenericExtractor(TestExtractorBase):
    """Tests Generic Extractor.

    Tests marked ``download`` call ``self.extractor.download`` against live
    URLs. Tests guarded by ``skipif(CI, ...)`` are skipped on GitHub Actions.
    """

    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    # Configuration used for the extractor under test.
    config = {
        "subtitles": False,
        "comments": False,
        "livestreams": False,
        "live_from_start": False,
        "end_means_success": True,
        "allow_playlist": False,
        "max_downloads": "inf",
        "proxy": None,
        "cookies_from_browser": False,
        "cookie_file": None,
    }

    def test_load_dropin(self):
        """Dropins can be loaded both by package name and from an extra path."""
        # test loading dropins that are in the generic_extractor package
        package = "auto_archiver.modules.generic_extractor"
        assert self.extractor.dropin_for_name("bluesky", package=package)

        # test loading dropins via filepath
        path = os.path.join(dirname(dirname(__file__)), "data/")
        assert self.extractor.dropin_for_name("dropin", additional_paths=[path])

    @pytest.mark.parametrize(
        "url, suitable_extractors",
        [
            ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
            ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
            ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
        ],
    )
    def test_suitable_extractors(self, url, suitable_extractors):
        """Each URL yields its site-specific extractor(s) followed by "generic"."""
        # the generic extractor is valid for all URLs, so it is always expected last
        expected = suitable_extractors + ["generic"]
        extractors = list(self.extractor.suitable_extractors(url))
        # checked separately so a count mismatch is reported before a name mismatch
        assert len(extractors) == len(expected)
        assert [e.ie_key().lower() for e in extractors] == expected

    @pytest.mark.parametrize(
        "url, is_suitable",
        [
            ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
            ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
            ("https://www.twitch.tv/videos/1167226570", True),
            (
                "https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
                True,
            ),
            ("https://google.com", True),
        ],
    )
    def test_suitable_urls(self, url, is_suitable):
        """
        Note: expected behaviour is to return True for all URLs, as the extractor
        under test should be able to handle all URLs.
        This behaviour may be changed in the future (e.g. if we want it to just
        handle URLs it has dedicated extractors for, and then, if and only if all
        other extractors fail, fall back to the generic one).
        """
        assert self.extractor.suitable(url) == is_suitable

    @pytest.mark.download
    def test_download_tiktok(self, make_item):
        """Downloading a TikTok video keeps the item URL unchanged."""
        item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
        result = self.extractor.download(item)
        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"

    @pytest.mark.download
    @pytest.mark.parametrize(
        "url",
        [
            "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
            "twitter.com/bellingcat/status/123",
            "https://www.youtube.com/watch?v=1",
        ],
    )
    def test_download_nonexistent_media(self, make_item, url):
        """
        Test to make sure that the extractor doesn't break on non-existent posts/media.
        It should return a falsy result.
        """
        item = make_item(url)
        result = self.extractor.download(item)
        assert not result

    @pytest.mark.skipif(
        CI,
        reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
    )
    @pytest.mark.download
    def test_youtube_download(self, make_item):
        """Download a YouTube video and check its metadata and media files."""
        item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
        result = self.extractor.download(item)
        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
        assert (
            result.get("description")
            == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
        )
        # two media entries are expected: the video first, then its thumbnail
        assert len(result.media) == 2
        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
        assert Path(result.media[1].filename).name == "hqdefault.jpg"

    @pytest.mark.download
    def test_bluesky_download_multiple_images(self, make_item):
        """A Bluesky post with several images downloads successfully."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lffjoxcu7k2w")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_single_image(self, make_item):
        """A Bluesky post with a single image downloads successfully."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_no_media(self, make_item):
        """A Bluesky post without media still downloads successfully."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.download
    def test_bluesky_download_video(self, make_item):
        """A Bluesky post with a video downloads successfully."""
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_video(self, make_item):
        """A Truth Social post with a video yields exactly one media entry."""
        item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
        result = self.extractor.download(item)
        # check for failure first so a failed download is reported as such,
        # rather than as an AttributeError on `.media`
        assert result is not False
        assert len(result.media) == 1

    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_no_media(self, make_item):
        """A Truth Social post without media still downloads successfully."""
        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_poll(self, make_item):
        """A Truth Social post containing a poll downloads successfully."""
        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
        result = self.extractor.download(item)
        assert result is not False

    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_single_image(self, make_item):
        """A Truth Social post with one image yields exactly one media entry."""
        item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
        result = self.extractor.download(item)
        # check for failure first so a failed download is reported as such
        assert result is not False
        assert len(result.media) == 1

    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_multiple_images(self, make_item):
        """A Truth Social post with several images yields three media entries."""
        item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
        result = self.extractor.download(item)
        # same guard as the sibling Truth Social tests
        assert result is not False
        assert len(result.media) == 3

    @pytest.mark.download
    def test_twitter_download_nonexistend_tweet(self, make_item):
        """Downloading a tweet that does not exist returns a falsy result."""
        # this tweet does not exist
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.extractor.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_malformed_tweetid(self, make_item):
        """Downloading a tweet with a malformed id returns a falsy result."""
        # the status id contains a letter, so it is not a valid tweet id
        url = "https://x.com/Bellingcat/status/1719702a586071100058"
        response = self.extractor.download(make_item(url))
        assert not response

    @pytest.mark.download
    def test_twitter_download_tweet_no_media(self, make_item):
        """A text-only tweet is downloaded with the expected metadata and content."""
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

        self.assertValidResponseMetadata(
            post,
            "Cookie Monster - Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
            "yt-dlp_Twitter: success",
        )
        assert post.get("content") == "Onion rings are just vegetable donuts."

    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
        """A tweet with a video is downloaded with the expected title and timestamp."""
        url = "https://x.com/bellingcat/status/1871552600346415571"
        post = self.extractor.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

    @pytest.mark.xfail(
        reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
    )
    @pytest.mark.download
    @pytest.mark.parametrize(
        "url, title, timestamp, image_hash",
        [
            (
                "https://x.com/SozinhoRamalho/status/1876710769913450647",
                "ignore tweet, testing sensitivity warning nudity",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876710875475681357",
                "ignore tweet, testing sensitivity warning violence",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876711053813227618",
                "ignore tweet, testing sensitivity warning sensitive",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
            (
                "https://x.com/SozinhoRamalho/status/1876711141314801937",
                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
                "image_hash",
            ),
        ],
    )
    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
        """Download tweets with sensitive media"""
        # NOTE(review): "image_hash" and the identical timestamps look like
        # placeholders; the test is xfail so they are never compared today.
        post = self.extractor.download(make_item(url))
        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
        assert post.media[0].hash == image_hash

    @pytest.mark.download
    def test_download_facebook_video(self, make_item):
        """A Facebook video post yields the video plus a thumbnail image."""
        post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
        assert len(post.media) == 2
        assert post.media[0].filename.endswith("588371253839133.mp4")
        assert post.media[0].mimetype == "video/mp4"

        assert post.media[1].filename.endswith(".jpg")
        assert post.media[1].mimetype == "image/jpeg"

        assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()

    @pytest.mark.download
    def test_download_facebook_image(self, make_item):
        """A Facebook photo post yields a single PNG and the expected title."""
        post = self.extractor.download(
            make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
        )

        assert len(post.media) == 1
        assert post.media[0].filename.endswith(".png")
        assert post.get_title() == "Byline Festival - BylineFest Partner"

    @pytest.mark.download
    def test_download_facebook_text_only(self, make_item):
        """A text-only Facebook post exposes its text as content and the page name as title."""
        url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
        post = self.extractor.download(make_item(url))
        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
        assert post.get_title() == "Bellingcat"