2025-01-07 18:43:20 +00:00
import unittest
import datetime
2025-01-13 13:31:29 +00:00
import pytest
2025-01-07 18:43:20 +00:00
from auto_archiver . archivers . twitter_archiver import TwitterArchiver
from . test_archiver_base import TestArchiverBase
class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
    """Tests for ``TwitterArchiver``: URL helpers, variant selection and downloads.

    Tests decorated with ``pytest.mark.download`` call the archiver's download
    methods on x.com / twitter.com post URLs.

    NOTE(review): string literals are kept exactly as found, including the
    spaces just inside the quotes -- confirm whether that padding is intended.
    """

    # Archiver under test and its configuration; presumably consumed by
    # TestArchiverBase to build ``self.archiver`` (not visible here) -- confirm.
    archiver_class = TwitterArchiver
    config = {}

    def test_sanitize_url(self):
        """Check ``sanitize_url`` on short links, post URLs and unrelated URLs."""
        # Each row is (input URL, expected result), checked in this order.
        cases = [
            # should expand t.co URLs
            (" https://t.co/yl3oOJatFp ", " https://www.bellingcat.com/category/resources/ "),
            # shouldn't alter valid x URLs
            (
                " https://x.com/bellingcat/status/1874097816571961839 ",
                " https://x.com/bellingcat/status/1874097816571961839 ",
            ),
            # shouldn't alter valid twitter.com URLs
            (
                " https://twitter.com/bellingcat/status/1874097816571961839 ",
                " https://twitter.com/bellingcat/status/1874097816571961839 ",
            ),
            # should strip tracking params
            (
                " https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w ",
                " https://twitter.com/bellingcat/status/1874097816571961839 ",
            ),
            # shouldn't alter non-twitter/x URLs
            (
                " https://www.bellingcat.com/category/resources/ ",
                " https://www.bellingcat.com/category/resources/ ",
            ),
            # shouldn't strip params from non-twitter/x URLs
            (
                " https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w ",
                " https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w ",
            ),
        ]
        for raw_url, expected in cases:
            assert expected == self.archiver.sanitize_url(raw_url)

    def test_get_username_tweet_id_from_url(self):
        """Check ``get_username_tweet_id`` on twitter.com, x.com and other URLs."""
        # Valid twitter URL first, then valid x URL; both name the same post.
        post_urls = (
            " https://twitter.com/bellingcat/status/1874097816571961839 ",
            " https://x.com/bellingcat/status/1874097816571961839 ",
        )
        for post_url in post_urls:
            user, status_id = self.archiver.get_username_tweet_id(post_url)
            assert " bellingcat " == user
            assert " 1874097816571961839 " == status_id

        # Invalid URL: only falsiness of both parts is checked.
        # TODO: should this return None, False or raise an exception? Right now it returns False
        other_url = " https://www.bellingcat.com/category/resources/ "
        user, status_id = self.archiver.get_username_tweet_id(other_url)
        assert not user
        assert not status_id

    def test_choose_variants(self):
        """Check that ``choose_variant`` returns the last entry of the sample list."""
        # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
        playlist = {
            " content_type ": " application/x-mpegURL ",
            " url ": " https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b ",
        }
        mp4_270p = {
            " bitrate ": 256000,
            " content_type ": " video/mp4 ",
            " url ": " https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12 ",
        }
        mp4_360p = {
            " bitrate ": 832000,
            " content_type ": " video/mp4 ",
            " url ": " https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12 ",
        }
        mp4_720p = {
            " bitrate ": 2176000,
            " content_type ": " video/mp4 ",
            " url ": " https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12 ",
        }
        variants = [playlist, mp4_270p, mp4_360p, mp4_720p]

        picked = self.archiver.choose_variant(variants)

        # The expected pick is the entry with the largest listed bitrate (index 3).
        assert picked == variants[3]

    @pytest.mark.download
    def test_youtube_dlp_archiver(self):
        """Call ``download_yt_dlp`` for one post and validate the returned metadata."""
        post_url = " https://x.com/bellingcat/status/1874097816571961839 "
        item = self.create_item(post_url)
        result = self.archiver.download_yt_dlp(item, post_url, " 1874097816571961839 ")
        assert result

        expected_title = " As 2024 comes to a close, here’ s some examples of what Bellingcat investigated per month in our 10th year! 🧵 "
        expected_time = datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
        self.assertValidResponseMetadata(result, expected_title, expected_time, " twitter-ytdl ")

    @pytest.mark.download
    def test_download_tweet_no_media(self):
        """Call ``download`` for a text-only tweet URL and validate the metadata."""
        # url https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
        tweet_url = " https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w "
        result = self.archiver.download(self.create_item(tweet_url))

        expected_time = datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc)
        self.assertValidResponseMetadata(
            result,
            " Onion rings are just vegetable donuts. ",
            expected_time,
            " twitter-ytdl ",
        )

    @pytest.mark.download
    def test_download_video(self):
        """Call ``download`` for a post with video and validate title and timestamp."""
        post_url = " https://x.com/bellingcat/status/1871552600346415571 "
        item = self.create_item(post_url)
        result = self.archiver.download(item)

        expected_title = " This month ' s Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it ' s crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8 "
        expected_time = datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
        # No fourth argument is passed to the validator here.
        self.assertValidResponseMetadata(result, expected_title, expected_time)

    @pytest.mark.download
    def test_download_sensitive_media(self):
        """Download tweets that carry sensitive-media warnings.

        Note: currently failing, youtube-dlp requires logged in users.
        """
        # Every row uses the same expected timestamp value and the same
        # " image_hash " string as the expected hash.
        expected_time = datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
        expected_hash = " image_hash "
        rows = [
            (
                " https://x.com/SozinhoRamalho/status/1876710769913450647 ",
                " ignore tweet, testing sensitivity warning nudity ",
                expected_time,
                expected_hash,
            ),
            (
                " https://x.com/SozinhoRamalho/status/1876710875475681357 ",
                " ignore tweet, testing sensitivity warning violence ",
                expected_time,
                expected_hash,
            ),
            (
                " https://x.com/SozinhoRamalho/status/1876711053813227618 ",
                " ignore tweet, testing sensitivity warning sensitive ",
                expected_time,
                expected_hash,
            ),
            (
                " https://x.com/SozinhoRamalho/status/1876711141314801937 ",
                " ignore tweet, testing sensitivity warning nudity, violence, sensitivity ",
                expected_time,
                expected_hash,
            ),
        ]

        for post_url, title, timestamp, image_hash in rows:
            item = self.create_item(post_url)
            result = self.archiver.download(item)
            self.assertValidResponseMetadata(result, title, timestamp)
            # NOTE(review): the original nesting of these two checks could not
            # be recovered; they are run per row here -- confirm.
            assert len(result.media) == 1
            assert result.media[0].hash == image_hash