kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge branch 'main' into feat/yt-dlp-pots
commit
ba9d67e4bb
|
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "0.13.6"
|
version = "0.13.7"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
|
|
|
@ -59,9 +59,18 @@ class GenericDropin:
|
||||||
"""
|
"""
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
def is_suitable(self, url, info_extractor: InfoExtractor):
|
def suitable(self, url, info_extractor: InfoExtractor):
|
||||||
"""
|
"""
|
||||||
Used to override the InfoExtractor's 'is_suitable' method. Dropins should override this method to return True if the url is suitable for the extractor
|
A method to allow dropins to override their InfoExtractor's 'suitable' method.
|
||||||
(based on being able to parse other URLs)
|
Dropins should override this method and return True if the url is suitable for the extractor
|
||||||
|
(based on being able to parse other URLs). See the `suitable_extractors` method in the
|
||||||
|
`GenericExtractor` class for how this is implemented.
|
||||||
|
|
||||||
|
The default behaviour of this method is to return the result of the InfoExtractor's 'suitable' method.
|
||||||
|
|
||||||
|
### Example: An example of where this is useful is for the FacebookIE extractor in yt-dlp. By default,
|
||||||
|
its 'suitable' method only returns True for video URLs. However, we can override this method in the
|
||||||
|
Facebook dropin to return True for all Facebook URLs (photo/post types). This way, the Facebook dropin
|
||||||
|
can be used for all Facebook URLs.
|
||||||
"""
|
"""
|
||||||
return False
|
return info_extractor.suitable(url)
|
||||||
|
|
|
@ -142,7 +142,7 @@ class Facebook(GenericDropin):
|
||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def is_suitable(self, url, info_extractor: FacebookIE):
|
def suitable(self, url, info_extractor: FacebookIE):
|
||||||
regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
|
regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
|
||||||
return re.match(regex, url)
|
return re.match(regex, url)
|
||||||
|
|
||||||
|
|
|
@ -13,6 +13,8 @@ from loguru import logger
|
||||||
|
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
from auto_archiver.utils import get_datetime_from_str
|
||||||
|
from .dropin import GenericDropin
|
||||||
|
|
||||||
|
|
||||||
class SkipYtdlp(Exception):
|
class SkipYtdlp(Exception):
|
||||||
|
@ -95,14 +97,11 @@ class GenericExtractor(Extractor):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# check if there's a dropin and see if that declares whether it's suitable
|
# check if there's a dropin and see if that declares whether it's suitable
|
||||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
dropin: GenericDropin = self.dropin_for_name(info_extractor.ie_key())
|
||||||
if dropin and dropin.is_suitable(url, info_extractor):
|
if dropin and dropin.suitable(url, info_extractor):
|
||||||
yield info_extractor
|
yield info_extractor
|
||||||
continue
|
elif info_extractor.suitable(url):
|
||||||
|
|
||||||
if info_extractor.suitable(url):
|
|
||||||
yield info_extractor
|
yield info_extractor
|
||||||
continue
|
|
||||||
|
|
||||||
def suitable(self, url: str) -> bool:
|
def suitable(self, url: str) -> bool:
|
||||||
"""
|
"""
|
||||||
|
@ -249,7 +248,7 @@ class GenericExtractor(Extractor):
|
||||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
|
timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
|
||||||
result.set_timestamp(timestamp)
|
result.set_timestamp(timestamp)
|
||||||
if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"):
|
if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"):
|
||||||
upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
|
upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
|
||||||
result.set("upload_date", upload_date)
|
result.set("upload_date", upload_date)
|
||||||
|
|
||||||
# then clean away any keys we don't want
|
# then clean away any keys we don't want
|
||||||
|
@ -324,7 +323,7 @@ class GenericExtractor(Extractor):
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
|
|
||||||
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
|
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> GenericDropin:
|
||||||
dropin_name = dropin_name.lower()
|
dropin_name = dropin_name.lower()
|
||||||
|
|
||||||
if dropin_name == "generic":
|
if dropin_name == "generic":
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
import requests
|
import requests
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
|
||||||
|
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from .dropin import GenericDropin
|
from .dropin import GenericDropin
|
||||||
|
@ -13,6 +16,11 @@ class Tiktok(GenericDropin):
|
||||||
|
|
||||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||||
|
|
||||||
|
def suitable(self, url, info_extractor) -> bool:
|
||||||
|
"""This dropin (which uses Tikwm) is suitable for *all* TikTok URL types - videos, lives, VMs, and users.
|
||||||
|
Returns True if any of the TikTokIE, TikTokLiveIE, TikTokVMIE or TikTokUserIE extractors reports the URL as suitable."""
|
||||||
|
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,12 @@
|
||||||
import re
|
import re
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from auto_archiver.utils import url as UrlUtil
|
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
|
|
||||||
from .dropin import GenericDropin, InfoExtractor
|
from .dropin import GenericDropin, InfoExtractor
|
||||||
|
@ -38,7 +37,7 @@ class Twitter(GenericDropin):
|
||||||
try:
|
try:
|
||||||
if not tweet.get("user") or not tweet.get("created_at"):
|
if not tweet.get("user") or not tweet.get("created_at"):
|
||||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||||
timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -2,7 +2,6 @@ import json
|
||||||
import re
|
import re
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import requests
|
import requests
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pytwitter import Api
|
from pytwitter import Api
|
||||||
|
@ -10,6 +9,7 @@ from slugify import slugify
|
||||||
|
|
||||||
from auto_archiver.core import Extractor
|
from auto_archiver.core import Extractor
|
||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
|
from auto_archiver.utils import get_datetime_from_str
|
||||||
|
|
||||||
|
|
||||||
class TwitterApiExtractor(Extractor):
|
class TwitterApiExtractor(Extractor):
|
||||||
|
@ -91,7 +91,7 @@ class TwitterApiExtractor(Extractor):
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
result.set_title(tweet.data.text)
|
result.set_title(tweet.data.text)
|
||||||
result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
|
result.set_timestamp(get_datetime_from_str(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
|
||||||
|
|
||||||
urls = []
|
urls = []
|
||||||
if tweet.includes:
|
if tweet.includes:
|
||||||
|
|
|
@ -118,7 +118,7 @@ def pytest_runtest_setup(item):
|
||||||
pytest.xfail(f"previous test failed ({test_name})")
|
pytest.xfail(f"previous test failed ({test_name})")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture
|
||||||
def unpickle():
|
def unpickle():
|
||||||
"""
|
"""
|
||||||
Returns a helper function that unpickles a file
|
Returns a helper function that unpickles a file
|
||||||
|
|
|
@ -4,6 +4,8 @@ import pytest
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
|
|
||||||
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
||||||
|
from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE
|
||||||
|
|
||||||
from .test_extractor_base import TestExtractorBase
|
from .test_extractor_base import TestExtractorBase
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,11 +19,16 @@ def skip_ytdlp_own_methods(mocker):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture
|
||||||
def mock_get(mocker):
|
def mock_get(mocker):
|
||||||
return mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
|
return mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tiktok_dropin() -> Tiktok:
|
||||||
|
return Tiktok()
|
||||||
|
|
||||||
|
|
||||||
class TestTiktokTikwmExtractor(TestExtractorBase):
|
class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||||
"""
|
"""
|
||||||
Test suite for TestTiktokTikwmExtractor.
|
Test suite for TestTiktokTikwmExtractor.
|
||||||
|
@ -34,6 +41,25 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||||
|
|
||||||
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
|
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, is_suitable",
|
||||||
|
[
|
||||||
|
("https://bellingcat.com", False),
|
||||||
|
("https://youtube.com", False),
|
||||||
|
("https://tiktok.co/", False),
|
||||||
|
("https://tiktok.com/", False),
|
||||||
|
("https://www.tiktok.com/", False),
|
||||||
|
("https://api.cool.tiktok.com/", False),
|
||||||
|
(VALID_EXAMPLE_URL, True),
|
||||||
|
("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
|
||||||
|
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||||
|
("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
|
||||||
|
("https://vt.tiktok.com/ZSMTJeqRP/", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
|
||||||
|
assert tiktok_dropin.suitable(url, TikTokIE()) == is_suitable
|
||||||
|
|
||||||
def test_invalid_json_responses(self, mock_get, make_item, caplog):
|
def test_invalid_json_responses(self, mock_get, make_item, caplog):
|
||||||
mock_get.return_value.status_code = 200
|
mock_get.return_value.status_code = 200
|
||||||
mock_get.return_value.json.side_effect = ValueError
|
mock_get.return_value.json.side_effect = ValueError
|
||||||
|
|
Ładowanie…
Reference in New Issue