From d03ecdb037174823aef53a193be2c25ad90a2866 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 10:22:58 +0000 Subject: [PATCH 1/3] Standardise parse dates to get_datetime_from_str --- .../modules/generic_extractor/generic_extractor.py | 3 ++- src/auto_archiver/modules/generic_extractor/twitter.py | 5 ++--- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 80556bf..e7b75d9 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -13,6 +13,7 @@ from loguru import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media +from auto_archiver.utils import get_datetime_from_str from .dropin import GenericDropin @@ -223,7 +224,7 @@ class GenericExtractor(Extractor): timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat() result.set_timestamp(timestamp) if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"): - upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc) + upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc) result.set("upload_date", upload_date) # then clean away any keys we don't want diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index e4cbe74..e27a0c1 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,13 +1,12 @@ import re import mimetypes import json -from datetime import datetime from loguru import logger from slugify import slugify from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.utils import url as UrlUtil +from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor from .dropin import GenericDropin, InfoExtractor @@ -38,7 +37,7 @@ class Twitter(GenericDropin): try: if not tweet.get("user") or not tweet.get("created_at"): raise ValueError("Error retreiving post. Are you sure it exists?") - timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 1c08235..1b9eb75 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -2,7 +2,6 @@ import json import re import mimetypes import requests -from datetime import datetime from loguru import logger from pytwitter import Api @@ -10,6 +9,7 @@ from slugify import slugify from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media +from auto_archiver.utils import get_datetime_from_str class TwitterApiExtractor(Extractor): @@ -91,7 +91,7 @@ class TwitterApiExtractor(Extractor): result = Metadata() result.set_title(tweet.data.text) - result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")) + result.set_timestamp(get_datetime_from_str(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")) urls = [] if tweet.includes: From 23e74803eea3fe19f001cf4b7420727595d2e372 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 10:50:48 +0000 Subject: [PATCH 2/3] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6896e6d..89bd4eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "0.13.6" +version = "0.13.7" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" From a57722846527efe61c9e2ebd28bdff7f3d4bc25d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 18 Mar 2025 21:10:06 +0000 Subject: [PATCH 3/3] Update generic_extractor.py for general/ youtube extraction. --- .../modules/generic_extractor/generic_extractor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index e7b75d9..6a9e28f 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -203,7 +203,7 @@ class GenericExtractor(Extractor): if not result.get("url"): result.set_url(url) - if "description" in video_data and not result.get_content(): + if "description" in video_data and not result.get("content"): result.set_content(video_data["description"]) # extract comments if enabled if self.comments: @@ -220,10 +220,13 @@ class GenericExtractor(Extractor): ) # then add the common metadata - if timestamp := video_data.pop("timestamp", None) and not result.get("timestamp"): + timestamp = video_data.pop("timestamp", None) + if timestamp and not result.get("timestamp"): timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat() result.set_timestamp(timestamp) - if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"): + + upload_date = video_data.pop("upload_date", None) + if upload_date and not result.get("upload_date"): upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc) result.set("upload_date", upload_date)