Create manifest files for archiver modules.

pull/183/head
erinhmclark 2025-01-21 22:29:50 +00:00 committed by Patrick Robertson
parent 4830f99300
commit 7b3a1468cd
23 changed files with 467 additions and 129 deletions

View file

@@ -6,10 +6,3 @@ collect and preserve a variety of content types, such as posts, images, videos a
"""
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver
from .instagram_api_archiver import InstagramAPIArchiver

View file

@@ -1,2 +0,0 @@
# temporary hack, as we implement module
from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver

View file

@@ -0,0 +1,30 @@
{
"name": "Instagram API Archiver",
"type": ["extractor"],
"entry_point": "instagram_api_archiver:InstagramApiArchiver",
"depends": ["core"],
"external_dependencies":
{"python": ["requests",
"loguru",
"retrying",
"tqdm",],
},
"no_setup_required": False,
"configs": {
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"full_profile": {
"default": False,
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
},
"full_profile_max_posts": {
"default": 0,
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
},
"minimize_json_output": {
"default": True,
"help": "if true, will remove empty values from the json output",
},
},
"description": "",
}
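
For orientation, here is a minimal sketch of how the URL pattern used by this module (copied verbatim from the archiver code further down in this diff) classifies the supported link types. The URLs, shortcode, and story id are placeholders, not real content:

import re

# global_pattern from InstagramAPIArchiver in this commit
pattern = re.compile(
    r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)

print(pattern.findall("https://www.instagram.com/p/SHORTCODE/"))
# [('p', 'SHORTCODE', '')] -> dispatched as a post
print(pattern.findall("https://www.instagram.com/some_user/"))
# [('', 'some_user', '')] -> dispatched as a profile
print(pattern.findall("https://www.instagram.com/stories/some_user/1234567890/"))
# [('stories', 'some_user', '1234567890')] -> dispatched as a single story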

View file

@@ -9,32 +9,38 @@ data, reducing JSON output size, and handling large profiles.
"""
import re
import requests
from datetime import datetime
import requests
from loguru import logger
from retrying import retry
from tqdm import tqdm
from . import Archiver
from ..core import Metadata
from ..core import Media
from auto_archiver.archivers import Archiver
from auto_archiver.core import Media
from auto_archiver.core import Metadata
class InstagramAPIArchiver(Archiver):
"""
Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
name = "instagram_api_archiver"
global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?")
global_pattern = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
if self.api_endpoint[-1] == "/":
self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
self.minimize_json_output = bool(self.minimize_json_output)
@@ -44,52 +50,74 @@ class InstagramAPIArchiver(Archiver):
return {
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
"full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
"minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
"full_profile": {
"default": False,
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
},
"full_profile_max_posts": {
"default": 0,
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
},
"minimize_json_output": {
"default": True,
"help": "if true, will remove empty values from the json output",
},
}
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
url.replace("instagr.com", "instagram.com").replace(
"instagr.am", "instagram.com"
)
insta_matches = self.global_pattern.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0])!=3: return
if len(insta_matches) > 1:
logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return
if len(insta_matches) > 1:
logger.warning(
f"Multiple instagram matches found in {url=}, using the first one"
)
return
g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
if g1 == "": return self.download_profile(item, g2)
elif g1 == "p": return self.download_post(item, g2, context="post")
elif g1 == "reel": return self.download_post(item, g2, context="reel")
elif g1 == "stories/highlights": return self.download_highlights(item, g2)
elif g1 == "stories":
if len(g3): return self.download_post(item, id=g3, context="story")
if g1 == "":
return self.download_profile(item, g2)
elif g1 == "p":
return self.download_post(item, g2, context="post")
elif g1 == "reel":
return self.download_post(item, g2, context="reel")
elif g1 == "stories/highlights":
return self.download_highlights(item, g2)
elif g1 == "stories":
if len(g3):
return self.download_post(item, id=g3, context="story")
return self.download_stories(item, g2)
else:
else:
logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
return
@retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
def call_api(self, path: str, params: dict) -> dict:
headers = {
"accept": "application/json",
"x-access-key": self.access_token
}
headers = {"accept": "application/json", "x-access-key": self.access_token}
logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
return requests.get(
f"{self.api_endpoint}/{path}", headers=headers, params=params
).json()
def cleanup_dict(self, d: dict | list) -> dict:
# repeats 3 times to remove nested empty values
if not self.minimize_json_output: return d
if type(d) == list: return [self.cleanup_dict(v) for v in d]
if type(d) != dict: return d
if not self.minimize_json_output:
return d
if type(d) == list:
return [self.cleanup_dict(v) for v in d]
if type(d) != dict:
return d
return {
k: clean_v
for k, v in d.items()
if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and
k not in ["x", "y", "width", "height"]
k: clean_v
for k, v in d.items()
if (clean_v := self.cleanup_dict(v))
not in [0.0, 0, [], {}, "", None, "null"]
and k not in ["x", "y", "width", "height"]
}
def download_profile(self, result: Metadata, username: str) -> Metadata:
@@ -125,7 +153,9 @@ class InstagramAPIArchiver(Archiver):
try:
self.download_all_tagged(result, user_id)
except Exception as e:
result.append("errors", f"Error downloading tagged posts for {username}")
result.append(
"errors", f"Error downloading tagged posts for {username}"
)
logger.error(f"Error downloading tagged posts for {username}: {e}")
# download all highlights
@@ -135,26 +165,37 @@ class InstagramAPIArchiver(Archiver):
result.append("errors", f"Error downloading highlights for {username}")
logger.error(f"Error downloading highlights for {username}: {e}")
result.set_url(url) # reset as scrape_item modifies it
result.set_url(url) # reset as scrape_item modifies it
return result.success("insta profile")
def download_all_highlights(self, result, username, user_id):
count_highlights = 0
highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
for h in highlights:
try:
try:
h_info = self._download_highlights_reusable(result, h.get("pk"))
count_highlights += len(h_info.get("items", []))
except Exception as e:
result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
result.append(
"errors",
f"Error downloading highlight id{h.get('pk')} for {username}",
)
logger.error(
f"Error downloading highlight id{h.get('pk')} for {username}: {e}"
)
if (
self.full_profile_max_posts
and count_highlights >= self.full_profile_max_posts
):
logger.info(
f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}"
)
break
result.set("#highlights", count_highlights)
def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
def download_post(
self, result: Metadata, code: str = None, id: str = None, context: str = None
) -> Metadata:
if id:
post = self.call_api(f"v1/media/by/id", {"id": id})
else:
@@ -166,7 +207,8 @@ class InstagramAPIArchiver(Archiver):
post = self.scrape_item(result, post, context)
if post.get("taken_at"): result.set_timestamp(post.get("taken_at"))
if post.get("taken_at"):
result.set_timestamp(post.get("taken_at"))
return result.success(f"insta {context or 'post'}")
def download_highlights(self, result: Metadata, id: str) -> Metadata:
@@ -175,96 +217,127 @@ class InstagramAPIArchiver(Archiver):
del h_info["items"]
result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items)
return result.success("insta highlights")
def _download_highlights_reusable(self, result: Metadata, id: str) ->dict:
def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
full_h = self.call_api(f"v2/highlight/by/id", {"id": id})
h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
assert h_info, f"Highlight {id} not found: {full_h=}"
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
if (
cover_media := h_info.get("cover_media", {})
.get("cropped_image_version", {})
.get("url")
):
filename = self.download_from_url(cover_media)
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
items = h_info.get("items", [])[::-1] # newest to oldest
items = h_info.get("items", [])[::-1] # newest to oldest
for h in tqdm(items, desc="downloading highlights", unit="highlight"):
try: self.scrape_item(result, h, "highlight")
try:
self.scrape_item(result, h, "highlight")
except Exception as e:
result.append("errors", f"Error downloading highlight {h.get('id')}")
logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")
logger.error(
f"Error downloading highlight, skipping {h.get('id')}: {e}"
)
return h_info
def download_stories(self, result: Metadata, username: str) -> Metadata:
now = datetime.now().strftime("%Y-%m-%d_%H-%M")
stories = self._download_stories_reusable(result, username)
if stories == []: return result.success("insta no story")
if stories == []:
return result.success("insta no story")
result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
return result.success(f"insta stories {now}")
def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
if not stories or not len(stories): return []
stories = stories[::-1] # newest to oldest
if not stories or not len(stories):
return []
stories = stories[::-1] # newest to oldest
for s in tqdm(stories, desc="downloading stories", unit="story"):
try: self.scrape_item(result, s, "story")
try:
self.scrape_item(result, s, "story")
except Exception as e:
result.append("errors", f"Error downloading story {s.get('id')}")
logger.error(f"Error downloading story, skipping {s.get('id')}: {e}")
return stories
def download_all_posts(self, result: Metadata, user_id: str):
end_cursor = None
pbar = tqdm(desc="downloading posts")
post_count = 0
while end_cursor != "":
posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
if not len(posts) or not type(posts) == list or len(posts) != 2: break
posts = self.call_api(
f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}
)
if not len(posts) or not type(posts) == list or len(posts) != 2:
break
posts, end_cursor = posts[0], posts[1]
logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")
for p in posts:
try: self.scrape_item(result, p, "post")
try:
self.scrape_item(result, p, "post")
except Exception as e:
result.append("errors", f"Error downloading post {p.get('id')}")
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
pbar.update(1)
post_count+=1
if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
post_count += 1
if (
self.full_profile_max_posts
and post_count >= self.full_profile_max_posts
):
logger.info(
f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}"
)
break
result.set("#posts", post_count)
def download_all_tagged(self, result: Metadata, user_id: str):
next_page_id = ""
pbar = tqdm(desc="downloading tagged posts")
tagged_count = 0
while next_page_id != None:
resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
resp = self.call_api(
f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}
)
posts = resp.get("response", {}).get("items", [])
if not len(posts): break
if not len(posts):
break
next_page_id = resp.get("next_page_id")
logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")
for p in posts:
try: self.scrape_item(result, p, "tagged")
try:
self.scrape_item(result, p, "tagged")
except Exception as e:
result.append("errors", f"Error downloading tagged post {p.get('id')}")
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
result.append(
"errors", f"Error downloading tagged post {p.get('id')}"
)
logger.error(
f"Error downloading tagged post, skipping {p.get('id')}: {e}"
)
pbar.update(1)
tagged_count+=1
if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
tagged_count += 1
if (
self.full_profile_max_posts
and tagged_count >= self.full_profile_max_posts
):
logger.info(
f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}"
)
break
result.set("#tagged", tagged_count)
### reusable parsing utils below
### reusable parsing utils below
def scrape_item(self, result:Metadata, item:dict, context:str=None) -> dict:
def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict:
"""
receives a Metadata and an API dict response
fetches the media and adds it to the Metadata
@@ -272,23 +345,25 @@ class InstagramAPIArchiver(Archiver):
context can be used to give specific id prefixes to media
"""
if "clips_metadata" in item:
if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
if reusable_text := item.get("clips_metadata", {}).get(
"reusable_text_attribute_string"
):
item["clips_metadata_text"] = reusable_text
if self.minimize_json_output:
if self.minimize_json_output:
del item["clips_metadata"]
if code := item.get("code") and not result.get("url"):
if code := item.get("code") and not result.get("url"):
result.set_url(f"https://www.instagram.com/p/{code}/")
resources = item.get("resources", item.get("carousel_media", []))
item, media, media_id = self.scrape_media(item, context)
# if resources are present take the main media from the first resource
if not media and len(resources):
_, media, media_id = self.scrape_media(resources[0], context)
resources = resources[1:]
assert media, f"Image/video not found in {item=}"
# posts with multiple items contain a resources list
resources_metadata = Metadata()
for r in resources:
@@ -298,40 +373,54 @@ class InstagramAPIArchiver(Archiver):
result.add_media(media, id=media_id)
return item
def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]:
def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]:
# remove unnecessary info
if self.minimize_json_output:
for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]:
if k in item: del item[k]
if self.minimize_json_output:
for k in [
"image_versions",
"video_versions",
"video_dash_manifest",
"image_versions2",
"video_versions2",
]:
if k in item:
del item[k]
item = self.cleanup_dict(item)
image_media = None
if image_url := item.get("thumbnail_url"):
filename = self.download_from_url(image_url, verbose=False)
image_media = Media(filename=filename)
# retrieve video info
best_id = item.get('id', item.get('pk'))
best_id = item.get("id", item.get("pk"))
taken_at = item.get("taken_at", item.get("taken_at_ts"))
code = item.get("code")
caption_text = item.get("caption_text")
if "carousel_media" in item: del item["carousel_media"]
if "carousel_media" in item:
del item["carousel_media"]
if video_url := item.get("video_url"):
filename = self.download_from_url(video_url, verbose=False)
video_media = Media(filename=filename)
if taken_at: video_media.set("date", taken_at)
if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text: video_media.set("text", caption_text)
if taken_at:
video_media.set("date", taken_at)
if code:
video_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text:
video_media.set("text", caption_text)
video_media.set("preview", [image_media])
video_media.set("data", [item])
return item, video_media, f"{context or 'video'} {best_id}"
elif image_media:
if taken_at: image_media.set("date", taken_at)
if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text: image_media.set("text", caption_text)
if taken_at:
image_media.set("date", taken_at)
if code:
image_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text:
image_media.set("text", caption_text)
image_media.set("data", [item])
return item, image_media, f"{context or 'image'} {best_id}"
return item, None, None
return item, None, None

View file

@@ -0,0 +1,33 @@
{
"name": "Instagram Archiver",
"type": ["extractor"],
"entry_point": "instagram_archiver:InstagramArchiver",
"depends": ["core"],
"external_dependencies": {
"python": ["instaloader",
"loguru",],
},
"no_setup_required": False,
"configs": {
"username": {"default": None, "help": "a valid Instagram username"},
"password": {
"default": None,
"help": "the corresponding Instagram account password",
},
"download_folder": {
"default": "instaloader",
"help": "name of a folder to temporarily download content to",
},
"session_file": {
"default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials",
},
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts. Authentication is required via username/password or a session file.
""",
}
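
The description above requires either username/password or a session file; as a hedged illustration, one way to pre-create the session file named in the `session_file` default is with the instaloader library listed in `external_dependencies` (credentials are placeholders, and this is not part of the module itself):

import instaloader

USERNAME = "your_instagram_username"   # placeholder
PASSWORD = "your_instagram_password"   # placeholder

L = instaloader.Instaloader()
L.login(USERNAME, PASSWORD)            # may additionally prompt for two-factor authentication
L.save_session_to_file("secrets/instaloader.session")  # matches the manifest's default path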

View file

@@ -7,9 +7,9 @@ import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from . import Archiver
from ..core import Metadata
from ..core import Media
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata
from auto_archiver.core import Media
class InstagramArchiver(Archiver):
"""

View file

@@ -0,0 +1,35 @@
{
"name": "Instagram Telegram Bot Archiver",
"type": ["extractor"],
"entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
"depends": ["core", "utils"],
"external_dependencies": {"python": ["loguru",
"telethon",],
},
"requires_setup": True,
"configs": {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
returned as part of a `Metadata` object.
### Features
- Supports archiving Instagram posts and stories through the Telegram bot.
- Downloads and saves media files (e.g., images, videos) in a temporary directory.
- Captures and returns metadata, including titles and descriptions, as a `Metadata` object.
- Automatically manages Telegram session files for secure access.
### Setup
To use the `InstagramTbotArchiver`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.
""",
}
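
As a rough sketch of the flow the description outlines (send an Instagram URL to the `instagram_load_bot` bot over Telethon, then collect whatever media it returns), assuming valid API credentials and session file; this is illustrative, not the module's code:

from telethon.sync import TelegramClient

API_ID = 12345                  # placeholder, from https://my.telegram.org/apps
API_HASH = "your_api_hash"      # placeholder

with TelegramClient("secrets/anon-insta", API_ID, API_HASH) as client:
    client.send_message("instagram_load_bot", "https://www.instagram.com/p/SHORTCODE/")
    # The real module waits up to the configured `timeout` before reading replies; omitted here.
    for message in client.iter_messages("instagram_load_bot", limit=10):
        if message.media:
            message.download_media(file="tmp/")  # save any returned photos/videos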

View file

@@ -7,14 +7,17 @@ relevant media and metadata. The fetched content is saved as `Media` objects in
`Metadata` object.
"""
import os
import shutil
from telethon.sync import TelegramClient
from loguru import logger
import time, os
import time
from sqlite3 import OperationalError
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
from ..utils import random_str
from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class InstagramTbotArchiver(Archiver):

View file

@@ -0,0 +1,26 @@
{
"name": "Telegram Archiver",
"type": ["extractor"],
"entry_point": "telegram_archiver:TelegramArchiver",
"requires_setup": False,
"depends": ["core"],
"external_dependencies": {
"python": [
"requests",
"bs4",
"loguru",
],
},
"description": """
The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.
### Features
- Extracts images and videos from public Telegram message links (`t.me`).
- Processes HTML content of messages to retrieve embedded media.
- Sets structured metadata, including timestamps, content, and media details.
- Does not require user authentication for Telegram.
""",
}
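
A simplified sketch of the approach the description outlines, using the requests and bs4 dependencies; the `?embed=1` query parameter and the idea of pulling `img`/`video` tags from the page are assumptions about Telegram's public embed markup, not the module's exact logic:

import requests
from bs4 import BeautifulSoup

url = "https://t.me/CHANNEL/123"                   # placeholder public message link
resp = requests.get(url, params={"embed": "1"})    # assumed embed view exposing the media markup
soup = BeautifulSoup(resp.text, "html.parser")

image_urls = [img.get("src") for img in soup.find_all("img") if img.get("src")]
video_urls = [vid.get("src") for vid in soup.find_all("video") if vid.get("src")]
print(image_urls, video_urls)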

View file

@@ -2,13 +2,14 @@ import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from . import Archiver
from ..core import Metadata, Media
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata, Media
class TelegramArchiver(Archiver):
"""
Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
Archiver for telegram that does not require login, but the telethon_archiver is much more advised,
will only return if at least one image or one video is found
"""
name = "telegram_archiver"

View file

@@ -0,0 +1,48 @@
# TODO rm dependency on json
{
"name": "telethon_archiver",
"type": ["extractor"],
"entry_point": "telethon_archiver:TelethonArchiver",
"requires_setup": True,
"depends": [""],
"external_dependencies": {
"python": ["telethon",
"loguru",
"tqdm",
],
"bin": [""]
},
"configs": {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
# TODO
#"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
}
},
"description": """
The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups.
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
if provided in the configuration.
### Features
- Fetches posts and metadata from Telegram channels and groups, including private channels.
- Downloads media attachments (e.g., images, videos, audio) from individual posts or grouped posts.
- Handles channel invites to join channels dynamically during setup.
- Utilizes Telethon's capabilities for reliable Telegram interactions.
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TelethonArchiver`, you must configure the following:
- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps).
- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`).
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
"""
}
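
The description says channels listed in `channel_invites` are joined during setup; a minimal Telethon sketch of that single operation (credentials and the invite hash are placeholders, and this is not the module's code):

from telethon.sync import TelegramClient
from telethon.tl.functions.messages import ImportChatInviteRequest

API_ID = 12345                # placeholder, from https://my.telegram.org/apps
API_HASH = "your_api_hash"    # placeholder

with TelegramClient("secrets/anon", API_ID, API_HASH) as client:
    invite_hash = "HASH"      # the part after t.me/+ or t.me/joinchat/
    client(ImportChatInviteRequest(invite_hash))  # may raise if the account is already a member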

View file

@@ -8,9 +8,9 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
from ..utils import random_str
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class TelethonArchiver(Archiver):

View file

@@ -0,0 +1,45 @@
{
"name": "Twitter API Archiver",
"type": ["extractor"],
"entry_point": "twitter_api_archiver:TwitterApiArchiver",
"requires_setup": True,
"depends": ["core"],
"external_dependencies": {
"python": ["requests",
"loguru",
"pytwitter",
"slugify",],
"bin": [""]
},
"configs": {
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
"access_token": {"default": None, "help": "twitter API access_token"},
"access_secret": {"default": None, "help": "twitter API access_secret"},
},
"description": """
The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API.
It supports multiple API configurations for extended rate limits and reliable access.
Features include URL expansion, media downloads (e.g., images, videos), and structured output
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
or consumer key/secret and access token/secret.
### Features
- Fetches tweets and their metadata, including text, creation timestamp, and author information.
- Downloads media attachments (e.g., images, videos) in high quality.
- Supports multiple API configurations for improved rate limiting.
- Expands shortened URLs (e.g., `t.co` links).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration:
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
- **Consumer Key and Secret**: Required for user-authenticated API access.
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
"""
,
}
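
The `cli_set` hook on `bearer_tokens` above shows how a comma-separated value from the command line is normalised; a quick worked example with placeholder tokens:

# same lambda as in the bearer_tokens config entry
cli_set = lambda cli_val, cur_val: list(set(cli_val.split(",")))

print(cli_set("TOKEN_A,TOKEN_B,TOKEN_A", []))
# -> ['TOKEN_A', 'TOKEN_B'] in some order: duplicates are dropped, ordering is not preserved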

View file

@@ -8,8 +8,8 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from . import Archiver
from ..core import Metadata,Media
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata,Media
class TwitterApiArchiver(Archiver):
name = "twitter_api_archiver"

View file

@@ -0,0 +1,37 @@
{
"name": "VKontakte Archiver",
"type": ["extractor"],
"entry_point": "vk_archiver:VKArchiver",
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
"python": ["loguru",
"vk_url_scraper"],
},
"configs": {
"username": {"default": None, "help": "valid VKontakte username"},
"password": {"default": None, "help": "valid VKontakte password"},
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
},
"description": """
The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.
### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
"""
,
}
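
A rough sketch of the VkScraper flow the description refers to, based on the vk_url_scraper library's documented usage; credentials, the URL, and the download folder are placeholders, the exact method signatures should be checked against the library, and this is not the module's code:

from vk_url_scraper import VkScraper

vks = VkScraper("vk_username", "vk_password")     # placeholders; a session file is created/reused on login
payloads = vks.scrape("https://vk.com/wall-1_1")  # placeholder /wall post URL; returns parsed post data
vks.download_media(payloads, "./tmp")             # save any attached photos/videos locally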

View file

@@ -1,9 +1,9 @@
from loguru import logger
from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
from auto_archiver.utils.misc import dump_payload
from auto_archiver.archivers import Archiver
from auto_archiver.core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver):