auto-archiver/src/auto_archiver/archivers/telethon_archiver.py


import shutil
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
import re, time, json, os

from . import Archiver
from ..core import Metadata, Media, ArchivingContext
from ..utils import random_str


class TelethonArchiver(Archiver):
    """
    Archiver for Telegram posts using a Telethon client session.

    `setup` copies the configured session file so this instance works on its own
    copy, logs in, and optionally joins the channels listed in `channel_invites`.
    `download` fetches a single post (plus the other media posts of the same
    grouped upload) and `cleanup` removes the session copy again.
    """
    name = "telethon_archiver"
    # post URLs: group(1) is "/c" for private channels, group(2) the channel, group(3) the post id
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
    # invite links (t.me/joinchat/HASH or t.me/+HASH): group(2) is the invite hash
    invite_pattern = re.compile(r"t\.me(\/joinchat){0,1}\/\+?(.+)")

    def __init__(self, config: dict) -> None:
        """Initialise from `config` and require the `api_id` and `api_hash` settings."""
        super().__init__(config)
        self.assert_valid_string("api_id")
        self.assert_valid_string("api_hash")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this archiver with their defaults and help texts."""
        return {
            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
            "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
            "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
            "channel_invites": {
                "default": {},
                "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
                # CLI values are JSON and get merged on top of the current value
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            }
        }

    def setup(self) -> None:
        """
        1. makes a copy of session_file that is removed in cleanup
        2. trigger login process for telegram or proceed if already saved in a session file
        3. joins channel_invites where needed
        """
        logger.info(f"SETUP {self.name} checking login...")

        # make a copy of the session that is used exclusively with this archiver instance
        new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
        shutil.copy(self.session_file + ".session", new_session_file)
        # session_file is kept WITHOUT the '.session' extension (it gets appended again on use)
        self.session_file = new_session_file.replace(".session", "")

        # initiate the client
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)

        with self.client.start():
            logger.success(f"SETUP {self.name} login works.")

        if self.join_channels and len(self.channel_invites):
            logger.info(f"SETUP {self.name} joining channels...")
            with self.client.start():
                # get currently joined channels
                # https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog
                joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel]
                logger.info(f"already part of {len(joined_channel_ids)} channels")

                # the context manager guarantees the progress bar is closed, even on errors
                with tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites)) as pbar:
                    i = 0
                    while i < len(self.channel_invites):
                        try:
                            self._ensure_joined(self.channel_invites[i])
                        except FloodWaitError as e:
                            logger.warning(f"got a flood error, need to wait {e.seconds} seconds")
                            time.sleep(e.seconds)
                            continue  # retry the same invite once the wait is over
                        i += 1
                        pbar.update()

    def _ensure_joined(self, channel_invite: dict) -> None:
        """
        Joins the channel described by `channel_invite` unless the account can already resolve it.

        `channel_invite` must have an "invite" key and may have an "id" key.
        A FloodWaitError raised while resolving the channel is propagated so the
        caller can wait and retry; errors raised while joining are only logged.
        """
        channel_id = channel_invite.get("id", False)
        invite = channel_invite["invite"]
        match = self.invite_pattern.search(invite)
        if not match:
            logger.warning(f"Invalid invite link {invite}")
            return

        try:
            if channel_id:
                self.client.get_entity(int(channel_id))  # fails if not a member
            else:
                ent = self.client.get_entity(invite)  # fails if not a member
                logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.")
            return
        except ValueError:
            # the entity could not be resolved: attempt to join through the invite hash
            logger.info(f"joining new channel {invite=}")

        try:
            self.client(ImportChatInviteRequest(match.group(2)))
        except UserAlreadyParticipantError:
            logger.info(f"already joined {invite=}")
        except InviteRequestSentError:
            logger.warning(f"already sent a join request with {invite} still no answer")
        except InviteHashExpiredError:
            logger.warning(f"{invite=} has expired please find a more recent one")
        except Exception as e:
            # best-effort: one failing invite must not abort the setup of the others
            logger.error(f"could not join channel with {invite=} due to {e}")

    def cleanup(self) -> None:
        """Removes the session copy that `setup` created for this instance."""
        logger.info(f"CLEANUP {self.name}.")
        # self.session_file holds the name without extension, the file on disk ends in '.session'
        session_file_name = self.session_file + ".session"
        if os.path.exists(session_file_name):
            os.remove(session_file_name)

    def download(self, item: Metadata) -> Metadata:
        """
        if this url is archivable will download post info and look for other posts from the same group with media.
        can handle private/public channels

        Returns False when the url does not match `link_pattern`, when the post cannot be
        fetched or does not exist; otherwise a new Metadata marked as successful with the
        downloaded media, title, timestamp and raw post data ("api_data").
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        match = self.link_pattern.search(url)
        logger.debug(f"TELETHON: {match=}")
        if not match: return False

        is_private = match.group(1) == "/c"
        # private channels are addressed by numeric id, public ones by name
        chat = int(match.group(2)) if is_private else match.group(2)
        post_id = int(match.group(3))

        result = Metadata()

        # NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token)
        with self.client.start():
            try:
                post = self.client.get_messages(chat, ids=post_id)
            except ValueError as e:
                logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
                return False
            except ChannelInvalidError as e:
                logger.error(f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}")
                return False

            logger.debug(f"TELETHON GOT POST {post=}")
            if post is None: return False

            media_posts = self._get_media_posts_in_group(chat, post)
            logger.debug(f'got {len(media_posts)=} for {url=}')

            tmp_dir = ArchivingContext.get_tmp_dir()

            group_id = post.grouped_id if post.grouped_id is not None else post.id
            title = post.message
            for mp in media_posts:
                # save the longest text found (usually only 1); a missing text counts as empty
                if len(mp.message or "") > len(title or ""): title = mp.message

                # media can also be in entities
                if mp.entities:
                    other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
                    if len(other_media_urls):
                        logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                    # NOTE(review): the filename/id below do not include mp.id, so entity urls found in
                    # several posts of the same group reuse the same names - confirm this cannot collide
                    for i, om_url in enumerate(other_media_urls):
                        filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}')
                        result.add_media(Media(filename=filename), id=f"{group_id}_{i}")

                filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
                filename = self.client.download_media(mp.media, filename_dest)
                if not filename:
                    logger.debug(f"Empty media found, skipping {str(mp)=}")
                    continue
                result.add_media(Media(filename))

            result.set_title(title).set_timestamp(post.date).set("api_data", post.to_dict())
            # when a longer text from another post became the title, keep the original text as content
            if post.message != title:
                result.set_content(post.message)
        return result.success("telethon")

    def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
        """
        Searches for Telegram posts that are part of the same group of uploads
        The search is conducted around the id of the original post with an amplitude
        of `max_amp` both ways
        Returns a list of [post] where each post has media and is in the same grouped_id
        """
        if getattr(original_post, "grouped_id", None) is None:
            # not a grouped upload: only the post itself can hold media
            return [original_post] if getattr(original_post, "media", False) else []

        search_ids = list(range(original_post.id - max_amp, original_post.id + max_amp + 1))
        posts = self.client.get_messages(chat, ids=search_ids)
        return [
            post for post in posts
            if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None
        ]