auto-archiver/archivers/youtubedl_archiver.py


import os
import datetime
import yt_dlp
from loguru import logger

from .base_archiver import Archiver, ArchiveResult
from storages import Storage

class YoutubeDLArchiver(Archiver):
    name = "youtube_dl"
    ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

    def download(self, url, check_if_exists=False):
        netloc = self.get_netloc(url)
        if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'):
            logger.info('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')

        ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except yt_dlp.utils.DownloadError:
            # no video here
            return False

        if info.get('is_live', False):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            if 'entries' in info:
                if len(info['entries']) > 1:
                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
                    return False
                elif len(info['entries']) == 0:
                    logger.warning(
                        'YoutubeDLArchiver succeeded but did not find video')
                    return False

                filename = ydl.prepare_filename(info['entries'][0])
            else:
                filename = ydl.prepare_filename(info)

            key = self.get_key(filename)

            if self.storage.exists(key):
                status = 'already archived'
                cdn_url = self.storage.get_cdn_url(key)

        # sometimes this results in a different filename, so do this again
        info = ydl.extract_info(url, download=True)

        # TODO: add support for multiple videos
        if 'entries' in info:
            if len(info['entries']) > 1:
                logger.warning(
                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
                return False
            else:
                info = info['entries'][0]

        filename = ydl.prepare_filename(info)

        if not os.path.exists(filename):
            filename = filename.split('.')[0] + '.mkv'

        if status != 'already archived':
            key = self.get_key(filename)
            cdn_url = self.storage.get_cdn_url(key)

            self.storage.upload(filename, key)

        hash = self.get_hash(filename)
        screenshot = self.get_screenshot(url)

        # get duration
        duration = info.get('duration')

        # get thumbnails
        try:
            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        except:
            key_thumb = ''
            thumb_index = 'Could not generate thumbnails'

        os.remove(filename)

        timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
            if 'timestamp' in info else \
                datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
            if 'upload_date' in info and info['upload_date'] is not None else \
                None

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)