auto-archiver/src/auto_archiver/archivers/instagram_api_archiver.py

import re, requests
from datetime import datetime
from loguru import logger
from retrying import retry
from tqdm import tqdm
from . import Archiver
from ..core import Metadata
from ..core import Media

class InstagramAPIArchiver(Archiver):
    """
    Uses an instagrapi (https://github.com/subzeroid/instagrapi) API deployment to fetch Instagram post data.
    # TODO: improvement collect aggregates of locations[0].location and mentions for all posts
    """
    name = "instagram_api_archiver"
    global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?")
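
    # Illustrative examples of how the pattern's three groups map onto URL shapes
    # (g1/g2/g3 follow the unpacking in download() below):
    #   https://www.instagram.com/some_user/                 -> g1="",                   g2="some_user", g3=""
    #   https://www.instagram.com/p/SHORTCODE/               -> g1="p",                  g2="SHORTCODE", g3=""
    #   https://www.instagram.com/reel/SHORTCODE/            -> g1="reel",               g2="SHORTCODE", g3=""
    #   https://www.instagram.com/stories/some_user/         -> g1="stories",            g2="some_user", g3=""
    #   https://www.instagram.com/stories/some_user/123456/  -> g1="stories",            g2="some_user", g3="123456"
    #   https://www.instagram.com/stories/highlights/123456/ -> g1="stories/highlights", g2="123456",    g3=""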

    def __init__(self, config: dict) -> None:
        super().__init__(config)
        self.assert_valid_string("access_token")
        self.assert_valid_string("api_endpoint")
        if self.api_endpoint.endswith("/"):
            self.api_endpoint = self.api_endpoint[:-1]
        self.full_profile = bool(self.full_profile)
        self.minimize_json_output = bool(self.minimize_json_output)

    @staticmethod
    def configs() -> dict:
        return {
            "access_token": {"default": None, "help": "a valid instagrapi-api token"},
            "api_endpoint": {"default": None, "help": "API endpoint to use"},
            "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile; if false, will only download the profile picture and information."},
            "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
        }
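
    # Example orchestration config for this archiver (illustrative only; the exact
    # YAML layout depends on how the orchestrator maps configs() keys to modules):
    #
    #   instagram_api_archiver:
    #     access_token: "<instagrapi-api token>"
    #     api_endpoint: "https://your-instagrapi-deployment.example.com"
    #     full_profile: false
    #     minimize_json_output: true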

    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
        # normalize short domains to instagram.com (str.replace returns a new string,
        # so the result must be assigned back)
        url = url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")

        insta_matches = self.global_pattern.findall(url)
        logger.info(f"{insta_matches=}")
        if not len(insta_matches) or len(insta_matches[0]) != 3: return
        if len(insta_matches) > 1:
            logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
        g1, g2, g3 = insta_matches[0]
        if g1 == "": return self.download_profile(item, g2)
        elif g1 == "p": return self.download_post(item, g2, context="post")
        elif g1 == "reel": return self.download_post(item, g2, context="reel")
        elif g1 == "stories/highlights": return self.download_highlights(item, g2)
        elif g1 == "stories":
            if len(g3): return self.download_post(item, id=g3, context="story")
            return self.download_stories(item, g2)
        else:
            logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
            return

    @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
    def call_api(self, path: str, params: dict) -> dict:
        headers = {
            "accept": "application/json",
            "x-access-key": self.access_token
        }
        logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
        return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
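
    # Note on retry semantics: the retrying decorator above re-invokes call_api on
    # any raised exception, waiting a random 1-3s between attempts and giving up
    # after 5 tries. A non-JSON response body makes .json() raise, which therefore
    # also triggers a retry; HTTP error statuses that still return JSON do not.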

    def cleanup_dict(self, d: dict | list) -> dict:
        # recursively removes empty values and layout-only keys from the API response
        if not self.minimize_json_output: return d
        if isinstance(d, list): return [self.cleanup_dict(v) for v in d]
        if not isinstance(d, dict): return d
        return {
            k: self.cleanup_dict(v) if isinstance(v, (dict, list)) else v
            for k, v in d.items()
            if v not in [0.0, 0, [], {}, "", None, "null"] and
            k not in ["x", "y", "width", "height"]
        }
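
    # Worked example of the cleanup (values are made up for illustration):
    #   cleanup_dict({"views": 0, "caption": "", "owner": {"pk": 1, "bio": ""}, "x": 12})
    #   -> {"owner": {"pk": 1}}
    # Two subtleties: the emptiness check runs on values *before* recursion, so a dict
    # that only becomes empty after cleaning is kept; and since False == 0 in Python,
    # boolean False values are dropped as well.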

    def download_profile(self, result: Metadata, username: str) -> Metadata:
        # download basic profile info
        url = result.get_url()
        user = self.call_api("v2/user/by/username", {"username": username}).get("user")
        assert user, f"User {username} not found"
        user = self.cleanup_dict(user)

        result.set_title(user.get("full_name", username)).set("data", user)
        if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
            filename = self.download_from_url(pic_url)
            result.add_media(Media(filename=filename), id="profile_picture")

        if self.full_profile:
            user_id = user.get("pk")
            # download all posts
            self.download_all_posts(result, user_id)
            # download all stories
            try:
                stories = self._download_stories_reusable(result, username)
                result.set("#stories", len(stories))
            except Exception as e:
                result.append("errors", f"Error downloading stories for {username}")
                logger.error(f"Error downloading stories for {username}: {e}")
            # download all highlights
            try:
                count_highlights = 0
                highlights = self.call_api("v1/user/highlights", {"user_id": user_id})
                for h in highlights:
                    try:
                        h_info = self._download_highlights_reusable(result, h.get("pk"))
                        count_highlights += len(h_info.get("items", []))
                    except Exception as e:
                        result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
                        logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
                result.set("#highlights", count_highlights)
            except Exception as e:
                result.append("errors", f"Error downloading highlights for {username}")
                logger.error(f"Error downloading highlights for {username}: {e}")

        result.set_url(url)  # reset as scrape_item modifies it
        return result.success("insta profile")

    def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
        if id:
            post = self.call_api("v1/media/by/id", {"id": id})
        else:
            post = self.call_api("v1/media/by/code", {"code": code})
        assert post, f"Post {id or code} not found"

        if caption_text := post.get("caption_text"):
            result.set_title(caption_text)

        post = self.scrape_item(result, post, context)
        if post.get("taken_at"): result.set_timestamp(post.get("taken_at"))
        return result.success(f"insta {context or 'post'}")

    def download_highlights(self, result: Metadata, id: str) -> Metadata:
        h_info = self._download_highlights_reusable(result, id)
        # pop instead of del so a response without "items" does not raise KeyError
        items = len(h_info.pop("items", []))
        result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items)
        return result.success("insta highlights")

    def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
        full_h = self.call_api("v2/highlight/by/id", {"id": id})
        h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
        assert h_info, f"Highlight {id} not found: {full_h=}"

        if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
            filename = self.download_from_url(cover_media)
            result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")

        items = h_info.get("items", [])[::-1]  # newest to oldest
        for h in tqdm(items, desc="downloading highlights", unit="highlight"):
            try:
                self.scrape_item(result, h, "highlight")
            except Exception as e:
                result.append("errors", f"Error downloading highlight {h.get('id')}")
                logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")

        return h_info

    def download_stories(self, result: Metadata, username: str) -> Metadata:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M")
        stories = self._download_stories_reusable(result, username)
        result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
        return result.success(f"insta stories {now}")

    def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
        stories = self.call_api("v1/user/stories/by/username", {"username": username})
        assert stories, f"Stories for {username} not found"
        stories = stories[::-1]  # newest to oldest

        for s in tqdm(stories, desc="downloading stories", unit="story"):
            try:
                self.scrape_item(result, s, "story")
            except Exception as e:
                result.append("errors", f"Error downloading story {s.get('id')}")
                logger.error(f"Error downloading story, skipping {s.get('id')}: {e}")

        return stories

    def download_all_posts(self, result: Metadata, user_id: str):
        end_cursor = None
        pbar = tqdm(desc="downloading posts")

        post_count = 0
        while end_cursor != "":
            posts = self.call_api("v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
            if not len(posts): break
            posts, end_cursor = posts[0], posts[1]  # the endpoint returns a [posts, next_cursor] pair
            logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")

            for p in posts:
                try:
                    self.scrape_item(result, p, "post")
                except Exception as e:
                    result.append("errors", f"Error downloading post {p.get('id')}")
                    logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                pbar.update(1)
                post_count += 1

        pbar.close()  # finish the progress bar once all chunks are consumed
        result.set("#posts", post_count)

    ### reusable parsing utils below

    def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict:
        """
        receives a Metadata and an API dict response
        fetches the media and adds it to the Metadata
        cleans and returns the API dict
        context can be used to give specific id prefixes to media
        """
        if "clips_metadata" in item:
            if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
                item["clips_metadata_text"] = reusable_text
            if self.minimize_json_output:
                del item["clips_metadata"]

        if code := item.get("code"):
            result.set("url", f"https://www.instagram.com/p/{code}/")

        resources = item.get("resources", [])
        item, media, media_id = self.scrape_media(item, context)
        # if resources are present take the main media from the first resource
        if not media and len(resources):
            _, media, media_id = self.scrape_media(resources[0], context)
            resources = resources[1:]

        assert media, f"Image/video not found in {item=}"

        # posts with multiple items contain a resources list
        resources_metadata = Metadata()
        for r in resources:
            self.scrape_item(resources_metadata, r)
        if not resources_metadata.is_empty():
            media.set("other media", resources_metadata.media)

        result.add_media(media, id=media_id)
        return item
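
    # Carousel note: for multi-item posts the main Media comes from the item itself
    # (or, failing that, from the first entry in "resources"); the remaining resources
    # are scraped into a throwaway Metadata and attached under the "other media" key.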

    def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]:
        # remove unnecessary info
        if self.minimize_json_output:
            for k in ["image_versions", "video_versions", "video_dash_manifest"]:
                if k in item: del item[k]
        item = self.cleanup_dict(item)

        image_media = None
        if image_url := item.get("thumbnail_url"):
            filename = self.download_from_url(image_url, verbose=False)
            image_media = Media(filename=filename)

        # retrieve video info
        best_id = item.get('id', item.get('pk'))
        taken_at = item.get("taken_at")
        code = item.get("code")
        if video_url := item.get("video_url"):
            filename = self.download_from_url(video_url, verbose=False)
            video_media = Media(filename=filename)
            if taken_at: video_media.set("date", taken_at)
            if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
            if image_media: video_media.set("preview", [image_media])  # guard avoids a [None] preview when there is no thumbnail
            video_media.set("data", [item])
            return item, video_media, f"{context or 'video'} {best_id}"
        elif image_media:
            if taken_at: image_media.set("date", taken_at)
            if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
            image_media.set("data", [item])
            return item, image_media, f"{context or 'image'} {best_id}"
        return item, None, None
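
# Minimal usage sketch (illustrative; in practice the auto-archiver orchestrator
# instantiates archivers from its config, and the exact config shape consumed by
# Archiver.__init__ is an assumption here):
#
#   from auto_archiver.core import Metadata
#
#   archiver = InstagramAPIArchiver({
#       "access_token": "<instagrapi-api token>",
#       "api_endpoint": "https://instagrapi.example.com",
#       "full_profile": False,
#       "minimize_json_output": True,
#   })
#   item = Metadata()
#   item.set_url("https://www.instagram.com/p/SOME_CODE/")
#   result = archiver.download(item)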