Add script to import data from desktop client json dumps. Implement #10.

2022-12-19 04:00:58 -05:00 · 2022-12-19 04:00:58 -05:00 · b8b8eff5bf
commit b8b8eff5bf
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@ -13,6 +13,10 @@ Fixed
 -----
 - Sticker pack names save correctly now

+Added
+-----
+- Add script to import data from desktop client json dumps
+
 -------------
 `0.6.4`_ - 2022-02-27
 -------------
--- a/README.rst
+++ b/README.rst
@ -39,6 +39,8 @@ Table of contents

 - `Setup`_

+- `Importing Data`_
+
 - `Fetching Stats`_

  - `counts`_
@ -191,6 +193,26 @@ you've sent a message to trigger the update).
 You can see if messages are being logged correctly by reviewing the terminal output.
 You should see a line like ``2020-06-04 02:08:39,212 - __main__ - INFO - 8``, whenever a message is logged.

+--------------
+Importing Data
+--------------
+Data can be imported from JSON dumps from the desktop client.
+Hit the three dot button from inside the desired group and select "Export chat history".
+Make sure you select JSON as the output format.
+You can also limit the exported date range, as desired.
+The database will be updated and existing messages will remain, so you can use this feature to fill in gaps when the bot was not running.
+
+To import data, simply call:
+
+.. code:: shell
+
+    $ python -m telegram_stats_bot.json_dump_parser "/some/path/to/dump.json" "postgresql://telegram:CoolPassword@localhost/telegram_bot" --tz="America/Toronto"
+
+Here, the first argument is the path to the JSON dump, the second is the db connection string, as above, and the optional ``--tz`` argument should be the time zone of the system used to dump the JSON.
+
+This can be run without stopping a running bot. However, it also attempts to set the user id to user name mapping, so it will add an extra entry for every user in the dump (this currently only affects the user stats related to user name changes).
+Before you run this, make sure your db string is correct or you might accidentally mess up other databases on the same server.
+
 --------------
 Fetching Stats
 --------------
--- a/poetry.lock
+++ b/poetry.lock
@ -59,6 +59,25 @@ category = "main"
 optional = false
 python-versions = "*"

+[[package]]
+name = "click"
+version = "8.1.3"
+description = "Composable command line interface toolkit"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+category = "main"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+
 [[package]]
 name = "cycler"
 version = "0.11.0"
@ -377,6 +396,23 @@ category = "main"
 optional = false
 python-versions = ">= 3.5"

+[[package]]
+name = "typer"
+version = "0.7.0"
+description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+click = ">=7.1.1,<9.0.0"
+
+[package.extras]
+all = ["colorama (>=0.4.3,<0.5.0)", "shellingham (>=1.3.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)"]
+dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
+doc = ["mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "pillow (>=9.3.0,<10.0.0)", "cairosvg (>=2.5.2,<3.0.0)"]
+test = ["shellingham (>=1.3.0,<2.0.0)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "coverage (>=6.2,<7.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "mypy (==0.910)", "black (>=22.3.0,<23.0.0)", "isort (>=5.0.6,<6.0.0)", "rich (>=10.11.0,<13.0.0)"]
+
 [[package]]
 name = "tzdata"
 version = "2021.5"
@ -405,7 +441,7 @@ test = ["pytest-mock (>=3.3)", "pytest (>=4.3)"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.8,<3.11"
-content-hash = "1855bdc73cff766f144e12e38f659568f877681286822ac77457ada13479afcb"
+content-hash = "f81adc14942f648ccbcb6745e57ea72d0b8290c969dd81f2fe940a02ac217edd"

 [metadata.files]
 appdirs = [
@ -442,6 +478,8 @@ certifi = [
    {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
    {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
 ]
+click = []
+colorama = []
 cycler = [
    {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"},
    {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"},
@ -883,6 +921,7 @@ tornado = [
    {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"},
    {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"},
 ]
+typer = []
 tzdata = [
    {file = "tzdata-2021.5-py2.py3-none-any.whl", hash = "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5"},
    {file = "tzdata-2021.5.tar.gz", hash = "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"},
--- a/pyproject.toml
+++ b/pyproject.toml
@ -19,6 +19,7 @@ numpy = "^1.22.0"
 matplotlib = "^3.2.1"
 appdirs = "^1.4.4"
 single-source = "^0.2.0"
+typer = "^0.7.0"

 [tool.poetry.dev-dependencies]

--- a/telegram_stats_bot/db.py
+++ b/telegram_stats_bot/db.py
@ -74,8 +74,8 @@ def init_dbs(engine: Engine):
        create table if not exists user_events
        (
            message_id bigint,
-            user_id    bigint,
-            date       timestamp with time zone,
+            user_id    text,
+            date       timestamptz,
            event      text
        );
        
--- a/telegram_stats_bot/json_dump_parser.py
+++ b/telegram_stats_bot/json_dump_parser.py
@ -23,6 +23,11 @@ import json
 import typing

 import pandas as pd
+import sqlalchemy.engine
+import typer
+from sqlalchemy import create_engine
+
+from .stats import StatsRunner

 media_dict = {'sticker': 'sticker',
              'animation': 'animation',
@ -31,8 +36,8 @@ media_dict = {'sticker': 'sticker',
              'audio_file': 'audio',
              'video_message': 'video_note'}

-user_event_cat = pd.Categorical(['left', 'joined'])
-message_type_cat = pd.Categorical(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
+user_event_cat = pd.CategoricalDtype(['left', 'joined'])
+message_type_cat = pd.CategoricalDtype(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
                                   'new_chat_members', 'left_chat_member', 'animation', 'video',
                                   'location', 'new_chat_title', 'voice', 'audio',
                                   'new_chat_photo', 'video_note', 'poll'])
@ -50,9 +55,10 @@ def text_list_parser(text: typing.Union[str, typing.Sequence]) -> str:
    return out


-def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
+def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict], dict]:
    messages_out = []
    users_out = []
+
    for message in df.itertuples():
        message_dict = {'message_id': message.id,
                        'date': message.date,
@ -71,16 +77,18 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
        user_event_dict = {}
        if message.type == 'message':
            if pd.notnull(message.from_id):
-                message_dict['from_user'] = message.from_id
+                if not message.from_id.startswith('user'):
+                    continue
+                message_dict['from_user'] = int(message.from_id[4:])  # remove 'user' from id

            if pd.notnull(message.forwarded_from):
                try:
-                    message_dict['forward_from'] = int(message.forwarded_from)
+                    message_dict['forward_from'] = int(message.from_id[4:])  # username is used in forwarded_from
                except ValueError:
                    pass

            if pd.notnull(message.reply_to_message_id):
-                message_dict['reply_to_message'] = message.reply_to_message_id
+                message_dict['reply_to_message'] = int(message.reply_to_message_id)

            if pd.notnull(message.photo):
                message_dict['type'] = 'photo'
@ -97,12 +105,11 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
                message_dict['text'] = text_list_parser(message.text)
            elif pd.notnull(message.poll):
                message_dict['type'] = 'poll'
-            elif pd.notnull(message.location_information):
-                message_dict['type'] = 'location'

        elif message.type == 'service':
            if pd.notnull(message.actor_id):
-                message_dict['from_user'] = message.actor_id
+                if message.actor_id.startswith('user'):
+                    message_dict['from_user'] = int(message.actor_id[4:])

            if message.action == 'edit_group_title':
                message_dict['type'] = 'new_chat_title'
@ -118,12 +125,12 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
                        users_out.append({'message_id': message.id,
                                          'user_id': i,
                                          'date': message.date,
-                                          'event': 'join'})
+                                          'event': 'joined'})
                except TypeError:
                    user_event_dict = {'message_id': message.id,
                                       'user_id': message.actor_id,
                                       'date': message.date,
-                                       'event': 'join'}
+                                       'event': 'joined'}
            elif message.action == 'remove_members':
                message_dict['type'] = 'left_chat_member'
                for i in message.members:
@ -136,7 +143,15 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
        messages_out.append(message_dict)
        if user_event_dict != {}:
            users_out.append(user_event_dict)
-    return messages_out, users_out
+
+    user_map = {int(i[4:]): df.loc[df['from_id'] == i, 'from'].iloc[0]
+                for i in df['from_id'].unique()
+                if (df['from_id'] == i).any() and i.startswith('user')}
+
+    # Use long name for both name and long name since we can't fetch usernames
+    user_map = {k: (v, v) for k, v in user_map.items() if v}
+
+    return messages_out, users_out, user_map


 def parse_json(path: str):
@ -144,3 +159,66 @@ def parse_json(path: str):
        js = json.load(f)
    chat = js['chats']['list'][1]['messages']
    df = pd.DataFrame(chat)
+
+
+def fix_dtypes_m(df: pd.DataFrame, tz: str) -> pd.DataFrame:
+    """
+    Convert message columns to nullable integer, time zone aware datetime and categorical dtypes.
+
+    :param df: Message rows. Must contain every column listed in ``intcols`` as well as
+               ``date`` and ``type``; a missing column raises ``KeyError``.
+    :param tz: Time zone name used to localize the ``date`` column.
+    :return: A converted copy of ``df``; the input frame is not modified.
+    """
+    intcols = ['forward_from_message_id', 'forward_from', 'forward_from_chat',
+               'from_user', 'reply_to_message']
+    df_out = df.copy()
+    # Replace whole columns instead of assigning through ``.loc[:, col]``: newer pandas versions
+    # write ``.loc[:, col] = values`` into the existing array, which keeps the old dtype and
+    # would silently undo these conversions.
+    df_out = df_out.astype({col: 'Int64' for col in intcols})
+    # NOTE(review): tz_localize only accepts naive timestamps, so this presumably relies on the
+    # dump storing dates without an offset -- confirm against a real export.
+    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
+                                                                              ambiguous=True)
+    df_out['type'] = df_out['type'].astype(message_type_cat)
+    return df_out.convert_dtypes()
+
+
+def fix_dtypes_u(df: pd.DataFrame, tz: str) -> pd.DataFrame:
+    """
+    Convert user event columns to time zone aware datetime and categorical dtypes.
+
+    :param df: User event rows. Must contain ``date`` and ``event`` columns.
+    :param tz: Time zone name used to localize the ``date`` column.
+    :return: A converted copy of ``df``; the input frame is not modified.
+    """
+    df_out = df.copy()
+    # Replace the whole column instead of assigning through ``.loc[:, 'date']``: newer pandas
+    # versions write that form into the existing array, which keeps the old dtype.
+    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
+                                                                              ambiguous=True)
+    # Normalize the legacy 'join' spelling so it matches the 'joined' category below;
+    # values outside the categories would otherwise become missing.
+    df_out.loc[df_out['event'] == 'join', 'event'] = 'joined'
+    df_out['event'] = df_out['event'].astype(user_event_cat)
+
+    return df_out.convert_dtypes()
+
+
+def update_user_list(users: typing.Dict[int, typing.Tuple[str, str]],
+                     engine: sqlalchemy.engine.Engine,
+                     tz: str):
+    """
+    Hand a user id to name mapping to ``StatsRunner.update_user_ids``.
+
+    :param users: Mapping of integer user id to a pair of name strings.
+    :param engine: SQLAlchemy engine used to construct the ``StatsRunner``.
+    :param tz: Time zone name passed to the ``StatsRunner``.
+    """
+    # typing.Dict/typing.Tuple are used in the annotation because the built-in generics
+    # ``dict[...]``/``tuple[...]`` fail at definition time on Python 3.8.
+    stats_runner = StatsRunner(engine, tz)
+    stats_runner.update_user_ids(users)
+
+
+def main(json_path: str, db_url: str, tz: str = 'Etc/UTC'):
+    """
+    Parse backup json file and update database with contents.
+
+    Rows whose message id is already present in the target table are skipped, so existing
+    data is never overwritten.
+
+    :param json_path: Path to the JSON chat export.
+    :param db_url: Database connection string handed to SQLAlchemy.
+    :param tz: Time zone name used to localize the timestamps in the dump.
+    :return: None
+    """
+    with open(json_path, encoding='utf-8') as f:
+        js = json.load(f)
+
+    chat = js['messages']
+    messages, users, user_map = convert_messages(pd.DataFrame(chat))
+
+    engine = create_engine(db_url, echo=False)
+
+    # An empty list produces a frame without a 'message_id' column, so set_index would raise
+    # KeyError; a dump without any join/leave events is perfectly valid, so skip it instead.
+    if users:
+        df_u = pd.DataFrame(users).set_index('message_id')
+        df_u = fix_dtypes_u(df_u, tz)
+        # Exclude existing events; only the ids are needed, so don't load the whole table.
+        u_ids = pd.read_sql_table('user_events', engine, columns=['message_id']).message_id
+        df_u = df_u.loc[~df_u.index.isin(u_ids)]
+        df_u.to_sql('user_events', engine, if_exists='append')
+
+    if messages:
+        df_m = pd.DataFrame(messages).set_index('message_id')
+        df_m = fix_dtypes_m(df_m, tz)
+        # Exclude existing messages; only the ids are needed, so don't load the whole table.
+        m_ids = pd.read_sql_table('messages_utc', engine, columns=['message_id']).message_id
+        df_m = df_m.loc[~df_m.index.isin(m_ids)]
+        df_m.to_sql('messages_utc', engine, if_exists='append')
+
+    update_user_list(user_map, engine, tz)
+
+
+# When executed as a script, let typer build the command line interface from main()'s signature.
+if __name__ == '__main__':
+    typer.run(main)