Add script to import data from desktop client json dumps. Implement #10.

pull/12/head
Michael DM Dryden 2022-12-19 04:00:58 -05:00
rodzic 7d80fc8ed3
commit b8b8eff5bf
6 zmienionych plików z 159 dodań i 15 usunięć

Wyświetl plik

@ -13,6 +13,10 @@ Fixed
-----
- Sticker pack names save correctly now
Added
-----
- Add script to import data from desktop client json dumps
-------------
`0.6.4`_ - 2022-02-27
-------------

Wyświetl plik

@ -39,6 +39,8 @@ Table of contents
- `Setup`_
- `Importing Data`_
- `Fetching Stats`_
- `counts`_
@ -191,6 +193,26 @@ you've sent a message to trigger the update).
You can see if messages are being logged correctly by reviewing the terminal output.
You should see a line like ``2020-06-04 02:08:39,212 - __main__ - INFO - 8``, whenever a message is logged.
--------------
Importing Data
--------------
Data can be imported from JSON dumps from the desktop client.
Click the three-dot menu button inside the desired group and select "Export chat history".
Make sure you select JSON as the output format.
You can also limit the date, as desired.
The database will be updated and existing messages will remain, so you can use this feature to fill in gaps when the bot was not running.
To import data, simply call:
.. code:: shell
$ python -m telegram_stats_bot.json_dump_parser "/some/path/to/dump.json" "postgresql://telegram:CoolPassword@localhost/telegram_bot" --tz="America/Toronto"
Where the first argument is the path to the JSON dump, the second is the db connection string, as above, and the optional ``tz`` argument should be the time zone of the system used to dump the JSON.
This can be run without stopping a running bot. Note that it also attempts to set the user id to user name mapping, so it will add an extra entry for every user in the dump (this currently only affects the user stats related to user name changes).
Before you run this, make sure your db string is correct or you might accidentally mess up other databases on the same server.
--------------
Fetching Stats
--------------

41
poetry.lock wygenerowano
Wyświetl plik

@ -59,6 +59,25 @@ category = "main"
optional = false
python-versions = "*"
[[package]]
name = "click"
version = "8.1.3"
description = "Composable command line interface toolkit"
category = "main"
optional = false
python-versions = ">=3.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
[[package]]
name = "cycler"
version = "0.11.0"
@ -377,6 +396,23 @@ category = "main"
optional = false
python-versions = ">= 3.5"
[[package]]
name = "typer"
version = "0.7.0"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
click = ">=7.1.1,<9.0.0"
[package.extras]
all = ["colorama (>=0.4.3,<0.5.0)", "shellingham (>=1.3.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)"]
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
doc = ["mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "pillow (>=9.3.0,<10.0.0)", "cairosvg (>=2.5.2,<3.0.0)"]
test = ["shellingham (>=1.3.0,<2.0.0)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "coverage (>=6.2,<7.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "mypy (==0.910)", "black (>=22.3.0,<23.0.0)", "isort (>=5.0.6,<6.0.0)", "rich (>=10.11.0,<13.0.0)"]
[[package]]
name = "tzdata"
version = "2021.5"
@ -405,7 +441,7 @@ test = ["pytest-mock (>=3.3)", "pytest (>=4.3)"]
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.11"
content-hash = "1855bdc73cff766f144e12e38f659568f877681286822ac77457ada13479afcb"
content-hash = "f81adc14942f648ccbcb6745e57ea72d0b8290c969dd81f2fe940a02ac217edd"
[metadata.files]
appdirs = [
@ -442,6 +478,8 @@ certifi = [
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
]
click = []
colorama = []
cycler = [
{file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"},
{file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"},
@ -883,6 +921,7 @@ tornado = [
{file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"},
{file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"},
]
typer = []
tzdata = [
{file = "tzdata-2021.5-py2.py3-none-any.whl", hash = "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5"},
{file = "tzdata-2021.5.tar.gz", hash = "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"},

Wyświetl plik

@ -19,6 +19,7 @@ numpy = "^1.22.0"
matplotlib = "^3.2.1"
appdirs = "^1.4.4"
single-source = "^0.2.0"
typer = "^0.7.0"
[tool.poetry.dev-dependencies]

Wyświetl plik

@ -74,8 +74,8 @@ def init_dbs(engine: Engine):
create table if not exists user_events
(
message_id bigint,
user_id bigint,
date timestamp with time zone,
user_id text,
date timestamptz,
event text
);

Wyświetl plik

@ -23,6 +23,11 @@ import json
import typing
import pandas as pd
import sqlalchemy.engine
import typer
from sqlalchemy import create_engine
from .stats import StatsRunner
media_dict = {'sticker': 'sticker',
'animation': 'animation',
@ -31,8 +36,8 @@ media_dict = {'sticker': 'sticker',
'audio_file': 'audio',
'video_message': 'video_note'}
user_event_cat = pd.Categorical(['left', 'joined'])
message_type_cat = pd.Categorical(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
user_event_cat = pd.CategoricalDtype(['left', 'joined'])
message_type_cat = pd.CategoricalDtype(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
'new_chat_members', 'left_chat_member', 'animation', 'video',
'location', 'new_chat_title', 'voice', 'audio',
'new_chat_photo', 'video_note', 'poll'])
@ -50,9 +55,10 @@ def text_list_parser(text: typing.Union[str, typing.Sequence]) -> str:
return out
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict], dict]:
messages_out = []
users_out = []
for message in df.itertuples():
message_dict = {'message_id': message.id,
'date': message.date,
@ -71,16 +77,18 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
user_event_dict = {}
if message.type == 'message':
if pd.notnull(message.from_id):
message_dict['from_user'] = message.from_id
if not message.from_id.startswith('user'):
continue
message_dict['from_user'] = int(message.from_id[4:]) # remove 'user' from id
if pd.notnull(message.forwarded_from):
try:
message_dict['forward_from'] = int(message.forwarded_from)
message_dict['forward_from'] = int(message.from_id[4:]) # username is used in forwarded_from
except ValueError:
pass
if pd.notnull(message.reply_to_message_id):
message_dict['reply_to_message'] = message.reply_to_message_id
message_dict['reply_to_message'] = int(message.reply_to_message_id)
if pd.notnull(message.photo):
message_dict['type'] = 'photo'
@ -97,12 +105,11 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
message_dict['text'] = text_list_parser(message.text)
elif pd.notnull(message.poll):
message_dict['type'] = 'poll'
elif pd.notnull(message.location_information):
message_dict['type'] = 'location'
elif message.type == 'service':
if pd.notnull(message.actor_id):
message_dict['from_user'] = message.actor_id
if message.actor_id.startswith('user'):
message_dict['from_user'] = int(message.actor_id[4:])
if message.action == 'edit_group_title':
message_dict['type'] = 'new_chat_title'
@ -118,12 +125,12 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
users_out.append({'message_id': message.id,
'user_id': i,
'date': message.date,
'event': 'join'})
'event': 'joined'})
except TypeError:
user_event_dict = {'message_id': message.id,
'user_id': message.actor_id,
'date': message.date,
'event': 'join'}
'event': 'joined'}
elif message.action == 'remove_members':
message_dict['type'] = 'left_chat_member'
for i in message.members:
@ -136,7 +143,15 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
messages_out.append(message_dict)
if user_event_dict != {}:
users_out.append(user_event_dict)
return messages_out, users_out
user_map = {int(i[4:]): df.loc[df['from_id'] == i, 'from'].iloc[0]
for i in df['from_id'].unique()
if (df['from_id'] == i).any() and i.startswith('user')}
# Use long name for both name and long name since we can't fetch usernames
user_map = {k: (v, v) for k, v in user_map.items() if v}
return messages_out, users_out, user_map
def parse_json(path: str):
@ -144,3 +159,66 @@ def parse_json(path: str):
js = json.load(f)
chat = js['chats']['list'][1]['messages']
df = pd.DataFrame(chat)
def fix_dtypes_m(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Return a copy of the converted messages frame with explicit column dtypes.

    The id-like columns are cast to nullable ``Int64``, ``date`` is parsed and
    localized to ``tz``, and ``type`` is cast to ``message_type_cat``. A final
    ``convert_dtypes()`` pass handles the remaining columns.

    :param df: Messages frame; must contain ``date``, ``type`` and every column
        named in ``int_cols`` below (a missing column raises ``KeyError``).
    :param tz: Time zone name used to localize the ``date`` values, which are
        assumed to be naive (``tz_localize`` rejects tz-aware input).
    :return: A new frame; ``df`` itself is not modified.
    """
    int_cols = ['forward_from_message_id', 'forward_from', 'forward_from_chat',
                'from_user', 'reply_to_message']

    # ``astype`` returns a new frame, so this also serves as the defensive copy.
    # Casting this way (instead of assigning through ``.loc[:, cols]``) guarantees
    # the new dtype replaces the old one; ``.loc`` assignment may write the values
    # in place and silently keep the original column dtype.
    df_out = df.astype({col: 'Int64' for col in int_cols})

    # ambiguous=True: wall-clock times repeated at a DST transition are treated
    # as DST instead of raising.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)
    df_out['type'] = df_out['type'].astype(message_type_cat)
    return df_out.convert_dtypes()
def fix_dtypes_u(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Return a copy of the converted user-events frame with explicit column dtypes.

    ``date`` is parsed and localized to ``tz``, the ``'join'`` event label is
    normalized to ``'joined'``, and ``event`` is cast to ``user_event_cat``.
    A final ``convert_dtypes()`` pass handles the remaining columns.

    :param df: User-events frame; must contain ``date`` and ``event`` columns.
    :param tz: Time zone name used to localize the ``date`` values, which are
        assumed to be naive (``tz_localize`` rejects tz-aware input).
    :return: A new frame; ``df`` itself is not modified.
    """
    df_out = df.copy()

    # Assign the whole column (not ``.loc[:, 'date']``) so the tz-aware dtype
    # replaces the original one; ``.loc`` assignment may write in place and keep
    # the old dtype. This matches how ``event`` is assigned below.
    # ambiguous=True: wall-clock times repeated at a DST transition are treated
    # as DST instead of raising.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)

    # 'join' is not one of the categories; map it to 'joined' before the cast,
    # otherwise the cast would turn it into a missing value.
    df_out.loc[df_out['event'] == 'join', 'event'] = 'joined'
    df_out['event'] = df_out['event'].astype(user_event_cat)
    return df_out.convert_dtypes()
def update_user_list(users: typing.Dict[int, typing.Tuple[str, str]],
                     engine: sqlalchemy.engine.Engine,
                     tz: str) -> None:
    """
    Store the user id to user name mapping via ``StatsRunner.update_user_ids``.

    :param users: Mapping of user id to a pair of names, passed unchanged to
        ``StatsRunner.update_user_ids``.
    :param engine: Database engine handed to ``StatsRunner``.
    :param tz: Time zone name handed to ``StatsRunner``.
    """
    # ``typing.Dict``/``typing.Tuple`` are used in the annotation because builtin
    # generics (``dict[...]``) are evaluated at definition time and fail on
    # Python 3.8, which this project still supports.
    stats_runner = StatsRunner(engine, tz)
    stats_runner.update_user_ids(users)
def main(json_path: str, db_url: str, tz: str = 'Etc/UTC'):
    """
    Parse backup json file and update database with contents.

    Rows whose ``message_id`` already exists in the target table are skipped, so
    existing data is left untouched. The user id to user name mapping found in
    the dump is stored afterwards.

    :param json_path: Path to the JSON dump exported from the desktop client.
    :param db_url: Database connection string passed to ``create_engine``.
    :param tz: Time zone used to localize the timestamps in the dump.
    :return: None
    """
    with open(json_path, encoding='utf-8') as f:
        js = json.load(f)
    chat = js['messages']
    messages, users, user_map = convert_messages(pd.DataFrame(chat))

    # An empty record list yields a frame without a 'message_id' column, so
    # ``set_index`` would raise KeyError (e.g. a dump with no join/leave events).
    # Such a table is simply skipped.
    df_m = fix_dtypes_m(pd.DataFrame(messages).set_index('message_id'), tz) if messages else None
    df_u = fix_dtypes_u(pd.DataFrame(users).set_index('message_id'), tz) if users else None

    engine = create_engine(db_url, echo=False)

    for table, frame in (('user_events', df_u), ('messages_utc', df_m)):
        if frame is None:
            continue
        # Exclude existing messages; only the id column is needed, so avoid
        # loading the whole table.
        existing_ids = pd.read_sql_table(table, engine, columns=['message_id']).message_id
        new_rows = frame.loc[~frame.index.isin(existing_ids)]
        new_rows.to_sql(table, engine, if_exists='append')

    update_user_list(user_map, engine, tz)
if __name__ == '__main__':
    # Script entry point: typer builds the command-line interface from the
    # signature of ``main`` and runs it.
    typer.run(main)