kopia lustrzana https://github.com/mkdryden/telegram-stats-bot
Add script to import data from desktop client json dumps. Implement #10.
rodzic
7d80fc8ed3
commit
b8b8eff5bf
|
@ -13,6 +13,10 @@ Fixed
|
|||
-----
|
||||
- Sticker pack names save correctly now
|
||||
|
||||
Added
|
||||
-----
|
||||
- Add script to import data from desktop client json dumps
|
||||
|
||||
-------------
|
||||
`0.6.4`_ - 2022-02-27
|
||||
-------------
|
||||
|
|
22
README.rst
22
README.rst
|
@ -39,6 +39,8 @@ Table of contents
|
|||
|
||||
- `Setup`_
|
||||
|
||||
- `Importing Data`_
|
||||
|
||||
- `Fetching Stats`_
|
||||
|
||||
- `counts`_
|
||||
|
@ -191,6 +193,26 @@ you've sent a message to trigger the update).
|
|||
You can see if messages are being logged correctly by reviewing the terminal output.
|
||||
You should see a line like ``2020-06-04 02:08:39,212 - __main__ - INFO - 8``, whenever a message is logged.
|
||||
|
||||
--------------
|
||||
Importing Data
|
||||
--------------
|
||||
Data can be imported from JSON dumps from the desktop client.
|
||||
Hit the three dot button from inside the desired group and select "Export chat history".
|
||||
Make sure you select JSON as the output format.
|
||||
You can also limit the date range of the export, as desired.
|
||||
The database will be updated and existing messages will remain, so you can use this feature to fill in gaps when the bot was not running.
|
||||
|
||||
To import data, simply call:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$ python -m telegram_stats_bot.json_dump_parser "/some/path/to/dump.json" "postgresql://telegram:CoolPassword@localhost/telegram_bot" --tz="America/Toronto"
|
||||
|
||||
The first argument is the path to the JSON dump, the second is the database connection string (as above), and the optional ``--tz`` argument should be the time zone of the system that was used to create the JSON dump.
|
||||
|
||||
This can be run without stopping a running bot. Note that it also attempts to set the user id to user name mapping, so it will add an extra entry for every user in the dump (this currently only affects the user stats related to user name changes).
|
||||
Before you run this, make sure your db string is correct or you might accidentally mess up other databases on the same server.
|
||||
|
||||
--------------
|
||||
Fetching Stats
|
||||
--------------
|
||||
|
|
|
@ -59,6 +59,25 @@ category = "main"
|
|||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
version = "8.1.3"
|
||||
description = "Composable command line interface toolkit"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
description = "Cross-platform colored terminal text."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
|
||||
[[package]]
|
||||
name = "cycler"
|
||||
version = "0.11.0"
|
||||
|
@ -377,6 +396,23 @@ category = "main"
|
|||
optional = false
|
||||
python-versions = ">= 3.5"
|
||||
|
||||
[[package]]
|
||||
name = "typer"
|
||||
version = "0.7.0"
|
||||
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
|
||||
[package.dependencies]
|
||||
click = ">=7.1.1,<9.0.0"
|
||||
|
||||
[package.extras]
|
||||
all = ["colorama (>=0.4.3,<0.5.0)", "shellingham (>=1.3.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)"]
|
||||
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
|
||||
doc = ["mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "pillow (>=9.3.0,<10.0.0)", "cairosvg (>=2.5.2,<3.0.0)"]
|
||||
test = ["shellingham (>=1.3.0,<2.0.0)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "coverage (>=6.2,<7.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "mypy (==0.910)", "black (>=22.3.0,<23.0.0)", "isort (>=5.0.6,<6.0.0)", "rich (>=10.11.0,<13.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2021.5"
|
||||
|
@ -405,7 +441,7 @@ test = ["pytest-mock (>=3.3)", "pytest (>=4.3)"]
|
|||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = ">=3.8,<3.11"
|
||||
content-hash = "1855bdc73cff766f144e12e38f659568f877681286822ac77457ada13479afcb"
|
||||
content-hash = "f81adc14942f648ccbcb6745e57ea72d0b8290c969dd81f2fe940a02ac217edd"
|
||||
|
||||
[metadata.files]
|
||||
appdirs = [
|
||||
|
@ -442,6 +478,8 @@ certifi = [
|
|||
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
|
||||
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
|
||||
]
|
||||
click = []
|
||||
colorama = []
|
||||
cycler = [
|
||||
{file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"},
|
||||
{file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"},
|
||||
|
@ -883,6 +921,7 @@ tornado = [
|
|||
{file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"},
|
||||
{file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"},
|
||||
]
|
||||
typer = []
|
||||
tzdata = [
|
||||
{file = "tzdata-2021.5-py2.py3-none-any.whl", hash = "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5"},
|
||||
{file = "tzdata-2021.5.tar.gz", hash = "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"},
|
||||
|
|
|
@ -19,6 +19,7 @@ numpy = "^1.22.0"
|
|||
matplotlib = "^3.2.1"
|
||||
appdirs = "^1.4.4"
|
||||
single-source = "^0.2.0"
|
||||
typer = "^0.7.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
|
||||
|
|
|
@ -74,8 +74,8 @@ def init_dbs(engine: Engine):
|
|||
create table if not exists user_events
|
||||
(
|
||||
message_id bigint,
|
||||
user_id bigint,
|
||||
date timestamp with time zone,
|
||||
user_id text,
|
||||
date timestamptz,
|
||||
event text
|
||||
);
|
||||
|
||||
|
|
|
@ -23,6 +23,11 @@ import json
|
|||
import typing
|
||||
|
||||
import pandas as pd
|
||||
import sqlalchemy.engine
|
||||
import typer
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from .stats import StatsRunner
|
||||
|
||||
media_dict = {'sticker': 'sticker',
|
||||
'animation': 'animation',
|
||||
|
@ -31,8 +36,8 @@ media_dict = {'sticker': 'sticker',
|
|||
'audio_file': 'audio',
|
||||
'video_message': 'video_note'}
|
||||
|
||||
user_event_cat = pd.Categorical(['left', 'joined'])
|
||||
message_type_cat = pd.Categorical(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
|
||||
user_event_cat = pd.CategoricalDtype(['left', 'joined'])
|
||||
message_type_cat = pd.CategoricalDtype(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
|
||||
'new_chat_members', 'left_chat_member', 'animation', 'video',
|
||||
'location', 'new_chat_title', 'voice', 'audio',
|
||||
'new_chat_photo', 'video_note', 'poll'])
|
||||
|
@ -50,9 +55,10 @@ def text_list_parser(text: typing.Union[str, typing.Sequence]) -> str:
|
|||
return out
|
||||
|
||||
|
||||
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
|
||||
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict], dict]:
|
||||
messages_out = []
|
||||
users_out = []
|
||||
|
||||
for message in df.itertuples():
|
||||
message_dict = {'message_id': message.id,
|
||||
'date': message.date,
|
||||
|
@ -71,16 +77,18 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
|
|||
user_event_dict = {}
|
||||
if message.type == 'message':
|
||||
if pd.notnull(message.from_id):
|
||||
message_dict['from_user'] = message.from_id
|
||||
if not message.from_id.startswith('user'):
|
||||
continue
|
||||
message_dict['from_user'] = int(message.from_id[4:]) # remove 'user' from id
|
||||
|
||||
if pd.notnull(message.forwarded_from):
|
||||
try:
|
||||
message_dict['forward_from'] = int(message.forwarded_from)
|
||||
message_dict['forward_from'] = int(message.from_id[4:]) # username is used in forwarded_from
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if pd.notnull(message.reply_to_message_id):
|
||||
message_dict['reply_to_message'] = message.reply_to_message_id
|
||||
message_dict['reply_to_message'] = int(message.reply_to_message_id)
|
||||
|
||||
if pd.notnull(message.photo):
|
||||
message_dict['type'] = 'photo'
|
||||
|
@ -97,12 +105,11 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
|
|||
message_dict['text'] = text_list_parser(message.text)
|
||||
elif pd.notnull(message.poll):
|
||||
message_dict['type'] = 'poll'
|
||||
elif pd.notnull(message.location_information):
|
||||
message_dict['type'] = 'location'
|
||||
|
||||
elif message.type == 'service':
|
||||
if pd.notnull(message.actor_id):
|
||||
message_dict['from_user'] = message.actor_id
|
||||
if message.actor_id.startswith('user'):
|
||||
message_dict['from_user'] = int(message.actor_id[4:])
|
||||
|
||||
if message.action == 'edit_group_title':
|
||||
message_dict['type'] = 'new_chat_title'
|
||||
|
@ -118,12 +125,12 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
|
|||
users_out.append({'message_id': message.id,
|
||||
'user_id': i,
|
||||
'date': message.date,
|
||||
'event': 'join'})
|
||||
'event': 'joined'})
|
||||
except TypeError:
|
||||
user_event_dict = {'message_id': message.id,
|
||||
'user_id': message.actor_id,
|
||||
'date': message.date,
|
||||
'event': 'join'}
|
||||
'event': 'joined'}
|
||||
elif message.action == 'remove_members':
|
||||
message_dict['type'] = 'left_chat_member'
|
||||
for i in message.members:
|
||||
|
@ -136,7 +143,15 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
|
|||
messages_out.append(message_dict)
|
||||
if user_event_dict != {}:
|
||||
users_out.append(user_event_dict)
|
||||
return messages_out, users_out
|
||||
|
||||
user_map = {int(i[4:]): df.loc[df['from_id'] == i, 'from'].iloc[0]
|
||||
for i in df['from_id'].unique()
|
||||
if (df['from_id'] == i).any() and i.startswith('user')}
|
||||
|
||||
# Use long name for both name and long name since we can't fetch usernames
|
||||
user_map = {k: (v, v) for k, v in user_map.items() if v}
|
||||
|
||||
return messages_out, users_out, user_map
|
||||
|
||||
|
||||
def parse_json(path: str):
|
||||
|
@ -144,3 +159,66 @@ def parse_json(path: str):
|
|||
js = json.load(f)
|
||||
chat = js['chats']['list'][1]['messages']
|
||||
df = pd.DataFrame(chat)
|
||||
|
||||
|
||||
def fix_dtypes_m(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Convert a parsed messages table to the dtypes used for database insertion.

    The id-like columns are cast to nullable ``Int64``, ``type`` is cast to the
    ``message_type_cat`` categorical dtype and the naive ``date`` values are
    localized to ``tz``.

    :param df: Messages table. Must contain ``date``, ``type`` and every column listed in ``intcols``.
    :param tz: Time zone name used to localize the ``date`` column.
    :return: A converted copy of ``df``; the input frame is not modified.
    """
    intcols = ['forward_from_message_id', 'forward_from', 'forward_from_chat',
               'from_user', 'reply_to_message']

    # ``astype`` with a mapping returns a new frame whose columns really carry the
    # requested dtypes. Writing the converted values back with ``df.loc[:, cols] = ...``
    # is not reliable for this: since pandas 2.0 it sets the values in place and
    # keeps the previous column dtype.
    new_dtypes = {col: 'Int64' for col in intcols}
    new_dtypes['type'] = message_type_cat
    df_out = df.astype(new_dtypes)

    # ``ambiguous=True`` is passed through unchanged; it is intended to resolve
    # wall-clock times repeated at a DST transition as DST times.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)
    return df_out.convert_dtypes()
|
||||
|
||||
|
||||
def fix_dtypes_u(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Convert a parsed user-events table to the dtypes used for database insertion.

    The naive ``date`` values are localized to ``tz``, legacy ``'join'`` events are
    renamed to ``'joined'`` and ``event`` is cast to the ``user_event_cat``
    categorical dtype.

    :param df: User events table. Must contain the ``date`` and ``event`` columns.
    :param tz: Time zone name used to localize the ``date`` column.
    :return: A converted copy of ``df``; the input frame is not modified.
    """
    df_out = df.copy()

    # Assign the whole column with ``df[col] = ...`` instead of ``df.loc[:, col] = ...``:
    # since pandas 2.0 the ``.loc`` form sets values in place and keeps the old dtype.
    # ``ambiguous=True`` is passed through unchanged; it is intended to resolve
    # wall-clock times repeated at a DST transition as DST times.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)

    # Normalise the old event name so that it matches the 'joined' category.
    df_out.loc[df_out['event'] == 'join', 'event'] = 'joined'
    df_out['event'] = df_out['event'].astype(user_event_cat)

    return df_out.convert_dtypes()
|
||||
|
||||
|
||||
def update_user_list(users: typing.Dict[int, typing.Tuple[str, str]], engine: sqlalchemy.engine.Engine, tz: str):
    """
    Pass a user id mapping to ``StatsRunner.update_user_ids``.

    The annotation uses ``typing.Dict``/``typing.Tuple`` rather than the builtin
    generics ``dict[...]``/``tuple[...]``: the latter are evaluated when the function
    is defined and raise ``TypeError`` on Python 3.8.

    :param users: Mapping of user id to a pair of name strings.
    :param engine: Database engine handed to ``StatsRunner``.
    :param tz: Time zone name handed to ``StatsRunner``.
    :return: None
    """
    stats_runner = StatsRunner(engine, tz)
    stats_runner.update_user_ids(users)
|
||||
|
||||
|
||||
def main(json_path: str, db_url: str, tz: str = 'Etc/UTC'):
    """
    Parse backup json file and update database with contents.

    Rows whose ``message_id`` is already present in the target table are skipped, so
    existing data is left untouched. A dump that yields no messages or no user events
    simply skips the corresponding table instead of failing.

    :param json_path: Path to the json dump; its top-level ``messages`` entry is read.
    :param db_url: Database connection string passed to ``create_engine``.
    :param tz: Time zone name used to localize the timestamps in the dump.
    :return: None
    """
    with open(json_path, encoding='utf-8') as f:
        js = json.load(f)

    chat = js['messages']
    messages, users, user_map = convert_messages(pd.DataFrame(chat))

    engine = create_engine(db_url, echo=False)

    def _existing_ids(table: str) -> pd.Series:
        # Only the id column is needed to detect duplicates; avoid loading whole tables.
        return pd.read_sql_table(table, engine, columns=['message_id']).message_id

    # User events are written before messages, as in the original order of operations.
    # An empty list would produce a frame without a 'message_id' column and make
    # set_index raise KeyError, so empty inputs are skipped.
    if users:
        df_u = pd.DataFrame(users).set_index('message_id')
        df_u = fix_dtypes_u(df_u, tz)
        df_u = df_u.loc[~df_u.index.isin(_existing_ids('user_events'))]
        df_u.to_sql('user_events', engine, if_exists='append')

    if messages:
        df_m = pd.DataFrame(messages).set_index('message_id')
        df_m = fix_dtypes_m(df_m, tz)
        df_m = df_m.loc[~df_m.index.isin(_existing_ids('messages_utc'))]
        df_m.to_sql('messages_utc', engine, if_exists='append')

    update_user_list(user_map, engine, tz)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
typer.run(main)
|
||||
|
|
Ładowanie…
Reference in New Issue