diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1b7abaa..7f9d27d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,17 +7,18 @@ The format is based on `Keep a Changelog <https://keepachangelog.com/en/1.0.0/>` and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0.html>`_. ---------- -Unreleased +`0.7.0`_ - 2023-01-14 ---------- Fixed ----- - Sticker pack names save correctly now -- Explicitly add psycopg2-binary as dependency because sqlalchemy extra doesn't seem to work anymore. +- Explicitly add psycopg2-binary as dependency because sqlalchemy extra doesn't seem to work anymore - Try to map user ids to names during json dump import. (#17) Added ----- - Add script to import data from desktop client json dumps +- Add ECDF plot for message counts by user with ``/stats count-dist`` ------------- `0.6.4`_ - 2022-02-27 @@ -131,7 +132,7 @@ Fixed ---------------------- - Initial release -.. _Unreleased: https://github.com/mkdryden/telegram-stats-bot/compare/v0.6.2...HEAD +.. _Unreleased: https://github.com/mkdryden/telegram-stats-bot/compare/v0.7.0...HEAD .. _0.1.1: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.1.1 .. _0.2.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.2.0 .. _0.3.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.3.0 @@ -142,3 +143,4 @@ Fixed .. _0.6.1: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.1 .. _0.6.2: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.2 .. _0.6.3: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.6.3 +.. _0.7.0: https://github.com/mkdryden/telegram-stats-bot/releases/tag/v0.7.0 diff --git a/README.rst b/README.rst index a4ba154..d68ff62 100644 --- a/README.rst +++ b/README.rst @@ -45,6 +45,8 @@ Table of contents - `counts`_ + - `count-dist`_ + - `hours`_ - `days`_ @@ -251,6 +253,13 @@ counts @WhereAreMyManners 30481 5.1 @TheWorstOfTheBest 28705 4.8 +count-dist +---------- +``/stats count-dist`` returns an ECDF plot of the users in the group by message count. + +.. 
image:: examples/count-dist.png
+   :alt: Example of count-dist plot
+
 hours
 -----
 ``/stats hours`` returns a plot of message frequency for the hours of the day.
diff --git a/examples/count-dist.png b/examples/count-dist.png
new file mode 100644
index 0000000..51a0250
Binary files /dev/null and b/examples/count-dist.png differ
diff --git a/pyproject.toml b/pyproject.toml
index e906e4d..023fcb1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "telegram-stats-bot"
-version = "0.6.4"
+version = "0.7.0"
 description = "A logging and statistics bot for Telegram based on python-telegram-bot."
 authors = ["Michael DM Dryden "]
 repository = "https://github.com/mkdryden/telegram-stats-bot"
diff --git a/telegram_stats_bot/stats.py b/telegram_stats_bot/stats.py
index 5f03cc4..525fefa 100644
--- a/telegram_stats_bot/stats.py
+++ b/telegram_stats_bot/stats.py
@@ -72,6 +72,7 @@ class InternalParser(argparse.ArgumentParser):
 
 class StatsRunner(object):
     allowed_methods = {'counts': "get_chat_counts",
+                       'count-dist': 'get_chat_ecdf',
                        'hours': "get_counts_by_hour",
                        'days': "get_counts_by_day",
                        'week': "get_week_by_hourday",
@@ -205,6 +206,85 @@ class StatsRunner(object):
 
         return f"```\n{text}\n```", None
 
+    def get_chat_ecdf(self, lquery: str = None, mtype: str = None, start: str = None, end: str = None,
+                      log: bool = False) -> Tuple[Union[str, None], Union[None, BytesIO]]:
+        """
+        Get message counts by number of users as an ECDF plot.
+        :param lquery: Limit results to lexical query (&, |, !, <n>)
+        :param mtype: Limit results to message type (text, sticker, photo, etc.)
+        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
+        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
+        :param log: Plot with log scale.
+        """
+        # Returns (message, None) when no rows match, otherwise (None, buffer holding the plot).
+        # Raises HelpException when mtype is not one of the accepted message types.
+        sql_dict = {}
+        query_conditions = []
+
+        if lquery:
+            # random_quote is not visible in this diff.
+            # NOTE(review): assumed to return lquery as a safely quoted SQL literal - confirm.
+            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")
+
+        if mtype:
+            # mtype is interpolated directly into the SQL text, so only accept values from
+            # this fixed list.
+            if mtype not in ('text', 'sticker', 'photo', 'animation', 'video', 'voice', 'location', 'video_note',
+                             'audio', 'document', 'poll'):
+                raise HelpException(f'mtype {mtype} is invalid.')
+            query_conditions.append(f"""type = '{mtype}'""")
+
+        if start:
+            sql_dict['start_dt'] = pd.to_datetime(start)
+            query_conditions.append("date >= %(start_dt)s")
+
+        if end:
+            sql_dict['end_dt'] = pd.to_datetime(end)
+            query_conditions.append("date < %(end_dt)s")
+
+        query_where = ""
+        if query_conditions:
+            query_where = f"WHERE {' AND '.join(query_conditions)}"
+
+        query = f"""
+                 SELECT "from_user", COUNT(*) as "count"
+                 FROM "messages_utc"
+                 {query_where}
+                 GROUP BY "from_user"
+                 ORDER BY "count" DESC;
+                 """
+
+        with self.engine.connect() as con:
+            df = pd.read_sql_query(query, con, params=sql_dict)
+
+        if df.empty:
+            return "No matching messages", None
+
+        fig = Figure(constrained_layout=True)
+        subplot = fig.subplots()
+
+        # y='count' puts each user's message total on the y axis; stat='count' makes the x axis
+        # the cumulative number of users instead of a proportion.
+        sns.ecdfplot(df, y='count', stat='count', log_scale=log, ax=subplot)
+        subplot.set_xlabel('User')
+        subplot.set_ylabel('Messages')
+
+        if lquery:
+            subplot.set_title(f"Messages by User for {lquery}")
+        else:
+            subplot.set_title("Messages by User")
+
+        sns.despine(fig=fig)
+
+        bio = BytesIO()
+        bio.name = 'plot.png'
+        # A file-like target has no extension to infer the format from, so state PNG explicitly
+        # instead of depending on the savefig.format rcParam.
+        fig.savefig(bio, format='png', bbox_inches='tight')
+        bio.seek(0)
+
+        return None, bio
+
     def get_counts_by_hour(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None) \
             -> Tuple[Union[str, None], Union[None, BytesIO]]:
         """