#!/usr/bin/env python
#
# A logging and statistics bot for Telegram based on python-telegram-bot.
# Copyright (C) 2020
# Michael DM Dryden <mk.dryden@utoronto.ca>
#
# This file is part of telegram-stats-bot.
#
# telegram-stats-bot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser Public License for more details.
#
# You should have received a copy of the GNU Public License
# along with this program. If not, see [http://www.gnu.org/licenses/].

import logging
from typing import Dict, List, Tuple, Text, NoReturn, Union
from threading import Lock
from io import BytesIO
import argparse
import inspect
import re
from datetime import timedelta, datetime

import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib.figure import Figure
from matplotlib.dates import date2num
from sqlalchemy.engine import Engine
from sqlalchemy import select, func, text
from sqlalchemy.dialects import postgresql

from .utils import escape_markdown, TsStat, random_quote
from .db import messages
from . import __version__

sns.set_context('paper')
sns.set_style('whitegrid')
sns.set_palette("Set2")

logging.getLogger('matplotlib').setLevel(logging.WARNING)  # Mute matplotlib debug messages
logger = logging.getLogger()


def output_fig(fig: Figure) -> BytesIO:
    bio = BytesIO()
    bio.name = 'plot.png'
    fig.savefig(bio, bbox_inches='tight', dpi=200, format='png')
    bio.seek(0)
    return bio


class HelpException(Exception):
    def __init__(self, msg: str = None):
        self.msg = msg


class InternalParser(argparse.ArgumentParser):
    def error(self, message: Text) -> NoReturn:
        try:
            raise  # Reraises mostly ArgumentError for bad arg
        except RuntimeError:
            raise HelpException(message)

    def print_help(self, file=None) -> None:
        raise HelpException(self.format_help())

    def _print_message(self, message: str, file=None) -> None:
        raise HelpException(message)

    def exit(self, status=None, message=None):
        pass
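

# InternalParser never writes to stderr or calls sys.exit() the way argparse
# normally does: every help, usage, and error path is surfaced as a
# HelpException instead, so the calling code can relay the text back to the
# chat. A minimal sketch of how a caller might consume it (illustrative only;
# the real bot wiring lives outside this module and may differ, and reply()
# below is just a stand-in for however the bot answers):
#
#     parser = get_parser(runner)
#     try:
#         ns = parser.parse_args(args)
#     except HelpException as e:
#         reply(e.msg)  # send the help/usage text back to the chat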


class StatsRunner(object):
    allowed_methods = {'counts': "get_chat_counts",
                       'count-dist': 'get_chat_ecdf',
                       'hours': "get_counts_by_hour",
                       'days': "get_counts_by_day",
                       'week': "get_week_by_hourday",
                       'history': "get_message_history",
                       'titles': 'get_title_history',
                       'user': 'get_user_summary',
                       'corr': "get_user_correlation",
                       'delta': "get_message_deltas",
                       'types': "get_type_stats",
                       'words': "get_word_stats",
                       'random': "get_random_message"}

    def __init__(self, engine: Engine, tz: str = 'Etc/UTC'):
        self.engine = engine
        self.tz = tz

        self.users: Dict[int, Tuple[str, str]] = self.get_db_users()
        self.users_lock = Lock()

    def get_message_user_ids(self) -> List[int]:
        """Returns list of unique user ids from messages in database."""
        with self.engine.connect() as con:
            result = con.execute(text("SELECT DISTINCT from_user FROM messages_utc;"))
        return [user for user, in result.fetchall() if user is not None]

    def get_db_users(self) -> Dict[int, Tuple[str, str]]:
        """Returns dictionary mapping user ids to usernames and full names."""
        query = """
        select user_id, username, display_name from (
            select
                *,
                row_number() over(partition by user_id order by date desc) as rn
            from
                user_names
            ) t
        where t.rn = 1
        """

        with self.engine.connect() as con:
            result = con.execute(text(query))
            result = result.fetchall()

        return {user_id: (username, name) for user_id, username, name in result}

    def update_user_ids(self, user_dict: Dict[int, Tuple[str, str]]):
        """
        Updates user names table with user_dict
        :param user_dict: mapping of user ids to (username, display name)
        """
        for uid in user_dict:
            username, display_name = user_dict[uid]
            sql_dict = {'uid': uid, 'username': username, 'display_name': display_name}
            query = """
                    UPDATE user_names
                    SET username = :username
                    WHERE user_id = :uid AND username IS DISTINCT FROM :username;
                    """
            if display_name:
                query += """\n
                    INSERT INTO user_names(user_id, date, username, display_name)
                    VALUES (:uid, current_timestamp, :username, :display_name);
                    """

            with self.engine.connect() as con:
                con.execute(text(query), sql_dict)
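
    # The statistics methods below share one query-building pattern: optional
    # filters are appended to query_conditions, joined with AND into a WHERE
    # clause, and their values travel separately in sql_dict as named bind
    # parameters (:start_dt, :end_dt, :user). Two filters are inlined into the
    # SQL string instead: mtype (safe because it is checked against a fixed
    # whitelist first) and lquery, which goes through random_quote() from
    # .utils -- judging by its use here, that helper quotes the text so it can
    # sit inside to_tsquery() without being bound as a parameter.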

    def get_chat_counts(self, n: int = 20, lquery: str = None, mtype: str = None, start: str = None, end: str = None) \
            -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get top chat users
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param mtype: Limit results to message type (text, sticker, photo, etc.)
        :param n: Number of users to show
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        sql_dict = {}
        query_conditions = []

        if n <= 0:
            raise HelpException(f'n must be greater than 0, got: {n}')

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if mtype:
            if mtype not in ('text', 'sticker', 'photo', 'animation', 'video', 'voice', 'location', 'video_note',
                             'audio', 'document', 'poll'):
                raise HelpException(f'mtype {mtype} is invalid.')
            query_conditions.append(f"""type = '{mtype}'""")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT "from_user", COUNT(*) as "count"
                FROM "messages_utc"
                {query_where}
                GROUP BY "from_user"
                ORDER BY "count" DESC;
                """
        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict, index_col='from_user')

        if len(df) == 0:
            return "No matching messages", None

        user_df = pd.Series(self.users, name="user")
        user_df = user_df.apply(lambda x: x[0])  # Take only @usernames
        df = df.join(user_df)
        df['Percent'] = df['count'] / df['count'].sum() * 100
        df = df[['user', 'count', 'Percent']]
        if mtype:
            df.columns = ['User', mtype, 'Percent']
        elif lquery:
            df.columns = ['User', 'lquery', 'Percent']
        else:
            df.columns = ['User', 'Total Messages', 'Percent']
        df['User'] = df['User'].str.replace(r'[^\x00-\x7F]|[@]', "", regex=True)  # Drop emoji and @

        out_text = df.iloc[:n].to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        return f"```\n{out_text}\n```", None
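
    # get_chat_ecdf draws an empirical CDF of per-user message totals with
    # stat='count': reading the curve at a message count on the y-axis gives,
    # on the x-axis, how many users have at most that many messages. This makes
    # it easy to see how concentrated the chat is among its most active members.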

    def get_chat_ecdf(self, lquery: str = None, mtype: str = None, start: str = None, end: str = None,
                      log: bool = False) -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get message counts by number of users as an ECDF plot.
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param mtype: Limit results to message type (text, sticker, photo, etc.)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param log: Plot with log scale.
        """
        sql_dict = {}
        query_conditions = []

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if mtype:
            if mtype not in ('text', 'sticker', 'photo', 'animation', 'video', 'voice', 'location', 'video_note',
                             'audio', 'document', 'poll'):
                raise HelpException(f'mtype {mtype} is invalid.')
            query_conditions.append(f"""type = '{mtype}'""")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT "from_user", COUNT(*) as "count"
                FROM "messages_utc"
                {query_where}
                GROUP BY "from_user"
                ORDER BY "count" DESC;
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No matching messages", None

        fig = Figure(constrained_layout=True)
        subplot = fig.subplots()

        sns.ecdfplot(df, y='count', stat='count', log_scale=log, ax=subplot)
        subplot.set_xlabel('User')
        subplot.set_ylabel('Messages')

        if lquery:
            subplot.set_title(f"Messages by User for {lquery}")
        else:
            subplot.set_title("Messages by User")

        sns.despine(fig=fig)

        bio = output_fig(fig)

        return None, bio
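
    # Note on get_counts_by_hour below: the query yields one row per clock hour,
    # so each strip/box point is the message count for a single hour of real
    # time, bucketed by hour of day. For a single user the hourly counts are
    # first summed over 7-day windows (groupby('hour').resample('7D').sum()),
    # so each point becomes that user's total for one hour-of-day across one
    # week, hence the "Messages per Week" y-label in that case.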

    def get_counts_by_hour(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None) \
            -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get plot of messages for hours of the day
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        query_conditions = []
        sql_dict = {}

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date_trunc('hour', date) as day, count(*) as messages
                FROM messages_utc
                {query_where}
                GROUP BY day
                ORDER BY day
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No matching messages", None

        df['day'] = pd.to_datetime(df.day)
        df['day'] = df.day.dt.tz_convert(self.tz)
        df = df.set_index('day')
        df = df.asfreq('h', fill_value=0)  # Insert 0s for periods with no messages

        if (df.index.max() - df.index.min()) < pd.Timedelta('24 hours'):  # Deal with data covering < 24 hours
            df = df.reindex(pd.date_range(df.index.min(), periods=24, freq='h'))

        df['hour'] = df.index.hour

        if user:
            # Aggregate over 1 week periods
            df = df.groupby('hour').resample('7D').sum().drop(columns='hour')
            df['hour'] = df.index.get_level_values('hour')

        fig = Figure(constrained_layout=True)
        subplot = fig.subplots()

        plot_common_kwargs = dict(x='hour', y='messages', hue='hour', data=df, ax=subplot, legend=False,
                                  palette='flare')
        sns.stripplot(jitter=.4, size=2, alpha=.5, zorder=1, **plot_common_kwargs)
        sns.boxplot(whis=1, showfliers=False, whiskerprops={"zorder": 10}, boxprops={"zorder": 10}, zorder=10,
                    **plot_common_kwargs)

        subplot.set_ylim(bottom=0, top=df['messages'].quantile(0.999, interpolation='higher'))

        subplot.axvspan(11.5, 23.5, zorder=0, color=(0, 0, 0, 0.05))
        subplot.set_xlim(-1, 24)  # Set explicitly to plot properly even with missing data

        if user:
            subplot.set_ylabel('Messages per Week')
        else:
            subplot.set_ylabel('Messages per Day')

        if lquery:
            subplot.set_title(f"Messages by Hour for {lquery}")
        elif user:
            subplot.set_title(f"Messages by Hour for {user[1]}")
        else:
            subplot.set_title("Messages by Hour")

        sns.despine(fig=fig)

        bio = output_fig(fig)

        return None, bio

    def get_counts_by_day(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None,
                          plot: str = None) -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get plot of messages for days of the week
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param plot: Type of plot. ('box' or 'violin')
        """
        query_conditions = []
        sql_dict = {}

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date_trunc('day', date) as day, count(*) as messages
                FROM messages_utc
                {query_where}
                GROUP BY day
                ORDER BY day
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No matching messages", None

        df['day'] = pd.to_datetime(df.day)
        df['day'] = df.day.dt.tz_convert(self.tz)
        df = df.set_index('day')
        df = df.asfreq('d', fill_value=0)  # Fill periods with no messages
        if (df.index.max() - df.index.min()) < pd.Timedelta('7 days'):  # Deal with data covering < 7 days
            df = df.reindex(pd.date_range(df.index.min(), periods=7, freq='d'))
        df['dow'] = df.index.weekday
        df['day_name'] = df.index.day_name()
        df = df.sort_values('dow')  # Make sure start is Monday

        fig = Figure(constrained_layout=True)
        subplot = fig.subplots()
        if plot == 'box':
            sns.boxplot(x='day_name', y='messages', data=df, whis=1, showfliers=False, ax=subplot)
        elif plot == 'violin' or plot is None:
            sns.violinplot(x='day_name', y='messages', data=df, cut=0, inner="box", scale='width', ax=subplot)
        else:
            raise HelpException("plot must be either box or violin")
        subplot.axvspan(4.5, 6.5, zorder=0, color=(0, .8, 0, 0.1))
        subplot.set_xlabel('')
        subplot.set_ylabel('Messages per Day')
        subplot.set_xlim(-0.5, 6.5)  # Need to set this explicitly to show full range of days with na data

        if lquery:
            subplot.set_title(f"Messages by Day of Week for {lquery}")
        elif user:
            subplot.set_title(f"Messages by Day of Week for {user[1]}")
        else:
            subplot.set_title("Messages by Day of Week")

        sns.despine(fig=fig)

        bio = output_fig(fig)

        return None, bio
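
    # get_week_by_hourday builds a 24 x 7 heatmap: hourly counts are grouped by
    # (hour of day, weekday name), summed, and unstacked so rows are hours and
    # columns are days; the columns are then reindexed Monday..Sunday so the
    # transposed heatmap always shows the week in calendar order, even if some
    # days are missing from the data.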

    def get_week_by_hourday(self, lquery: str = None, user: Tuple[int, str] = None, start: str = None, end: str = None) \
            -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get plot of messages over the week by day and hour.
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        query_conditions = []
        sql_dict = {}

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date_trunc('hour', date) as msg_time, count(*) as messages
                FROM messages_utc
                {query_where}
                GROUP BY msg_time
                ORDER BY msg_time
                """
        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No matching messages", None

        df['msg_time'] = pd.to_datetime(df.msg_time)
        df['msg_time'] = df.msg_time.dt.tz_convert(self.tz)
        df = df.set_index('msg_time')
        df = df.asfreq('h', fill_value=0)  # Fill periods with no messages
        df['dow'] = df.index.weekday
        df['hour'] = df.index.hour
        df['day_name'] = df.index.day_name()
        df_grouped = df[['messages', 'hour', 'day_name']].groupby(['hour', 'day_name']).sum().unstack()
        df_grouped = df_grouped.loc[:, 'messages']
        df_grouped = df_grouped.reindex(columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                                 'Friday', 'Saturday', 'Sunday'])

        fig = Figure(constrained_layout=True)
        ax = fig.subplots()

        sns.heatmap(df_grouped.T, yticklabels=['M', 'T', 'W', 'Th', 'F', 'Sa', 'Su'], linewidths=.5,
                    square=True, fmt='d', vmin=0,
                    cbar_kws={"orientation": "horizontal"}, cmap="viridis", ax=ax)
        ax.tick_params(axis='y', rotation=0)
        ax.set_ylabel("")
        ax.set_xlabel("")
        if lquery:
            ax.set_title(f"Messages by day and hour for {lquery}")
        elif user:
            ax.set_title(f"Total messages by day and hour for {user[1]}")
        else:
            ax.set_title("Total messages by day and hour")

        bio = output_fig(fig)

        return None, bio

    def get_message_history(self, user: Tuple[int, str] = None, lquery: str = None, averages: int = None,
                            start: str = None, end: str = None) \
            -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Make a plot of message history over time
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param averages: Moving average width (in days)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        query_conditions = []
        sql_dict = {}

        if averages:
            if averages < 0:
                raise HelpException("averages must be >= 0")

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date_trunc('day', date) as day, count(*) as messages
                FROM messages_utc
                {query_where}
                GROUP BY day
                ORDER BY day
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No matching messages", None

        df['day'] = pd.to_datetime(df.day)
        df['day'] = df.day.dt.tz_convert(self.tz)
        df = df.set_index('day')
        df = df.resample('1D').sum()

        if averages is None:
            averages = len(df) // 20
            if averages <= 1:
                averages = 0
        if averages:
            df['msg_rolling'] = df['messages'].rolling(averages, center=True).mean()
            alpha = 0.5
        else:
            alpha = 1

        fig = Figure(constrained_layout=True)
        subplot = fig.subplots()
        df.plot(y='messages', alpha=alpha, legend=False, ax=subplot)
        if averages:
            df.plot(y='msg_rolling', legend=False, ax=subplot)
        subplot.set_ylabel("Messages")
        subplot.set_xlabel("Date")
        if lquery:
            subplot.set_title(f"History for query: {lquery}")
        elif user:
            subplot.set_title(f"Message History for {user[1]}")
        else:
            subplot.set_title("Message History")
        sns.despine(fig=fig)
        fig.tight_layout()

        bio = output_fig(fig)

        return None, bio

    def get_title_history(self, start: str = None, end: str = None, duration: bool = False) \
            -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Make a plot of group titles history over time
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param duration: If true, order by duration instead of time.
        """
        query_conditions = []
        sql_dict = {}

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"AND {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date, new_chat_title
                FROM messages_utc
                WHERE type = 'new_chat_title' {query_where}
                ORDER BY date;
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return "No chat titles in range", None

        df['idx'] = np.arange(len(df))
        df['diff'] = -df['date'].diff(-1)
        df['end'] = df['date'] + df['diff']

        if end:
            last = pd.Timestamp(sql_dict['end_dt'], tz=self.tz).tz_convert('utc')
        else:
            last = pd.Timestamp(datetime.utcnow(), tz='utc')

        df_end = df['end']
        df_end.iloc[-1] = last
        df.loc[:, 'end'] = df_end
        df.loc[:, 'diff'].iloc[-1] = df.iloc[-1]['end'] - df.iloc[-1]['date']

        fig = Figure(constrained_layout=True, figsize=(12, 0.15 * len(df)))
        ax = fig.subplots()

        if duration:
            df = df.sort_values('diff')
            df = df.reset_index(drop=True)
            df['idx'] = df.index

            ax.barh(df.idx, df['diff'].dt.days + df['diff'].dt.seconds / 86400, tick_label=df.new_chat_title)

            ax.margins(0.2)
            ax.set_ylabel("")
            ax.set_xlabel("Duration (days)")
            ax.set_ylim(-1, (df.idx.max() + 1))
            ax.set_title("Chat Title History")
            ax.grid(False, which='both', axis='y')
            sns.despine(fig=fig, left=True)

        else:
            x = df.iloc[:-1].end
            y = df.iloc[:-1].idx + .5

            ax.scatter(x, y, zorder=4, color=sns.color_palette()[1])

            titles = list(zip(df.date.apply(date2num),
                              df.end.apply(date2num) - df.date.apply(date2num)))

            for n, i in enumerate(titles):
                ax.broken_barh([i], (n, 1))
                ax.annotate(df.new_chat_title[n], xy=(i[0] + i[1], n), xycoords='data',
                            xytext=(10, 0), textcoords='offset points',
                            horizontalalignment='left', verticalalignment='bottom')

            ax.set_ylim(-1, (df.idx.max() + 1))
            ax.set_xlim(titles[0][0] - 1, None)

            ax.margins(0.2)
            ax.set_ylabel("")
            ax.set_xlabel("")
            ax.set_title("Chat Title History")
            ax.grid(False, which='both', axis='y')
            ax.tick_params(axis='y', which='both', labelleft=False, left=False)
            sns.despine(fig=fig, left=True)

        bio = output_fig(fig)

        return None, bio

    def get_user_summary(self, autouser=None, **kwargs) -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Get summary of a user.
        """
        user: Tuple[int, str] = kwargs['user']
        sql_dict = {'user': user[0]}

        count_query = """
                      SELECT COUNT(*)
                      FROM "messages_utc"
                      WHERE from_user = :user;
                      """

        days_query = """
                     SELECT EXTRACT(epoch FROM(NOW() - MIN(date))) / 86400 as "days"
                     FROM "messages_utc"
                     WHERE from_user = :user;
                     """

        event_query = """
                      SELECT date, event
                      FROM user_events
                      WHERE user_id = :user
                      ORDER BY "date";
                      """

        username_query = """
                         SELECT COUNT(*)
                         FROM "user_names"
                         WHERE user_id = :user;
                         """

        with self.engine.connect() as con:
            result = con.execute(text(count_query), sql_dict)
            msg_count: int = result.fetchall()[0][0]
            result = con.execute(text(days_query), sql_dict)
            days: float = result.fetchall()[0][0]
            result = con.execute(text(event_query), sql_dict)
            events: list = result.fetchall()
            result = con.execute(text(username_query), sql_dict)
            name_count: int = result.fetchall()[0][0]

        event_text = '\n'.join([f'{event.event} on {pd.to_datetime(event.date).tz_convert(self.tz)}'
                                for event in events])

        # Add separator line
        if event_text:
            event_text = '\n' + event_text

        try:
            out_text = f"Messages sent: {msg_count}\n" \
                       f"Average messages per day: {msg_count / days:.2f}\n" \
                       f"First message was {days:.2f} days ago.\n" \
                       f"Usernames on record: {name_count}\n" \
                       f"Average username lifetime: {days / name_count:.2f} days\n" + event_text
        except TypeError:
            return 'No data for user', None

        return f"User {user[1].lstrip('@')}: ```\n{out_text}\n```", None
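
    # get_user_correlation pivots hourly message counts into one column per user
    # and correlates the calling user's column against every other user's. With
    # agg=True the counts are first summed into 168 hour-of-week bins (7 days x
    # 24 hours), so the score reflects similarity of weekly activity patterns
    # and thresh screens other users by relative message volume; with agg=False
    # the raw hourly series are correlated and thresh sets the minimum fraction
    # of overlapping hours via min_periods.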

    def get_user_correlation(self, start: str = None, end: str = None, agg: bool = True, c_type: str = None,
                             n: int = 5, thresh: float = 0.05, autouser=None, **kwargs) -> Tuple[str, None]:
        """
        Return correlations between you and other users.
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param agg: If True, calculate correlation over messages aggregated by hours of the week
        :param c_type: Correlation type to use. Either 'pearson' or 'spearman'
        :param n: Show n highest and lowest correlation scores
        :param thresh: Fraction of time bins that have data for both users to be considered valid (0-1)
        """
        user: Tuple[int, str] = kwargs['user']
        query_conditions = []
        sql_dict = {}

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"WHERE {' AND '.join(query_conditions)}"

        if n <= 0:
            raise HelpException(f'n must be greater than 0, got: {n}')

        if not c_type:
            c_type = 'pearson'
        elif c_type not in ['pearson', 'spearman']:
            raise HelpException("corr must be either pearson or spearman")

        if not 0 <= thresh <= 1:
            raise HelpException(f'thresh must be in the range [0, 1], got: {thresh}')

        query = f"""
                SELECT msg_time, extract(ISODOW FROM msg_time) as dow, extract(HOUR FROM msg_time) as hour,
                       "user", messages
                FROM (
                    SELECT date_trunc('hour', date) as msg_time,
                           count(*) as messages, from_user as "user"
                    FROM messages_utc
                    {query_where}
                    GROUP BY msg_time, from_user
                    ORDER BY msg_time
                    ) t
                ORDER BY dow, hour;
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return 'No messages in range', None

        df['msg_time'] = pd.to_datetime(df.msg_time)
        df['msg_time'] = df.msg_time.dt.tz_convert(self.tz)

        # Prune irrelevant messages (not sure if this actually improves performance)
        user_first_date = df.loc[df.user == user[0], 'msg_time'].iloc[0]
        df = df.loc[df.msg_time >= user_first_date]

        df = df.set_index('msg_time')

        user_dict = {'user': {user_id: value[0] for user_id, value in self.users.items()}}
        df = df.loc[df.user.isin(list(user_dict['user'].keys()))]  # Filter out users with no names
        df = df.replace(user_dict)  # Replace user ids with names
        df['user'] = df['user'].str.replace(r'[^\x00-\x7F]', "", regex=True)

        if agg:
            df = df.pivot_table(index=['dow', 'hour'], columns='user', values='messages', aggfunc='sum')
            corrs = []
            for other_user in df.columns.values:
                if df[user[1]].sum() / df[other_user].sum() > thresh:
                    me_notna = df[user[1]].notna()
                    other_notna = df[other_user].notna()
                    idx = me_notna | other_notna
                    corrs.append(df.loc[idx, user[1]].fillna(0).corr(df.loc[idx, other_user].fillna(0)))
                else:
                    corrs.append(pd.NA)

            me = pd.Series(corrs, index=df.columns.values).sort_values(ascending=False).iloc[1:].dropna()
        else:
            df = df.pivot(columns='user', values='messages')

            if thresh == 0:
                df_corr = df.corr(method=c_type)
            else:
                df_corr = df.corr(method=c_type, min_periods=int(thresh * len(df)))
            me = df_corr[user[1]].sort_values(ascending=False).iloc[1:].dropna()

        if len(me) < 1:
            return "`Sorry, not enough data, try with -aggtimes, decrease -thresh, or use a bigger date range.`", None

        if n > len(me) // 2:
            n = int(len(me) // 2)

        out_text = me.to_string(header=False, float_format=lambda x: f"{x:.3f}")
        split = out_text.splitlines()
        out_text = "\n".join(['HIGHEST CORRELATION:'] + split[:n] + ['\nLOWEST CORRELATION:'] + split[-n:])

        return f"**User Correlations for {escape_markdown(user[1])}**\n```\n{out_text}\n```", None
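
    # The SQL in get_message_deltas uses a gaps-and-islands trick to group each
    # user's consecutive messages into bursts: dense_rank() over all messages
    # minus dense_rank() partitioned by sender is constant within a run of
    # messages from the same person, so grouping by (from_user, grp) collapses
    # each run to its start/end times. The median of the gaps between one
    # burst's start and the previous burst's end is then the "reply delay"
    # between the two users.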

    def get_message_deltas(self, lquery: str = None, start: str = None, end: str = None, n: int = 10, thresh: int = 500,
                           autouser=None, **kwargs) -> Tuple[Union[str, None], Union[None, BytesIO]]:
        """
        Return the median difference in message time between you and other users.
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param n: Show the n users with the shortest median delays
        :param thresh: Only consider users with at least this many message group pairs with you
        """
        user: Tuple[int, str] = kwargs['user']
        query_conditions = []
        sql_dict = {}

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"AND {' AND '.join(query_conditions)}"

        if n <= 0:
            raise HelpException('n must be greater than 0')

        if thresh < 0:
            raise HelpException('thresh cannot be negative')

        def fetch_mean_delta(me: int, other: int, where: str, sql_dict: dict) -> Tuple[timedelta, int]:
            query = f"""
                    select percentile_cont(0.5) within group (order by t_delta), count(t_delta)
                    from(
                        select start - lag("end", 1) over (order by start) as t_delta
                        from (
                            select min(date) as start, max(date) as "end"
                            from (select date, from_user,
                                         (dense_rank() over (order by date) -
                                          dense_rank() over (partition by from_user order by date)
                                          ) as grp
                                  from messages_utc
                                  where from_user in (:me, :other) {where}
                                  order by date
                                  ) t
                            group by from_user, grp
                            order by start
                            ) t1
                        ) t2;
                    """

            sql_dict['me'] = me
            sql_dict['other'] = other

            with self.engine.connect() as con:
                result = con.execute(text(query), sql_dict)
                output: Tuple[timedelta, int] = result.fetchall()[0]

            return output

        results = {other: fetch_mean_delta(user[0], other, query_where, sql_dict) for other in self.users
                   if user[0] != other}

        user_deltas = {self.users[other][0]: pd.to_timedelta(result[0]) for other, result in results.items()
                       if result[1] > thresh}

        me = pd.Series(user_deltas).sort_values()
        me = me.apply(lambda x: x.round('1s'))

        if len(me) < 1:
            return "\n```\nSorry, not enough data, try a bigger date range or decrease -thresh.\n```", None

        out_text = me.iloc[:n].to_string(header=False, index=True)

        return f"**Median message delays for {escape_markdown(user[1])} and:**\n```\n{out_text}\n```", None

    def get_type_stats(self, start: str = None, end: str = None, autouser=None, **kwargs) -> Tuple[str, None]:
        """
        Print table of message statistics by type.
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        user: Tuple[int, str] = kwargs['user']
        query_conditions = []
        sql_dict = {}

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f" AND {' AND '.join(query_conditions)}"

        query = f"""
                SELECT type, count(*) as count
                FROM messages_utc
                WHERE type NOT IN ('new_chat_members', 'left_chat_member', 'new_chat_photo',
                                   'new_chat_title', 'migrate_from_group', 'pinned_message')
                {query_where}
                GROUP BY type
                ORDER BY count DESC;
                """

        with self.engine.connect() as con:
            df = pd.read_sql_query(text(query), con, params=sql_dict)

        if len(df) == 0:
            return 'No messages in range', None

        df['Group Percent'] = df['count'] / df['count'].sum() * 100
        df.columns = ['type', 'Group Count', 'Group Percent']

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

            query = f"""
                    SELECT type, count(*) as user_count
                    FROM messages_utc
                    WHERE type NOT IN ('new_chat_members', 'left_chat_member', 'new_chat_photo',
                                       'new_chat_title', 'migrate_from_group', 'pinned_message')
                    AND {' AND '.join(query_conditions)}
                    GROUP BY type
                    ORDER BY user_count DESC;
                    """
            with self.engine.connect() as con:
                df_u = pd.read_sql_query(text(query), con, params=sql_dict)
            df_u['User Percent'] = df_u['user_count'] / df_u['user_count'].sum() * 100
            df_u.columns = ['type', 'User Count', 'User Percent']

            df = df.merge(df_u, on="type", how="outer")

        a = list(zip(df.columns.values, ["Total"] + df.iloc[:, 1:].sum().to_list()))
        df = pd.concat((df, pd.DataFrame({key: [value] for key, value in a})), ignore_index=True)

        df['Group Count'] = df['Group Count'].astype('Int64')
        try:
            df['User Count'] = df['User Count'].astype('Int64')
        except KeyError:
            pass

        out_text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        if user:
            return f"**Messages by type, {escape_markdown(user[1])} vs group:**\n```\n{out_text}\n```", None
        else:
            return f"**Messages by type:**\n```\n{out_text}\n```", None
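
    # get_word_stats leans on TsStat from .utils which, judging by its use here,
    # wraps PostgreSQL's ts_stat() over the tsvector column: for every lexeme it
    # reports ndoc (how many messages contain it) and nentry (total occurrences).
    # Those two columns are what end up labelled "Messages" and "Uses" below.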

    def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                       user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
        Print table of lexeme statistics.
        :param n: Only consider lexemes with length of at least n
        :param limit: Number of top lexemes to return
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """

        q = select(messages.c['text_index_col'])

        if user:
            q = q.where(messages.c['from_user'] == user[0])
        if start:
            q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
        if end:
            q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

        q = q.scalar_subquery()
        f = TsStat(q)
        stmt = select(f.columns['word'], f.columns['ndoc'], f.columns['nentry']) \
            .select_from(f)

        if n:
            stmt = stmt.where(func.length(f.columns['word']) >= n)

        stmt = stmt.order_by(f.columns['nentry'].desc(),
                             f.columns['ndoc'].desc(),
                             f.columns['word'])

        if limit:
            stmt = stmt.limit(limit) \
                .compile(dialect=postgresql.dialect())

        with self.engine.connect() as con:
            df = pd.read_sql_query(stmt, con)

        if len(df) == 0:
            return 'No messages in range', None

        df.columns = ['Lexeme', 'Messages', 'Uses']

        out_text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        if user:
            return f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**\n```\n{out_text}\n```", None
        else:
            return f"**Most frequently used lexemes, all users:**\n```\n{out_text}\n```", None

    def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
                           user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
        Display a random message.
        :param lquery: Limit results to lexical query (&, |, !, <n>)
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        query_conditions = []
        sql_dict = {}

        if lquery:
            query_conditions.append(f"text_index_col @@ to_tsquery( {random_quote(lquery)} )")

        if user:
            sql_dict['user'] = user[0]
            query_conditions.append("from_user = :user")

        if start:
            sql_dict['start_dt'] = pd.to_datetime(start)
            query_conditions.append("date >= :start_dt")

        if end:
            sql_dict['end_dt'] = pd.to_datetime(end)
            query_conditions.append("date < :end_dt")

        query_where = ""
        if query_conditions:
            query_where = f"AND {' AND '.join(query_conditions)}"

        query = f"""
                SELECT date, from_user, text
                FROM messages_utc
                WHERE type = 'text'
                {query_where}
                ORDER BY RANDOM()
                LIMIT 1;
                """

        with self.engine.connect() as con:
            result = con.execute(text(query), sql_dict)
            try:
                date, from_user, out_text = result.fetchall()[0]
            except IndexError:
                return "No matching messages", None

        return f"*On {escape_markdown(date.strftime('%Y-%m-%d'))}, " \
               f"{escape_markdown(self.users[from_user][0]).lstrip('@')}" \
               f" gave these words of wisdom:*\n" \
               f"{escape_markdown(out_text)}\n", \
               None


def get_parser(runner: StatsRunner) -> InternalParser:
    parser = InternalParser(prog="/stats")
    parser.set_defaults(func=runner.get_chat_counts)
    subparsers = parser.add_subparsers(title="Statistics:")

    parser.add_argument('-v', '--version', action='version', version=__version__)

    for name, func in runner.allowed_methods.items():
        try:
            doc = inspect.getdoc(getattr(runner, func)).splitlines()
        except AttributeError:
            doc = None
        subparser = subparsers.add_parser(name, help=doc[0] if doc else None)
        subparser.set_defaults(func=getattr(runner, func))
        f_args = inspect.signature(getattr(runner, func)).parameters

        for _, arg in f_args.items():
            arg: inspect.Parameter
            if arg.name == 'self':
                continue
            if arg.name == 'user':
                group = subparser.add_mutually_exclusive_group()
                group.add_argument('-me', action='store_true', help='calculate stats for yourself')
                group.add_argument('-user', type=int, help=argparse.SUPPRESS)
            elif arg.name == 'autouser':
                subparser.set_defaults(me=True)
                subparser.add_argument('-user', type=int, help=argparse.SUPPRESS)
            elif arg.name == 'kwargs':
                pass
            else:
                arg_doc = None
                if doc:
                    for line in doc:
                        match = re.match(rf"^:param {arg.name}: (.*)", line)
                        if match:
                            arg_doc = match.group(1)

                if arg.annotation == bool:
                    subparser.add_argument(f"-{arg.name}".replace('_', '-'), action='store_true', help=arg_doc)
                else:
                    subparser.add_argument(f"-{arg.name}".replace('_', '-'), type=arg.annotation, help=arg_doc,
                                           default=arg.default)

    return parser
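

# A minimal usage sketch, assuming a PostgreSQL database already populated by
# the bot (illustrative only; the real dispatch lives in the bot's command
# handler and also resolves the -me/-user flags before calling a stats method,
# and the connection URL below is a placeholder):
#
#     from sqlalchemy import create_engine
#
#     engine = create_engine("postgresql://user:pass@localhost/telegram")
#     runner = StatsRunner(engine, tz="America/Toronto")
#     parser = get_parser(runner)
#
#     ns = parser.parse_args([])          # no subcommand -> defaults to get_chat_counts
#     kwargs = vars(ns)
#     func = kwargs.pop('func')
#     text_reply, image = func(**kwargs)  # (markdown string or None, BytesIO plot or None)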