stats: add lexical query to most stats

pull/6/head
Michael D. M. Dryden 2021-06-07 00:15:53 -04:00
rodzic eee136dbb8
commit 7510569aa5
3 zmienionych plików z 75 dodań i 24 usunięć

Wyświetl plik

@ -12,7 +12,7 @@ and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0
Added Added
----- -----
- Read version from bot - Read version from bot
- stats: add lexical query to history - stats: add lexical query to several stats
Removed Removed
------- -------

Wyświetl plik

@ -174,6 +174,10 @@ Nearly all have:
range of data to fetch, otherwise all available data will be used. range of data to fetch, otherwise all available data will be used.
Either or both options can be given. Either or both options can be given.
- ``lquery`` followed by a lexical query (using Postgres'
`tsquery syntax <https://www.postgresql.org/docs/12/datatype-textsearch.html#DATATYPE-TSQUERY>`_)
limits results to matching messages.
- ``-me`` calculates statistics for the user sending the command, rather than all chat users. - ``-me`` calculates statistics for the user sending the command, rather than all chat users.
Sample outputs of each available subcommand follow. Sample outputs of each available subcommand follow.
@ -216,9 +220,6 @@ week
history history
------- -------
``/stats history`` returns a plot of messages versus date. ``/stats history`` returns a plot of messages versus date.
Allows limiting by a lexical query (using Postgres'
`tsquery syntax <https://www.postgresql.org/docs/12/datatype-textsearch.html#DATATYPE-TSQUERY>`_)
with the ``lquery`` option.
.. image:: examples/history.png .. image:: examples/history.png
:alt: Example of history plot :alt: Example of history plot

Wyświetl plik

@ -20,7 +20,7 @@
# along with this program. If not, see [http://www.gnu.org/licenses/]. # along with this program. If not, see [http://www.gnu.org/licenses/].
import logging import logging
from typing import Dict, List, Tuple, Text, NoReturn from typing import Dict, List, Tuple, Text, NoReturn, Union
from threading import Lock from threading import Lock
from io import BytesIO from io import BytesIO
import argparse import argparse
@ -133,20 +133,23 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
con.execute(query, sql_dict) con.execute(query, sql_dict)
def get_chat_counts(self, n: int = 20, start: str = None, end: str = None) -> Tuple[str, None]: def get_chat_counts(self, n: int = 20, lquery: str = None, start: str = None, end: str = None) -> Tuple[str, None]:
""" """
Get top chat users Get top chat users
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param n: Number of users to show :param n: Number of users to show
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
""" """
date_query = None
sql_dict = {} sql_dict = {}
query_conditions = [] query_conditions = []
if n <= 0: if n <= 0:
raise HelpException(f'n must be greater than 0, got: {n}') raise HelpException(f'n must be greater than 0, got: {n}')
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start: if start:
sql_dict['start_dt'] = pd.to_datetime(start) sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s") query_conditions.append("date >= %(start_dt)s")
@ -174,6 +177,9 @@ class StatsRunner(object):
df = df.join(user_df) df = df.join(user_df)
df['Percent'] = df['count'] / df['count'].sum() * 100 df['Percent'] = df['count'] / df['count'].sum() * 100
df = df[['user', 'count', 'Percent']] df = df[['user', 'count', 'Percent']]
if lquery:
df.columns = ['User', lquery, 'Percent']
else:
df.columns = ['User', 'Total Messages', 'Percent'] df.columns = ['User', 'Total Messages', 'Percent']
df['User'] = df['User'].str.replace(r'[^\x00-\x7F]', "", regex=True) # Drop emoji df['User'] = df['User'].str.replace(r'[^\x00-\x7F]', "", regex=True) # Drop emoji
@ -181,16 +187,20 @@ class StatsRunner(object):
return f"```\n{text}\n```", None return f"```\n{text}\n```", None
def get_counts_by_hour(self, user: Tuple[int, str] = None, start: str = None, end: str = None) \ def get_counts_by_hour(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None) \
-> Tuple[None, BytesIO]: -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Get plot of messages for hours of the day Get plot of messages for hours of the day
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
""" """
query_conditions = [] query_conditions = []
sql_dict = {} sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start: if start:
sql_dict['start_dt'] = pd.to_datetime(start) sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s") query_conditions.append("date >= %(start_dt)s")
@ -218,6 +228,9 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict) df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day) df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz) df['day'] = df.day.dt.tz_convert(self.tz)
df = df.set_index('day') df = df.set_index('day')
@ -245,9 +258,12 @@ class StatsRunner(object):
subplot.axvspan(11.5, 23.5, zorder=0, color=(0, 0, 0, 0.05)) subplot.axvspan(11.5, 23.5, zorder=0, color=(0, 0, 0, 0.05))
subplot.set_xlim(-1, 24) # Set explicitly to plot properly even with missing data subplot.set_xlim(-1, 24) # Set explicitly to plot properly even with missing data
if lquery:
subplot.set_title(f"Messages by Hour for {lquery}")
elif user:
subplot.set_title(f"Messages by Hour for {user[1]}")
if user: if user:
subplot.set_ylabel('Messages per Week') subplot.set_ylabel('Messages per Week')
subplot.set_title(f"Messages by Hour for {user[1]}")
else: else:
subplot.set_ylabel('Messages per Day') subplot.set_ylabel('Messages per Day')
subplot.set_title("Messages by Hour") subplot.set_title("Messages by Hour")
@ -261,10 +277,11 @@ class StatsRunner(object):
return None, bio return None, bio
def get_counts_by_day(self, user: Tuple[int, str] = None, start: str = None, end: str = None, plot: str = None) \ def get_counts_by_day(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None,
-> Tuple[None, BytesIO]: plot: str = None) -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Get plot of messages for days of the week Get plot of messages for days of the week
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param plot: Type of plot. ('box' or 'violin') :param plot: Type of plot. ('box' or 'violin')
@ -272,6 +289,9 @@ class StatsRunner(object):
query_conditions = [] query_conditions = []
sql_dict = {} sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start: if start:
sql_dict['start_dt'] = pd.to_datetime(start) sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s") query_conditions.append("date >= %(start_dt)s")
@ -300,6 +320,9 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict) df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day) df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz) df['day'] = df.day.dt.tz_convert(self.tz)
df = df.set_index('day') df = df.set_index('day')
@ -322,7 +345,10 @@ class StatsRunner(object):
subplot.set_xlabel('') subplot.set_xlabel('')
subplot.set_ylabel('Messages per Day') subplot.set_ylabel('Messages per Day')
subplot.set_xlim(-0.5, 6.5) # Need to set this explicitly to show full range of days with na data subplot.set_xlim(-0.5, 6.5) # Need to set this explicitly to show full range of days with na data
if user:
if lquery:
subplot.set_title(f"Messages by Day of Week for {lquery}")
elif user:
subplot.set_title(f"Messages by Day of Week for {user[1]}") subplot.set_title(f"Messages by Day of Week for {user[1]}")
else: else:
subplot.set_title("Messages by Day of Week") subplot.set_title("Messages by Day of Week")
@ -336,16 +362,20 @@ class StatsRunner(object):
return None, bio return None, bio
def get_week_by_hourday(self, user: Tuple[int, str] = None, start: str = None, end: str = None) \ def get_week_by_hourday(self, lquery: str = None, user: Tuple[int, str] = None, start: str = None, end: str = None) \
-> Tuple[None, BytesIO]: -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Get plot of messages over the week by day and hour. Get plot of messages over the week by day and hour.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
""" """
query_conditions = [] query_conditions = []
sql_dict = {} sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start: if start:
sql_dict['start_dt'] = pd.to_datetime(start) sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s") query_conditions.append("date >= %(start_dt)s")
@ -373,8 +403,11 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict) df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['msg_time'] = pd.to_datetime(df.msg_time) df['msg_time'] = pd.to_datetime(df.msg_time)
df['msg_time'] = df.msg_time.dt.tz_convert('America/Toronto') df['msg_time'] = df.msg_time.dt.tz_convert(self.tz)
df = df.set_index('msg_time') df = df.set_index('msg_time')
df = df.asfreq('h', fill_value=0) # Fill periods with no messages df = df.asfreq('h', fill_value=0) # Fill periods with no messages
df['dow'] = df.index.weekday df['dow'] = df.index.weekday
@ -394,7 +427,9 @@ class StatsRunner(object):
ax.tick_params(axis='y', rotation=0) ax.tick_params(axis='y', rotation=0)
ax.set_ylabel("") ax.set_ylabel("")
ax.set_xlabel("") ax.set_xlabel("")
if user: if lquery:
ax.set_title(f"Messages by day and hour for {lquery}")
elif user:
ax.set_title(f"Total messages by day and hour for {user[1]}") ax.set_title(f"Total messages by day and hour for {user[1]}")
else: else:
ax.set_title("Total messages by day and hour") ax.set_title("Total messages by day and hour")
@ -408,7 +443,7 @@ class StatsRunner(object):
def get_message_history(self, user: Tuple[int, str] = None, lquery: str = None, averages: int = None, start: str = None, def get_message_history(self, user: Tuple[int, str] = None, lquery: str = None, averages: int = None, start: str = None,
end: str = None) \ end: str = None) \
-> Tuple[None, BytesIO]: -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Make a plot of message history over time Make a plot of message history over time
:param lquery: Limit results to lexical query (&, |, !, <n>) :param lquery: Limit results to lexical query (&, |, !, <n>)
@ -453,6 +488,10 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict) df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day) df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz) df['day'] = df.day.dt.tz_convert(self.tz)
@ -490,7 +529,7 @@ class StatsRunner(object):
return None, bio return None, bio
def get_title_history(self, start: str = None, end: str = None, duration: bool = False) \ def get_title_history(self, start: str = None, end: str = None, duration: bool = False) \
-> Tuple[None, BytesIO]: -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Make a plot of group titles history over time Make a plot of group titles history over time
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
@ -690,10 +729,11 @@ class StatsRunner(object):
return f"**User Correlations for {escape_markdown(user[1])}**\n```\n{text}\n```", None return f"**User Correlations for {escape_markdown(user[1])}**\n```\n{text}\n```", None
def get_message_deltas(self, start: str = None, end: str = None, n: int = 10, thresh: int = 500, def get_message_deltas(self, lquery: str = None, start: str = None, end: str = None, n: int = 10, thresh: int = 500,
autouser=None, **kwargs) -> Tuple[str, None]: autouser=None, **kwargs) -> Tuple[Union[str, None], Union[None, BytesIO]]:
""" """
Return the median difference in message time between you and other users. Return the median difference in message time between you and other users.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param n: Show n highest and lowest correlation scores :param n: Show n highest and lowest correlation scores
@ -703,6 +743,9 @@ class StatsRunner(object):
query_conditions = [] query_conditions = []
sql_dict = {} sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start: if start:
sql_dict['start_dt'] = pd.to_datetime(start) sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s") query_conditions.append("date >= %(start_dt)s")
@ -838,16 +881,20 @@ class StatsRunner(object):
else: else:
return f"**Messages by type:**\n```\n{text}\n```", None return f"**Messages by type:**\n```\n{text}\n```", None
def get_random_message(self, start: str = None, end: str = None, def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]: user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
""" """
Display a random message. Display a random message.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21") :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
""" """
query_conditions = [] query_conditions = []
sql_dict = {} sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if user: if user:
sql_dict['user'] = user[0] sql_dict['user'] = user[0]
query_conditions.append("from_user = %(user)s") query_conditions.append("from_user = %(user)s")
@ -875,7 +922,10 @@ class StatsRunner(object):
with self.engine.connect() as con: with self.engine.connect() as con:
result = con.execute(query, sql_dict) result = con.execute(query, sql_dict)
try:
date, from_user, text = result.fetchall()[0] date, from_user, text = result.fetchall()[0]
except IndexError:
return "No matching messages", None
return f"*On {escape_markdown(date.strftime('%Y-%m-%d'))}, {escape_markdown(self.users[from_user][0])}" \ return f"*On {escape_markdown(date.strftime('%Y-%m-%d'))}, {escape_markdown(self.users[from_user][0])}" \
f" gave these words of wisdom:*\n" \ f" gave these words of wisdom:*\n" \