stats: add lexical query to most stats

pull/6/head
Michael D. M. Dryden 2021-06-07 00:15:53 -04:00
parent eee136dbb8
commit 7510569aa5
3 changed files with 75 additions and 24 deletions

View file

@@ -12,7 +12,7 @@ and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0
Added
-----
- Read version from bot
- stats: add lexical query to history
- stats: add lexical query to several stats
Removed
-------

View file

@@ -174,6 +174,10 @@ Nearly all have:
range of data to fetch, otherwise all available data will be used.
Either or both options can be given.
- ``lquery`` followed by a lexical query (using Postgres'
`tsquery syntax <https://www.postgresql.org/docs/12/datatype-textsearch.html#DATATYPE-TSQUERY>`_)
limits results to matching messages.
- ``-me`` calculates statistics for the user sending the command, rather than all chat users.
Sample outputs of each available subcommand follow.
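For a rough sense of the tsquery operators referenced above (``&`` = and, ``|`` = or, ``!`` = not, ``<n>`` = followed within n words), here is a minimal self-contained sketch run against a scratch Postgres database; the connection string is a placeholder and nothing here reflects the bot's own schema or settings:

import sqlalchemy as sa

# Placeholder DSN for a throwaway database; not the bot's connection settings.
engine = sa.create_engine("postgresql://user:pass@localhost/scratch")

with engine.connect() as con:
    matched = con.execute(
        sa.text("SELECT to_tsvector('cats are better than dogs') @@ to_tsquery(:q)"),
        {"q": "cats & !hamsters"},  # mentions cats AND does not mention hamsters
    ).scalar()
    print(matched)  # True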
@@ -216,9 +220,6 @@ week
history
-------
``/stats history`` returns a plot of messages versus date.
Allows limiting by a lexical query (using Postgres'
`tsquery syntax <https://www.postgresql.org/docs/12/datatype-textsearch.html#DATATYPE-TSQUERY>`_)
with the ``lquery`` option.
.. image:: examples/history.png
:alt: Example of history plot
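With the option now documented globally, a history plot restricted to a single term can be requested with something like the line below; quoting rules for multi-word queries are not spelled out in this diff, so a one-word query keeps the example on safe ground:

/stats history lquery cats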

View file

@@ -20,7 +20,7 @@
# along with this program. If not, see [http://www.gnu.org/licenses/].
import logging
from typing import Dict, List, Tuple, Text, NoReturn
from typing import Dict, List, Tuple, Text, NoReturn, Union
from threading import Lock
from io import BytesIO
import argparse
@@ -133,20 +133,23 @@ class StatsRunner(object):
with self.engine.connect() as con:
con.execute(query, sql_dict)
def get_chat_counts(self, n: int = 20, start: str = None, end: str = None) -> Tuple[str, None]:
def get_chat_counts(self, n: int = 20, lquery: str = None, start: str = None, end: str = None) -> Tuple[str, None]:
"""
Get top chat users
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param n: Number of users to show
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
"""
date_query = None
sql_dict = {}
query_conditions = []
if n <= 0:
raise HelpException(f'n must be greater than 0, got: {n}')
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start:
sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s")
@@ -174,23 +177,30 @@ class StatsRunner(object):
df = df.join(user_df)
df['Percent'] = df['count'] / df['count'].sum() * 100
df = df[['user', 'count', 'Percent']]
df.columns = ['User', 'Total Messages', 'Percent']
if lquery:
df.columns = ['User', lquery, 'Percent']
else:
df.columns = ['User', 'Total Messages', 'Percent']
df['User'] = df['User'].str.replace(r'[^\x00-\x7F]', "", regex=True) # Drop emoji
text = df.iloc[:n].to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")
return f"```\n{text}\n```", None
def get_counts_by_hour(self, user: Tuple[int, str] = None, start: str = None, end: str = None) \
-> Tuple[None, BytesIO]:
def get_counts_by_hour(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None) \
-> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Get plot of messages for hours of the day
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
"""
query_conditions = []
sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start:
sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s")
@@ -218,6 +228,9 @@ class StatsRunner(object):
with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz)
df = df.set_index('day')
@@ -245,9 +258,12 @@ class StatsRunner(object):
subplot.axvspan(11.5, 23.5, zorder=0, color=(0, 0, 0, 0.05))
subplot.set_xlim(-1, 24) # Set explicitly to plot properly even with missing data
if lquery:
subplot.set_title(f"Messages by Hour for {lquery}")
elif user:
subplot.set_title(f"Messages by Hour for {user[1]}")
if user:
subplot.set_ylabel('Messages per Week')
subplot.set_title(f"Messages by Hour for {user[1]}")
else:
subplot.set_ylabel('Messages per Day')
subplot.set_title("Messages by Hour")
@@ -261,10 +277,11 @@ class StatsRunner(object):
return None, bio
def get_counts_by_day(self, user: Tuple[int, str] = None, start: str = None, end: str = None, plot: str = None) \
-> Tuple[None, BytesIO]:
def get_counts_by_day(self, user: Tuple[int, str] = None, lquery: str = None, start: str = None, end: str = None,
plot: str = None) -> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Get plot of messages for days of the week
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param plot: Type of plot. ('box' or 'violin')
@@ -272,6 +289,9 @@ class StatsRunner(object):
query_conditions = []
sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start:
sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s")
@@ -300,6 +320,9 @@ class StatsRunner(object):
with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz)
df = df.set_index('day')
@@ -322,7 +345,10 @@ class StatsRunner(object):
subplot.set_xlabel('')
subplot.set_ylabel('Messages per Day')
subplot.set_xlim(-0.5, 6.5) # Need to set this explicitly to show full range of days with na data
if user:
if lquery:
subplot.set_title(f"Messages by Day of Week for {lquery}")
elif user:
subplot.set_title(f"Messages by Day of Week for {user[1]}")
else:
subplot.set_title("Messages by Day of Week")
@@ -336,16 +362,20 @@ class StatsRunner(object):
return None, bio
def get_week_by_hourday(self, user: Tuple[int, str] = None, start: str = None, end: str = None) \
-> Tuple[None, BytesIO]:
def get_week_by_hourday(self, lquery: str = None, user: Tuple[int, str] = None, start: str = None, end: str = None) \
-> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Get plot of messages over the week by day and hour.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
"""
query_conditions = []
sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start:
sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s")
@@ -373,8 +403,11 @@ class StatsRunner(object):
with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['msg_time'] = pd.to_datetime(df.msg_time)
df['msg_time'] = df.msg_time.dt.tz_convert('America/Toronto')
df['msg_time'] = df.msg_time.dt.tz_convert(self.tz)
df = df.set_index('msg_time')
df = df.asfreq('h', fill_value=0) # Fill periods with no messages
df['dow'] = df.index.weekday
@@ -394,7 +427,9 @@ class StatsRunner(object):
ax.tick_params(axis='y', rotation=0)
ax.set_ylabel("")
ax.set_xlabel("")
if user:
if lquery:
ax.set_title(f"Messages by day and hour for {lquery}")
elif user:
ax.set_title(f"Total messages by day and hour for {user[1]}")
else:
ax.set_title("Total messages by day and hour")
@@ -408,7 +443,7 @@ class StatsRunner(object):
def get_message_history(self, user: Tuple[int, str] = None, lquery: str = None, averages: int = None, start: str = None,
end: str = None) \
-> Tuple[None, BytesIO]:
-> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Make a plot of message history over time
:param lquery: Limit results to lexical query (&, |, !, <n>)
@@ -453,6 +488,10 @@ class StatsRunner(object):
with self.engine.connect() as con:
df = pd.read_sql_query(query, con, params=sql_dict)
if len(df) == 0:
return "No matching messages", None
df['day'] = pd.to_datetime(df.day)
df['day'] = df.day.dt.tz_convert(self.tz)
@@ -490,7 +529,7 @@ class StatsRunner(object):
return None, bio
def get_title_history(self, start: str = None, end: str = None, duration: bool = False) \
-> Tuple[None, BytesIO]:
-> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Make a plot of group titles history over time
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
@@ -690,10 +729,11 @@ class StatsRunner(object):
return f"**User Correlations for {escape_markdown(user[1])}**\n```\n{text}\n```", None
def get_message_deltas(self, start: str = None, end: str = None, n: int = 10, thresh: int = 500,
autouser=None, **kwargs) -> Tuple[str, None]:
def get_message_deltas(self, lquery: str = None, start: str = None, end: str = None, n: int = 10, thresh: int = 500,
autouser=None, **kwargs) -> Tuple[Union[str, None], Union[None, BytesIO]]:
"""
Return the median difference in message time between you and other users.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param n: Show n highest and lowest correlation scores
@@ -703,6 +743,9 @@ class StatsRunner(object):
query_conditions = []
sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if start:
sql_dict['start_dt'] = pd.to_datetime(start)
query_conditions.append("date >= %(start_dt)s")
@@ -838,16 +881,20 @@ class StatsRunner(object):
else:
return f"**Messages by type:**\n```\n{text}\n```", None
def get_random_message(self, start: str = None, end: str = None,
def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
"""
Display a random message.
:param lquery: Limit results to lexical query (&, |, !, <n>)
:param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
:param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
"""
query_conditions = []
sql_dict = {}
if lquery:
query_conditions.append(f"text_index_col @@ to_tsquery('{lquery}')")
if user:
sql_dict['user'] = user[0]
query_conditions.append("from_user = %(user)s")
@@ -875,7 +922,10 @@ class StatsRunner(object):
with self.engine.connect() as con:
result = con.execute(query, sql_dict)
date, from_user, text = result.fetchall()[0]
try:
date, from_user, text = result.fetchall()[0]
except IndexError:
return "No matching messages", None
return f"*On {escape_markdown(date.strftime('%Y-%m-%d'))}, {escape_markdown(self.users[from_user][0])}" \
f" gave these words of wisdom:*\n" \