stats: Add lexeme frequency

Michael D. M. Dryden 2021-06-11 21:38:23 -04:00
rodzic ed51680630
commit 80d2c0848c
5 zmienionych plików z 129 dodań i 2 usunięć

Wyświetl plik

@ -12,6 +12,7 @@ Unreleased
- Allow limiting counts by message type
- stats: Added words statistic

Wyświetl plik

@ -57,6 +57,8 @@ Table of contents
- `types`_
- `words`_
- `random`_
- `License`_
@ -286,6 +288,35 @@ types
document 1.0 0.0 1.0 0.0
Total 598640.0 100.0 16693.0 100.0
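``/stats words`` returns a table of the most commonly used lexemes. Lexemes are normalized word stems, so entries such as ``realli`` and ``peopl`` stand for *really* and *people*.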
.. code::
Most frequently used lexemes:
Lexeme Messages Uses
like 1265 1334
well 753 765
actual 628 645
make 600 619
yeah 609 609
mean 544 553
thing 473 490
realli 472 482
though 467 470
peopl 415 445
think 425 433
know 403 409
need 396 408
time 371 389
want 354 371
would 345 366
much 345 357
probabl 348 356
even 331 338
stuff 318 332
``/stats random`` prints a random message from the database.

Wyświetl plik

@ -21,11 +21,20 @@
import logging
from sqlalchemy import Column, Table, MetaData
from sqlalchemy.engine import Engine
from sqlalchemy.dialects import postgresql
from sqlalchemy.types import TIMESTAMP, BigInteger
logger = logging.getLogger(__name__)
metadata = MetaData()
# Table object for ``messages_utc``, registered on ``metadata``. It declares
# three columns: the message timestamp, the sender id, and the full-text
# search vector.
# NOTE(review): the database table may hold more columns than are declared
# here — confirm against the schema created in init_dbs.
messages = Table('messages_utc', metadata,
                 Column('date', TIMESTAMP),
                 Column('from_user', BigInteger),
                 Column('text_index_col', postgresql.TSVECTOR))
def init_dbs(engine: Engine):
sql = """

Wyświetl plik

@ -34,8 +34,11 @@ import numpy as np
from matplotlib.figure import Figure
from matplotlib.dates import date2num
from sqlalchemy.engine import Engine
from sqlalchemy import select, func
from sqlalchemy.dialects import postgresql
from .utils import escape_markdown
from .utils import escape_markdown, TsStat, random_quote
from .db import messages
from . import __version__
@ -77,6 +80,7 @@ class StatsRunner(object):
'corr': "get_user_correlation",
'delta': "get_message_deltas",
'types': "get_type_stats",
'words': "get_word_stats",
'random': "get_random_message"}
def __init__(self, engine: Engine, tz: str = 'America/Toronto'):
@ -896,6 +900,53 @@ class StatsRunner(object):
return f"**Messages by type:**\n```\n{text}\n```", None
def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                   user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
    """
    Print table of lexeme statistics.

    :param n: Only consider lexemes with length of at least n
    :param limit: Number of top lexemes to return
    :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param user: Optional (user id, user name) pair; when given, only that user's messages are counted
    :return: Markdown-formatted table text and ``None``
    """
    # Inner query: the text-search vectors of every message matching the filters.
    q = select(messages.c['text_index_col'])

    if user:
        q = q.where(messages.c['from_user'] == user[0])
    # Filter on the caller-supplied bounds; the previous code compared against
    # a hard-coded '2019' and silently ignored both arguments.
    if start:
        q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
    if end:
        q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

    q = q.scalar_subquery()
    f = TsStat(q)

    # NOTE(review): the continuation of this statement was truncated in the
    # reviewed text; ``.select_from(f)`` is assumed — confirm.
    stmt = select([f.c['word'], f.c['ndoc'], f.c['nentry']]) \
        .select_from(f)

    if n:
        stmt = stmt.where(func.length(f.c['word']) >= n)

    # Most-used lexemes first.
    # NOTE(review): only the first ordering key was visible in the reviewed
    # text; the tie-breakers below are assumed — confirm.
    stmt = stmt.order_by(f.c.nentry.desc(),
                         f.c.ndoc.desc(),
                         f.c.word)

    if limit:
        # NOTE(review): the continuation after ``.limit(limit)`` was truncated in
        # the reviewed text; compiling for the PostgreSQL dialect is assumed
        # (the module imports ``postgresql``) — confirm.
        stmt = stmt.limit(limit)\
                   .compile(dialect=postgresql.dialect())

    with self.engine.connect() as con:
        df = pd.read_sql_query(stmt, con)

    df.columns = ['Lexeme', 'Messages', 'Uses']

    text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

    if user:
        # Close the bold span with ':**' so the header matches the all-users variant.
        return f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**\n```\n{text}\n```", None
    return f"**Most frequently used lexemes, all users:**\n```\n{text}\n```", None
def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:

Wyświetl plik

@ -18,9 +18,16 @@
# You should have received a copy of the GNU Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import string
import secrets
import re
from sqlalchemy import Column, Integer, Text
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.functions import FunctionElement
from sqlalchemy.sql.base import ColumnCollection
md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+-=|{}.!\\])")
@ -31,3 +38,31 @@ def escape_markdown(string: str) -> str:
return f'\\{}'
return re.sub(md_match, url_match, string)
# Modified from
class TsStat(FunctionElement):
    """SQL function element named ``ts_stat`` that exposes ``word``, ``ndoc`` and ``nentry`` columns."""
    name = "ts_stat"

    # Exposed as a property because callers subscript the collection directly
    # (e.g. ``f.c['word']``) rather than calling it.
    @property
    def columns(self):
        """Return the function's result columns as a collection keyed by column name."""
        word = Column('word', Text)
        ndoc = Column('ndoc', Integer)
        nentry = Column('nentry', Integer)
        # ColumnCollection takes (key, column) pairs; each column is keyed by its own name.
        return ColumnCollection(columns=((col.name, col) for col in (word, ndoc, nentry)))
@compiles(TsStat, 'postgresql')
def pg_ts_stat(element, compiler, **kw):
    """Render a TsStat element for PostgreSQL as ``ts_stat(<dollar-quoted SELECT>)``."""
    # Any ``asfrom`` supplied by the caller is discarded; it is set explicitly below.
    kw.pop("asfrom", None)

    (grouped_query,) = element.clauses

    # The single clause is a FromGrouping, which would force parens around the
    # SELECT, so its wrapped element is compiled instead, with literal values
    # rendered inline.
    inner_sql = compiler.process(
        grouped_query.element, asfrom=False, literal_binds=True, **kw)

    quoted_sql = random_quote(inner_sql)
    return f"ts_stat({quoted_sql})"
def random_quote(statement: str) -> str:
    """Wrap *statement* in dollar quotes whose tag is 8 random uppercase ASCII letters."""
    # Randomize the dollar-quote tag; the same tag opens and closes the quote.
    tag_letters = [secrets.choice(string.ascii_uppercase) for _ in range(8)]
    delimiter = "$" + "".join(tag_letters) + "$"
    return delimiter + statement + delimiter