stats: Add lexeme frequency

2021-06-11 21:38:23 -04:00 · 2021-06-11 21:38:23 -04:00 · 80d2c0848c
commit 80d2c0848c
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@ -12,6 +12,7 @@ Unreleased
 Added
 -----
 - Allow limiting counts by message type
 - stats: Added words statistic
 Fixed
 -----
--- a/README.rst
+++ b/README.rst
@ -57,6 +57,8 @@ Table of contents
  - `types`_
  - `words`_
  - `random`_
 - `License`_
@ -286,6 +288,35 @@ types
   document          1.0            0.0         1.0           0.0
      Total     598640.0          100.0     16693.0         100.0
 words
 -----
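 ``/stats words`` returns a table of the most commonly used lexemes (normalized word stems). ``Messages`` is the number of messages containing the lexeme and ``Uses`` is its total number of occurrences.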
 .. code::
 Most frequently used lexemes:
     Lexeme  Messages  Uses
       like      1265  1334
       well       753   765
     actual       628   645
       make       600   619
       yeah       609   609
       mean       544   553
      thing       473   490
     realli       472   482
     though       467   470
      peopl       415   445
      think       425   433
       know       403   409
       need       396   408
       time       371   389
       want       354   371
      would       345   366
       much       345   357
    probabl       348   356
       even       331   338
      stuff       318   332
 random
 ------
 ``/stats random`` prints a random message from the database.
--- a/telegram_stats_bot/db.py
+++ b/telegram_stats_bot/db.py
@ -21,11 +21,20 @@
 import logging
 from sqlalchemy import Column, Table, MetaData
 from sqlalchemy.engine import Engine
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.types import TIMESTAMP, BigInteger
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # SQLAlchemy metadata registry that the table definition below is attached to.
 metadata = MetaData()
 # SQLAlchemy description of the ``messages_utc`` table, declaring three columns:
 # ``date`` (TIMESTAMP), ``from_user`` (BigInteger) and ``text_index_col`` (PostgreSQL TSVECTOR).
 # NOTE(review): presumably only the columns needed by expression-built queries are listed here,
 # not the table's full schema — confirm against the SQL in init_dbs.
 messages = Table('messages_utc', metadata,
                 Column('date', TIMESTAMP),
                 Column('from_user', BigInteger),
                 Column('text_index_col', postgresql.TSVECTOR))
 def init_dbs(engine: Engine):
    sql = """
--- a/telegram_stats_bot/stats.py
+++ b/telegram_stats_bot/stats.py
@ -34,8 +34,11 @@ import numpy as np
 from matplotlib.figure import Figure
 from matplotlib.dates import date2num
 from sqlalchemy.engine import Engine
 from sqlalchemy import select, func
 from sqlalchemy.dialects import postgresql
-from .utils import escape_markdown
+from .utils import escape_markdown, TsStat, random_quote
 from .db import messages
 from . import __version__
 sns.set_context('paper')
@ -77,6 +80,7 @@ class StatsRunner(object):
                       'corr': "get_user_correlation",
                       'delta': "get_message_deltas",
                       'types': "get_type_stats",
                       'words': "get_word_stats",
                       'random': "get_random_message"}
    def __init__(self, engine: Engine, tz: str = 'America/Toronto'):
@ -896,6 +900,53 @@ class StatsRunner(object):
        else:
            return f"**Messages by type:**\n```\n{text}\n```", None
    def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                       user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
        Print table of lexeme statistics.
        :param n: Only consider lexemes with length of at least n
        :param limit: Number of top lexemes to return
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        # `user` is an optional (user id, display name) pair; when given, only that user's
        # messages are counted and the name appears in the heading.
        # Returns a (markdown text, None) tuple.

        # Inner query: the tsvector column of every message that should be counted.
        q = select(messages.c['text_index_col'])

        if user:
            q = q.where(messages.c['from_user'] == user[0])
        # The bounds are rendered as strings because the inner query is later inlined
        # into ts_stat() as literal SQL text rather than sent with bind parameters.
        if start:
            q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
        if end:
            q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

        q = q.scalar_subquery()

        # ts_stat() yields one row per lexeme with columns word, ndoc and nentry.
        f = TsStat(q)
        stmt = select([f.c['word'], f.c['ndoc'], f.c['nentry']]) \
            .select_from(f)

        if n:
            stmt = stmt.where(func.length(f.c['word']) >= n)

        # Most used first; ties broken by message count, then alphabetically.
        stmt = stmt.order_by(f.c.nentry.desc(),
                             f.c.ndoc.desc(),
                             f.c.word)

        if limit:
            stmt = stmt.limit(limit)

        # Compile for PostgreSQL regardless of whether a limit was applied, so the same
        # kind of object is handed to pandas on every code path.
        stmt = stmt.compile(dialect=postgresql.dialect())

        with self.engine.connect() as con:
            df = pd.read_sql_query(stmt, con)

        df.columns = ['Lexeme', 'Messages', 'Uses']

        text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        if user:
            return f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**\n```\n{text}\n```", None
        return f"**Most frequently used lexemes, all users:**\n```\n{text}\n```", None
    def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
                           user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
--- a/telegram_stats_bot/utils.py
+++ b/telegram_stats_bot/utils.py
@ -18,9 +18,16 @@
 #
 # You should have received a copy of the GNU Public License
 # along with this program. If not, see [http://www.gnu.org/licenses/].
-
+import string
 import secrets
 import re
 from sqlalchemy import Column, Integer, Text
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.functions import FunctionElement
 from sqlalchemy.sql.base import ColumnCollection
 # Matches either a complete Markdown inline link ``[text](http...)`` (group 1) or a single
 # special character (group 2). The hyphen is escaped so that ``+``, ``-`` and ``=`` are matched
 # individually; an unescaped ``+-=`` is a character range that also covers digits and ``, . / : ; <``.
 md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+\-=|{}.!\\])")
@ -31,3 +38,31 @@ def escape_markdown(string: str) -> str:
        return f'\\{match.group(2)}'
    return re.sub(md_match, url_match, string)
 # Modified from https://stackoverflow.com/a/49726653/3946475
 class TsStat(FunctionElement):
    """
    SQL function element named ``ts_stat`` that can be selected from like a table.

    It exposes three result columns: ``word`` (Text), ``ndoc`` (Integer) and ``nentry`` (Integer).
    """
    name = "ts_stat"

    @property
    def columns(self):
        """Build a fresh collection of the function's result columns, keyed by column name."""
        result_columns = [
            Column('word', Text),
            Column('ndoc', Integer),
            Column('nentry', Integer),
        ]
        return ColumnCollection(columns=[(column.name, column) for column in result_columns])
@compiles(TsStat, 'postgresql')
def pg_ts_stat(element, compiler, **kw):
    """
    Render a TsStat element for the PostgreSQL dialect.

    The element's single argument is compiled to SQL text and embedded in the call as a
    dollar-quoted string, giving ``ts_stat($TAG$<select ...>$TAG$)``.

    :param element: The TsStat element; it must carry exactly one clause
    :param compiler: The SQL compiler driving the compilation
    :param kw: Compiler keyword arguments passed down from the enclosing statement
    :return: The rendered SQL fragment
    """
    # Set both flags in kw rather than passing them next to **kw: whatever the enclosing
    # compilation forwarded is overridden, and a forwarded ``literal_binds`` can no longer
    # cause a duplicate-keyword TypeError. Literal binds are required because the inner
    # SELECT ends up inside a string literal, where bind parameters cannot be used.
    kw["asfrom"] = False
    kw["literal_binds"] = True
    arg1, = element.clauses
    # arg1 is a FromGrouping, which would force parens around the SELECT.
    stmt = compiler.process(arg1.element, **kw)
    return f"ts_stat({random_quote(stmt)})"
 def random_quote(statement: str) -> str:
    """
    Wrap a statement in PostgreSQL dollar quotes using a random tag.

    :param statement: Text to quote
    :return: ``$TAG$statement$TAG$`` where TAG is eight random uppercase ASCII letters
    """
    # A randomized tag makes it unlikely that the statement itself contains the delimiter.
    letters = [secrets.choice(string.ascii_uppercase) for _ in range(8)]
    delimiter = "$" + "".join(letters) + "$"
    return delimiter + statement + delimiter