Mirror of https://github.com/mkdryden/telegram-stats-bot

stats: Add lexeme frequency

parent ed51680630
commit 80d2c0848c
@@ -12,6 +12,7 @@ Unreleased

Added
-----
- Allow limiting counts by message type
- stats: Added words statistic

Fixed
-----

README.rst

@@ -57,6 +57,8 @@ Table of contents

- `types`_
- `words`_
- `random`_
- `License`_
@@ -286,6 +288,35 @@ types

    document       1.0    0.0      1.0    0.0
    Total     598640.0  100.0  16693.0  100.0

words
-----
``/stats words`` returns a table of the most commonly used lexemes. The entries
are stemmed lexemes from the full-text index rather than raw words, which is why
forms such as ``realli`` and ``peopl`` appear.

.. code::

    Most frequently used lexemes:
     Lexeme  Messages  Uses
       like      1265  1334
       well       753   765
     actual       628   645
       make       600   619
       yeah       609   609
       mean       544   553
      thing       473   490
     realli       472   482
     though       467   470
      peopl       415   445
      think       425   433
       know       403   409
       need       396   408
       time       371   389
       want       354   371
      would       345   366
       much       345   357
    probabl       348   356
       even       331   338
      stuff       318   332

random
------
``/stats random`` prints a random message from the database.

@@ -21,11 +21,20 @@

import logging

from sqlalchemy import Column, Table, MetaData
from sqlalchemy.engine import Engine
from sqlalchemy.dialects import postgresql
from sqlalchemy.types import TIMESTAMP, BigInteger


logger = logging.getLogger(__name__)

metadata = MetaData()
messages = Table('messages_utc', metadata,
                 Column('date', TIMESTAMP),
                 Column('from_user', BigInteger),
                 Column('text_index_col', postgresql.TSVECTOR))


def init_dbs(engine: Engine):
    sql = """
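
``text_index_col`` holds a PostgreSQL ``tsvector``: the message text reduced to
stemmed lexemes with their positions, which is what the new word statistic reads
from. A minimal sketch of peeking at it with only the names defined above (the
DSN is a placeholder):

.. code::

    from sqlalchemy import create_engine, select

    engine = create_engine("postgresql:///telegram")  # placeholder DSN

    with engine.connect() as con:
        sample = con.execute(select(messages.c['text_index_col']).limit(1)).scalar()
        print(sample)  # e.g. 'hello':1 'peopl':3 'realli':2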

@@ -34,8 +34,11 @@ import numpy as np

from matplotlib.figure import Figure
from matplotlib.dates import date2num
from sqlalchemy.engine import Engine
from sqlalchemy import select, func
from sqlalchemy.dialects import postgresql

from .utils import escape_markdown
from .utils import escape_markdown, TsStat, random_quote
from .db import messages
from . import __version__

sns.set_context('paper')

@@ -77,6 +80,7 @@ class StatsRunner(object):

        'corr': "get_user_correlation",
        'delta': "get_message_deltas",
        'types': "get_type_stats",
        'words': "get_word_stats",
        'random': "get_random_message"}

    def __init__(self, engine: Engine, tz: str = 'America/Toronto'):

@@ -896,6 +900,53 @@ class StatsRunner(object):

        else:
            return f"**Messages by type:**\n```\n{text}\n```", None

    def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                       user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
        Print table of lexeme statistics.
        :param n: Only consider lexemes with length of at least n
        :param limit: Number of top lexemes to return
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        q = select(messages.c['text_index_col'])

        if user:
            q = q.where(messages.c['from_user'] == user[0])
        if start:
            q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
        if end:
            q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

        q = q.scalar_subquery()
        f = TsStat(q)
        stmt = select([f.c['word'], f.c['ndoc'], f.c['nentry']]) \
            .select_from(f)

        if n:
            stmt = stmt.where(func.length(f.c['word']) >= n)

        stmt = stmt.order_by(f.c.nentry.desc(),
                             f.c.ndoc.desc(),
                             f.c.word)

        if limit:
            stmt = stmt.limit(limit)

        stmt = stmt.compile(dialect=postgresql.dialect())

        with self.engine.connect() as con:
            df = pd.read_sql_query(stmt, con)

        df.columns = ['Lexeme', 'Messages', 'Uses']

        text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        if user:
            return f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**\n```\n{text}\n```", None
        else:
            return f"**Most frequently used lexemes, all users:**\n```\n{text}\n```", None
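
A minimal sketch of exercising the new statistic outside the Telegram handlers
(the DSN, user id and import path below are placeholders):

.. code::

    from sqlalchemy import create_engine

    from telegram_stats_bot.stats import StatsRunner  # assumed import path

    engine = create_engine("postgresql:///telegram")  # placeholder DSN
    runner = StatsRunner(engine, tz='America/Toronto')

    # Top 10 lexemes of length >= 5 posted during 2020, all users:
    reply, _ = runner.get_word_stats(n=5, limit=10, start='2020', end='2021')
    print(reply)  # formatted reply text; the second element is always None

    # The same table restricted to one user, given as (id, display name):
    reply, _ = runner.get_word_stats(user=(123456789, '@example_user'))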

    def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
                           user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """

@@ -18,9 +18,16 @@

#
# You should have received a copy of the GNU Public License
# along with this program. If not, see [http://www.gnu.org/licenses/].

import string
import secrets
import re

from sqlalchemy import Column, Integer, Text
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.functions import FunctionElement
from sqlalchemy.sql.base import ColumnCollection


md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+-=|{}.!\\])")

@@ -31,3 +38,31 @@ def escape_markdown(string: str) -> str:

        return f'\\{match.group(2)}'

    return re.sub(md_match, url_match, string)


# Modified from https://stackoverflow.com/a/49726653/3946475
class TsStat(FunctionElement):
    name = "ts_stat"

    @property
    def columns(self):
        word = Column('word', Text)
        ndoc = Column('ndoc', Integer)
        nentry = Column('nentry', Integer)
        return ColumnCollection(columns=((col.name, col) for col in (word, ndoc, nentry)))
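
PostgreSQL's ``ts_stat`` takes the *text* of a query that returns a ``tsvector``
column and yields one row per lexeme with the columns ``word``, ``ndoc``
(documents containing the lexeme) and ``nentry`` (total occurrences); the
``columns`` property above is what lets a SQLAlchemy ``select()`` reference
those names on a ``TsStat`` instance. A rough hand-written equivalent of the
query built in ``get_word_stats`` (the DSN is a placeholder):

.. code::

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql:///telegram")  # placeholder DSN

    query = text("""
        SELECT word, ndoc, nentry
        FROM ts_stat('SELECT text_index_col FROM messages_utc')
        WHERE length(word) >= 4
        ORDER BY nentry DESC, ndoc DESC, word
        LIMIT 20
    """)

    with engine.connect() as con:
        for word, ndoc, nentry in con.execute(query):
            print(word, ndoc, nentry)  # lexeme, messages containing it, total uses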


@compiles(TsStat, 'postgresql')
def pg_ts_stat(element, compiler, **kw):
    kw.pop("asfrom", None)  # Ignore and set explicitly
    arg1, = element.clauses
    # arg1 is a FromGrouping, which would force parens around the SELECT.
    stmt = compiler.process(
        arg1.element, asfrom=False, literal_binds=True, **kw)

    return f"ts_stat({random_quote(stmt)})"


def random_quote(statement: str) -> str:
    quote_str = ''.join(secrets.choice(string.ascii_uppercase) for _ in range(8))  # Randomize dollar quotes
    return f"${quote_str}${statement}${quote_str}$"
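
Because ``ts_stat`` receives its inner query as a string literal, the compiled
SELECT has to be embedded verbatim; wrapping it in a randomized dollar-quote tag
means no quotes inside the statement need escaping, so ``pg_ts_stat`` emits
something like ``ts_stat($ABCDEFGH$SELECT ...$ABCDEFGH$)``. For example (the tag
differs on every call):

.. code::

    >>> random_quote("SELECT text_index_col FROM messages_utc")
    '$FQYBTKHP$SELECT text_index_col FROM messages_utc$FQYBTKHP$'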