Mirror of https://github.com/mkdryden/telegram-stats-bot

stats: Add lexeme frequency

parent ed51680630
commit 80d2c0848c
@@ -12,6 +12,7 @@ Unreleased

Added
-----
- Allow limiting counts by message type
- stats: Added words statistic

Fixed
-----

README.rst

@@ -57,6 +57,8 @@ Table of contents

- `types`_
- `words`_
- `random`_
- `License`_
@@ -286,6 +288,35 @@ types

    document       1.0    0.0      1.0    0.0
    Total     598640.0  100.0  16693.0  100.0

words
-----
``/stats words`` returns a table of the most commonly used lexemes. The entries
are stemmed lexemes from the full-text index rather than raw words, which is why
forms such as ``realli`` and ``peopl`` appear.

.. code::

    Most frequently used lexemes:
     Lexeme  Messages  Uses
       like      1265  1334
       well       753   765
     actual       628   645
       make       600   619
       yeah       609   609
       mean       544   553
      thing       473   490
     realli       472   482
     though       467   470
      peopl       415   445
      think       425   433
       know       403   409
       need       396   408
       time       371   389
       want       354   371
      would       345   366
       much       345   357
    probabl       348   356
       even       331   338
      stuff       318   332

random
------
``/stats random`` prints a random message from the database.

@@ -21,11 +21,20 @@

import logging

from sqlalchemy import Column, Table, MetaData
from sqlalchemy.engine import Engine
from sqlalchemy.dialects import postgresql
from sqlalchemy.types import TIMESTAMP, BigInteger


logger = logging.getLogger(__name__)

metadata = MetaData()
messages = Table('messages_utc', metadata,
                 Column('date', TIMESTAMP),
                 Column('from_user', BigInteger),
                 Column('text_index_col', postgresql.TSVECTOR))


def init_dbs(engine: Engine):
    sql = """
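
``text_index_col`` holds a PostgreSQL ``tsvector``: the message text reduced to
stemmed lexemes with their positions, which is what the new word statistic reads
from. A minimal sketch of peeking at it with only the names defined above (the
DSN is a placeholder):

.. code::

    from sqlalchemy import create_engine, select

    engine = create_engine("postgresql:///telegram")  # placeholder DSN

    with engine.connect() as con:
        sample = con.execute(select(messages.c['text_index_col']).limit(1)).scalar()
        print(sample)  # e.g. 'hello':1 'peopl':3 'realli':2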

@@ -34,8 +34,11 @@ import numpy as np

from matplotlib.figure import Figure
from matplotlib.dates import date2num
from sqlalchemy.engine import Engine
from sqlalchemy import select, func
from sqlalchemy.dialects import postgresql

from .utils import escape_markdown
from .utils import escape_markdown, TsStat, random_quote
from .db import messages
from . import __version__

sns.set_context('paper')

@@ -77,6 +80,7 @@ class StatsRunner(object):

        'corr': "get_user_correlation",
        'delta': "get_message_deltas",
        'types': "get_type_stats",
        'words': "get_word_stats",
        'random': "get_random_message"}

    def __init__(self, engine: Engine, tz: str = 'America/Toronto'):

@@ -896,6 +900,53 @@ class StatsRunner(object):

        else:
            return f"**Messages by type:**\n```\n{text}\n```", None

    def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                       user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """
        Print table of lexeme statistics.
        :param n: Only consider lexemes with length of at least n
        :param limit: Number of top lexemes to return
        :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
        """
        q = select(messages.c['text_index_col'])

        if user:
            q = q.where(messages.c['from_user'] == user[0])
        if start:
            q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
        if end:
            q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

        q = q.scalar_subquery()
        f = TsStat(q)
        stmt = select([f.c['word'], f.c['ndoc'], f.c['nentry']]) \
            .select_from(f)

        if n:
            stmt = stmt.where(func.length(f.c['word']) >= n)

        stmt = stmt.order_by(f.c.nentry.desc(),
                             f.c.ndoc.desc(),
                             f.c.word)

        if limit:
            stmt = stmt.limit(limit)

        stmt = stmt.compile(dialect=postgresql.dialect())

        with self.engine.connect() as con:
            df = pd.read_sql_query(stmt, con)

        df.columns = ['Lexeme', 'Messages', 'Uses']

        text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

        if user:
            return f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**\n```\n{text}\n```", None
        else:
            return f"**Most frequently used lexemes, all users:**\n```\n{text}\n```", None
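
A minimal sketch of exercising the new statistic outside the Telegram handlers
(the DSN, user id and import path below are placeholders):

.. code::

    from sqlalchemy import create_engine

    from telegram_stats_bot.stats import StatsRunner  # assumed import path

    engine = create_engine("postgresql:///telegram")  # placeholder DSN
    runner = StatsRunner(engine, tz='America/Toronto')

    # Top 10 lexemes of length >= 5 posted during 2020, all users:
    reply, _ = runner.get_word_stats(n=5, limit=10, start='2020', end='2021')
    print(reply)  # formatted reply text; the second element is always None

    # The same table restricted to one user, given as (id, display name):
    reply, _ = runner.get_word_stats(user=(123456789, '@example_user'))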

    def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
                           user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
        """

@@ -18,9 +18,16 @@

#
# You should have received a copy of the GNU Public License
# along with this program. If not, see [http://www.gnu.org/licenses/].

import string
import secrets
import re

from sqlalchemy import Column, Integer, Text
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.functions import FunctionElement
from sqlalchemy.sql.base import ColumnCollection


md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+-=|{}.!\\])")

@@ -31,3 +38,31 @@ def escape_markdown(string: str) -> str:

        return f'\\{match.group(2)}'

    return re.sub(md_match, url_match, string)


# Modified from https://stackoverflow.com/a/49726653/3946475
class TsStat(FunctionElement):
    name = "ts_stat"

    @property
    def columns(self):
        word = Column('word', Text)
        ndoc = Column('ndoc', Integer)
        nentry = Column('nentry', Integer)
        return ColumnCollection(columns=((col.name, col) for col in (word, ndoc, nentry)))
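
PostgreSQL's ``ts_stat`` takes the *text* of a query that returns a ``tsvector``
column and yields one row per lexeme with the columns ``word``, ``ndoc``
(documents containing the lexeme) and ``nentry`` (total occurrences); the
``columns`` property above is what lets a SQLAlchemy ``select()`` reference
those names on a ``TsStat`` instance. A rough hand-written equivalent of the
query built in ``get_word_stats`` (the DSN is a placeholder):

.. code::

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql:///telegram")  # placeholder DSN

    query = text("""
        SELECT word, ndoc, nentry
        FROM ts_stat('SELECT text_index_col FROM messages_utc')
        WHERE length(word) >= 4
        ORDER BY nentry DESC, ndoc DESC, word
        LIMIT 20
    """)

    with engine.connect() as con:
        for word, ndoc, nentry in con.execute(query):
            print(word, ndoc, nentry)  # lexeme, messages containing it, total uses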


@compiles(TsStat, 'postgresql')
def pg_ts_stat(element, compiler, **kw):
    kw.pop("asfrom", None)  # Ignore and set explicitly
    arg1, = element.clauses
    # arg1 is a FromGrouping, which would force parens around the SELECT.
    stmt = compiler.process(
        arg1.element, asfrom=False, literal_binds=True, **kw)

    return f"ts_stat({random_quote(stmt)})"


def random_quote(statement: str) -> str:
    quote_str = ''.join(secrets.choice(string.ascii_uppercase) for _ in range(8))  # Randomize dollar quotes
    return f"${quote_str}${statement}${quote_str}$"
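
Because ``ts_stat`` receives its inner query as a string literal, the compiled
SELECT has to be embedded verbatim; wrapping it in a randomized dollar-quote tag
means no quotes inside the statement need escaping, so ``pg_ts_stat`` emits
something like ``ts_stat($ABCDEFGH$SELECT ...$ABCDEFGH$)``. For example (the tag
differs on every call):

.. code::

    >>> random_quote("SELECT text_index_col FROM messages_utc")
    '$FQYBTKHP$SELECT text_index_col FROM messages_utc$FQYBTKHP$'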