kopia lustrzana https://github.com/mkdryden/telegram-stats-bot
stats: Add lexeme frequency
rodzic
ed51680630
commit
80d2c0848c
|
@ -12,6 +12,7 @@ Unreleased
|
||||||
Added
|
Added
|
||||||
-----
|
-----
|
||||||
- Allow limiting counts by message type
|
- Allow limiting counts by message type
|
||||||
|
- stats: Added words statistic
|
||||||
|
|
||||||
Fixed
|
Fixed
|
||||||
-----
|
-----
|
||||||
|
|
31
README.rst
31
README.rst
|
@ -57,6 +57,8 @@ Table of contents
|
||||||
|
|
||||||
- `types`_
|
- `types`_
|
||||||
|
|
||||||
|
- `words`_
|
||||||
|
|
||||||
- `random`_
|
- `random`_
|
||||||
|
|
||||||
- `License`_
|
- `License`_
|
||||||
|
@ -286,6 +288,35 @@ types
|
||||||
document 1.0 0.0 1.0 0.0
|
document 1.0 0.0 1.0 0.0
|
||||||
Total 598640.0 100.0 16693.0 100.0
|
Total 598640.0 100.0 16693.0 100.0
|
||||||
|
|
||||||
|
words
|
||||||
|
-----
|
||||||
|
``/stats words`` returns a table of the most commonly used lexemes.
|
||||||
|
|
||||||
|
.. code::
|
||||||
|
|
||||||
|
Most frequently used lexemes:
|
||||||
|
Lexeme Messages Uses
|
||||||
|
like 1265 1334
|
||||||
|
well 753 765
|
||||||
|
actual 628 645
|
||||||
|
make 600 619
|
||||||
|
yeah 609 609
|
||||||
|
mean 544 553
|
||||||
|
thing 473 490
|
||||||
|
realli 472 482
|
||||||
|
though 467 470
|
||||||
|
peopl 415 445
|
||||||
|
think 425 433
|
||||||
|
know 403 409
|
||||||
|
need 396 408
|
||||||
|
time 371 389
|
||||||
|
want 354 371
|
||||||
|
would 345 366
|
||||||
|
much 345 357
|
||||||
|
probabl 348 356
|
||||||
|
even 331 338
|
||||||
|
stuff 318 332
|
||||||
|
|
||||||
random
|
random
|
||||||
------
|
------
|
||||||
``/stats random`` prints a random message from the database.
|
``/stats random`` prints a random message from the database.
|
||||||
|
|
|
@ -21,11 +21,20 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from sqlalchemy import Column, Table, MetaData
|
||||||
from sqlalchemy.engine import Engine
|
from sqlalchemy.engine import Engine
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
from sqlalchemy.types import TIMESTAMP, BigInteger
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Lightweight SQLAlchemy description of the 'messages_utc' relation, used for building queries.
# Only the three columns listed here are declared.
metadata = MetaData()
messages = Table(
    'messages_utc',
    metadata,
    Column('date', TIMESTAMP),  # Message timestamp
    Column('from_user', BigInteger),  # Id of the sending user
    Column('text_index_col', postgresql.TSVECTOR),  # Text-search vector of the message text
)
|
||||||
|
|
||||||
|
|
||||||
def init_dbs(engine: Engine):
|
def init_dbs(engine: Engine):
|
||||||
sql = """
|
sql = """
|
||||||
|
|
|
@ -34,8 +34,11 @@ import numpy as np
|
||||||
from matplotlib.figure import Figure
|
from matplotlib.figure import Figure
|
||||||
from matplotlib.dates import date2num
|
from matplotlib.dates import date2num
|
||||||
from sqlalchemy.engine import Engine
|
from sqlalchemy.engine import Engine
|
||||||
|
from sqlalchemy import select, func
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
from .utils import escape_markdown
|
from .utils import escape_markdown, TsStat, random_quote
|
||||||
|
from .db import messages
|
||||||
from . import __version__
|
from . import __version__
|
||||||
|
|
||||||
sns.set_context('paper')
|
sns.set_context('paper')
|
||||||
|
@ -77,6 +80,7 @@ class StatsRunner(object):
|
||||||
'corr': "get_user_correlation",
|
'corr': "get_user_correlation",
|
||||||
'delta': "get_message_deltas",
|
'delta': "get_message_deltas",
|
||||||
'types': "get_type_stats",
|
'types': "get_type_stats",
|
||||||
|
'words': "get_word_stats",
|
||||||
'random': "get_random_message"}
|
'random': "get_random_message"}
|
||||||
|
|
||||||
def __init__(self, engine: Engine, tz: str = 'America/Toronto'):
|
def __init__(self, engine: Engine, tz: str = 'America/Toronto'):
|
||||||
|
@ -896,6 +900,53 @@ class StatsRunner(object):
|
||||||
else:
|
else:
|
||||||
return f"**Messages by type:**\n```\n{text}\n```", None
|
return f"**Messages by type:**\n```\n{text}\n```", None
|
||||||
|
|
||||||
|
def get_word_stats(self, n: int = 4, limit: int = 20, start: str = None, end: str = None,
                   user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
    """
    Print table of lexeme statistics.
    :param n: Only consider lexemes with length of at least n
    :param limit: Number of top lexemes to return
    :param start: Start timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param end: End timestamp (e.g. 2019, 2019-01, 2019-01-01, "2019-01-01 14:21")
    :param user: Optional (user id, username) pair; when given, only that user's messages are counted
    :return: Tuple of the markdown-formatted table text and None
    """
    # Inner query: the text-search vectors of every message matching the filters.
    q = select(messages.c['text_index_col'])

    if user:
        q = q.where(messages.c['from_user'] == user[0])
    if start:
        # Filter on the requested start time (inclusive).
        q = q.where(messages.c['date'] >= str(pd.to_datetime(start)))
    if end:
        # Filter on the requested end time (exclusive).
        q = q.where(messages.c['date'] < str(pd.to_datetime(end)))

    q = q.scalar_subquery()
    f = TsStat(q)
    stmt = select(f.c['word'], f.c['ndoc'], f.c['nentry']).select_from(f)

    if n:
        stmt = stmt.where(func.length(f.c['word']) >= n)

    # Most used first; ties broken by number of messages, then alphabetically.
    stmt = stmt.order_by(f.c.nentry.desc(),
                         f.c.ndoc.desc(),
                         f.c.word)

    if limit:
        stmt = stmt.limit(limit)

    # Compile against the PostgreSQL dialect whether or not a limit was applied.
    compiled = stmt.compile(dialect=postgresql.dialect())

    with self.engine.connect() as con:
        df = pd.read_sql_query(compiled, con)

    df.columns = ['Lexeme', 'Messages', 'Uses']

    text = df.to_string(index=False, header=True, float_format=lambda x: f"{x:.1f}")

    if user:
        header = f"**Most frequently used lexemes, {escape_markdown(user[1].lstrip('@'))}:**"
    else:
        header = "**Most frequently used lexemes, all users:**"

    return f"{header}\n```\n{text}\n```", None
|
||||||
|
|
||||||
def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
|
def get_random_message(self, lquery: str = None, start: str = None, end: str = None,
|
||||||
user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
|
user: Tuple[int, str] = None, **kwargs) -> Tuple[str, None]:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -18,9 +18,16 @@
|
||||||
#
|
#
|
||||||
# You should have received a copy of the GNU Public License
|
# You should have received a copy of the GNU Public License
|
||||||
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
||||||
|
import string
|
||||||
|
import secrets
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from sqlalchemy import Column, Integer, Text
|
||||||
|
from sqlalchemy.ext.compiler import compiles
|
||||||
|
from sqlalchemy.sql.functions import FunctionElement
|
||||||
|
from sqlalchemy.sql.base import ColumnCollection
|
||||||
|
|
||||||
|
|
||||||
md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+-=|{}.!\\])")
|
md_match = re.compile(r"(\[[^][]*]\(http[^()]*\))|([_*[\]()~>#+-=|{}.!\\])")
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,3 +38,31 @@ def escape_markdown(string: str) -> str:
|
||||||
return f'\\{match.group(2)}'
|
return f'\\{match.group(2)}'
|
||||||
|
|
||||||
return re.sub(md_match, url_match, string)
|
return re.sub(md_match, url_match, string)
|
||||||
|
|
||||||
|
|
||||||
|
# Modified from https://stackoverflow.com/a/49726653/3946475
|
||||||
|
class TsStat(FunctionElement):
    """
    SQL function element named ``ts_stat`` that can be selected from like a table.

    Exposes the columns ``word`` (text), ``ndoc`` (integer) and ``nentry`` (integer).
    """
    name = "ts_stat"

    @property
    def columns(self):
        """Return a new collection holding the ``word``, ``ndoc`` and ``nentry`` columns, keyed by name."""
        result_columns = (
            Column('word', Text),
            Column('ndoc', Integer),
            Column('nentry', Integer),
        )
        keyed = [(column.name, column) for column in result_columns]
        return ColumnCollection(columns=keyed)
|
||||||
|
|
||||||
|
|
||||||
|
@compiles(TsStat, 'postgresql')
def pg_ts_stat(element, compiler, **kw):
    """
    Render a TsStat element as a PostgreSQL ``ts_stat(...)`` call.

    The element's single clause is compiled to SQL text with its bind values inlined, and that
    text is passed to ``ts_stat`` as a dollar-quoted string with a randomized tag.
    :param element: TsStat element being compiled; must hold exactly one clause
    :param compiler: SQL compiler used to render the inner statement
    :param kw: Compiler keyword arguments, forwarded to the inner compilation
    :return: SQL text of the ``ts_stat`` call
    """
    # Both options are set explicitly below; drop any forwarded values so that
    # compiler.process() never receives the same keyword twice (TypeError).
    kw.pop("asfrom", None)
    kw.pop("literal_binds", None)

    (arg1,) = element.clauses
    # arg1 is a FromGrouping, which would force parens around the SELECT.
    stmt = compiler.process(arg1.element, asfrom=False, literal_binds=True, **kw)

    return f"ts_stat({random_quote(stmt)})"
|
||||||
|
|
||||||
|
|
||||||
|
def random_quote(statement: str) -> str:
    """
    Wrap a statement in dollar quotes using a randomized tag.
    :param statement: Text to quote
    :return: The statement enclosed in ``$TAG$ ... $TAG$``, where TAG is 8 random uppercase ASCII letters
    """
    # Randomize dollar quotes
    letters = [secrets.choice(string.ascii_uppercase) for _ in range(8)]
    delimiter = "$" + ''.join(letters) + "$"
    return delimiter + statement + delimiter
|
||||||
|
|
Ładowanie…
Reference in New Issue