zippy/burstiness.py

42 wiersze
1.4 KiB
Python

#!/usr/bin/env python3
# Tool to calculate the burstiness of a text
# (C) 2023 Thinkst Applied Research, PTY
# Author: Jacob Torrey
import argparse, os
from numpy import std, var
from typing import List, Tuple
def calc_burstiness(s : str) -> Tuple[Tuple[float, float], Tuple[float, float]]:
    '''
    Given a string returns the standard deviation and variance of sentence
    length, measured both in characters and in words.

    Sentences are the pieces of the text between '.' characters. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are ignored. Words are runs of non-whitespace characters.

    Returns ((char_std, char_var), (word_std, word_var)) as plain floats, using
    the population standard deviation/variance (numpy defaults). If the text
    contains no sentences at all, every value is 0.0.
    '''
    char_lens : List[int] = []
    word_lens : List[int] = []
    for fragment in s.split('.'):
        # Strip so that the space/newline following a period is not counted as
        # part of the next sentence, and so that whitespace-only leftovers
        # (e.g. the newline after the final period) are not counted at all.
        sentence = fragment.strip()
        if not sentence:
            continue
        char_lens.append(len(sentence))
        # split() with no argument splits on any whitespace run, so repeated
        # spaces and newlines never yield empty "words".
        word_lens.append(len(sentence.split()))
    if not char_lens:
        # numpy returns NaN (and emits a RuntimeWarning) for empty input;
        # report zero spread for text without any sentence instead.
        return ((0.0, 0.0), (0.0, 0.0))
    # Convert numpy scalars to built-in floats so the result matches the
    # annotation and prints the same regardless of numpy version.
    cd = (float(std(char_lens)), float(var(char_lens)))
    wd = (float(std(word_lens)), float(var(word_lens)))
    return (cd, wd)
if __name__ == '__main__':
    # Command-line entry point: print the burstiness of every sample file and
    # then the average word-level standard deviation and variance.
    parser = argparse.ArgumentParser()
    parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to analyze')
    args = parser.parse_args()
    ws = 0  # running sum of word-level standard deviations
    wv = 0  # running sum of word-level variances
    analyzed = 0  # number of paths that were actually read and measured
    for f in args.sample_files:
        print(f)
        if not os.path.isfile(f):
            # Paths that are not regular files are skipped (their name has
            # still been echoed above).
            continue
        with open(f, 'r') as fp:
            text = fp.read()
        # The file is already closed here; only the text is needed below.
        b = calc_burstiness(text)
        ws += b[1][0]
        wv += b[1][1]
        analyzed += 1
        print(str(b))
    if analyzed == 0:
        # Nothing was measured, so an average would be meaningless.
        parser.exit(1, "No readable sample files were found\n")
    # Average over the files that were analyzed, not over every argument, so
    # skipped paths do not drag the average toward zero.
    print("Average std: " + str(ws/analyzed) + " var: " + str(wv/analyzed))