kopia lustrzana https://github.com/thinkst/zippy
42 wiersze
1.4 KiB
Python
42 wiersze
1.4 KiB
Python
#!/usr/bin/env python3
|
|
|
|
# Tool to calculate the burstiness of a text
|
|
# (C) 2023 Thinkst Applied Research, PTY
|
|
# Author: Jacob Torrey
|
|
|
|
import argparse, os
|
|
from numpy import std, var
|
|
from typing import List, Tuple
|
|
|
|
def calc_burstiness(s : str) -> Tuple[Tuple[float, float], Tuple[float, float]]:
    '''
    Given a string returns the standard deviation and variance of sentence length in terms of both chars and words

    Sentences are the pieces of `s` between '.' characters. Each piece is
    stripped of surrounding whitespace before it is measured, and pieces that
    are empty after stripping are ignored, so the newline after a final period
    or a run of periods ("...") does not register as a sentence.

    Returns a pair ((char_std, char_var), (word_std, word_var)) computed with
    numpy's std/var using their default arguments. If `s` contains no
    sentences at all, every value is 0.0.
    '''
    char_lens : List[int] = []
    word_lens : List[int] = []
    for sentence in s.split('.'):
        sentence = sentence.strip()
        if not sentence:
            continue
        char_lens.append(len(sentence))
        # split() with no argument splits on any whitespace run, so the space
        # that follows the previous period, doubled spaces and line breaks do
        # not inflate the word count the way split(' ') does.
        word_lens.append(len(sentence.split()))

    if not char_lens:
        # numpy would emit a RuntimeWarning and return nan for empty input;
        # report "no variation" instead so callers can keep summing results.
        return ((0.0, 0.0), (0.0, 0.0))

    cd = (float(std(char_lens)), float(var(char_lens)))
    wd = (float(std(word_lens)), float(var(word_lens)))
    return (cd, wd)
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: print the burstiness of every sample file and
    # then the average word-level std/var across the files that were analyzed.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to analyze')
    args = parser.parse_args()
    ws = 0.0  # running sum of word-count standard deviations
    wv = 0.0  # running sum of word-count variances
    processed = 0  # number of files actually analyzed
    for f in args.sample_files:
        print(f)
        if not os.path.isfile(f):
            # Skipped paths must not count towards the average below.
            print("Skipping (not a regular file): " + f, file=sys.stderr)
            continue
        with open(f, 'r') as fp:
            text = fp.read()
        b = calc_burstiness(text)
        ws += b[1][0]
        wv += b[1][1]
        processed += 1
        print(str(b))
    if processed:
        # Average over the files that were read, not over every argument given.
        print("Average std: " + str(ws/processed) + " var: " + str(wv/processed))
    else:
        print("No sample files could be analyzed", file=sys.stderr)