kopia lustrzana https://github.com/thinkst/zippy
Initial commit of burstiness analysis
Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
rodzic
15766880c3
commit
5fe971863e
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Tool to calculate the burstiness of a text
|
||||
# (C) 2023 Thinkst Applied Research, PTY
|
||||
# Author: Jacob Torrey
|
||||
|
||||
import argparse, os
|
||||
from numpy import std, var
|
||||
from typing import List, Tuple
|
||||
|
||||
def calc_burstiness(s : str) -> Tuple[Tuple[float, float], Tuple[float, float]]:
    '''
    Given a string returns the standard deviation and variance of sentence length in terms of both chars and words

    Sentences are the pieces obtained by splitting the text on '.'. Each piece
    is stripped of surrounding whitespace before it is measured, and pieces
    that are empty after stripping are ignored.

    Returns a pair ((char_std, char_var), (word_std, word_var)). When the text
    contains no sentences at all, every value is 0.0.
    '''
    char_lens : List[int] = []
    word_lens : List[int] = []
    for fragment in s.split('.'):
        # Strip so the space/newline that follows a period is not counted as
        # part of the next sentence, and whitespace-only fragments (such as
        # the trailing newline of a file) are not treated as sentences.
        sentence = fragment.strip()
        if not sentence:
            continue
        char_lens.append(len(sentence))
        # split() with no argument collapses runs of whitespace, so repeated
        # spaces or line breaks inside a sentence do not add phantom words.
        word_lens.append(len(sentence.split()))

    if not char_lens:
        # numpy would return NaN (with a RuntimeWarning) for an empty list.
        return ((0.0, 0.0), (0.0, 0.0))

    # Convert numpy scalars to plain floats to match the declared return type.
    cd = (float(std(char_lens)), float(var(char_lens)))
    wd = (float(std(word_lens)), float(var(word_lens)))
    return (cd, wd)
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: print the burstiness of each sample file and
    # then the average word-level standard deviation and variance.
    parser = argparse.ArgumentParser()
    parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to analyze')
    args = parser.parse_args()

    ws = 0.0       # running sum of word-length standard deviations
    wv = 0.0       # running sum of word-length variances
    analyzed = 0   # number of files actually read and measured

    for f in args.sample_files:
        print(f)
        if not os.path.isfile(f):
            # Paths that are not regular files are skipped and must not be
            # counted in the average below.
            continue
        with open(f, 'r') as fp:
            text = fp.read()
        b = calc_burstiness(text)
        ws += b[1][0]
        wv += b[1][1]
        analyzed += 1
        print(str(b))

    if analyzed:
        # Divide by the number of files analyzed, not the number of arguments,
        # so skipped paths do not drag the average towards zero.
        print("Average std: " + str(ws/analyzed) + " var: " + str(wv/analyzed))
    else:
        print("No readable sample files to average")
|
Ładowanie…
Reference in New Issue