diff --git a/burstiness.py b/burstiness.py new file mode 100644 index 0000000..16b8232 --- /dev/null +++ b/burstiness.py @@ -0,0 +1,42 @@ +#/usr/bin/env python3 + +# Tool to calculate the burstiness of a text +# (C) 2023 Thinkst Applied Research, PTY +# Author: Jacob Torrey + +import argparse, os +from numpy import std, var +from typing import List, Tuple + +def calc_burstiness(s : str) -> Tuple[Tuple[float, float]]: + ''' + Given a string returns the standard deviation and variance of sentence length in terms of both chars and words + ''' + lens : List[Tuple[int, int]] = [] + sentences = s.split('.') + for sentence in sentences: + chars = len(sentence) + if chars < 1: + continue + words = len(sentence.split(' ')) + lens.append((chars, words)) + cd = (std([x[0] for x in lens]), var([x[0] for x in lens])) + wd = (std([x[1] for x in lens]), var([x[1] for x in lens])) + return (cd, wd) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to analyze') + args = parser.parse_args() + ws = 0 + wv = 0 + for f in args.sample_files: + print(f) + if os.path.isfile(f): + with open(f, 'r') as fp: + text = fp.read() + b = calc_burstiness(text) + ws += b[1][0] + wv += b[1][1] + print(str(b)) + print("Average std: " + str(ws/len(args.sample_files)) + " var: " + str(wv/len(args.sample_files))) \ No newline at end of file