import string

# Token type tags: tokenize() yields (type, value) tuples using these markers.
TEXT = "TEXT"
START_TAG = "START_TAG"
START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
PI_DONE = "PI_DONE"
ATTR = "ATTR"
ATTR_VAL = "ATTR_VAL"


class XMLSyntaxError(Exception):
    """Raised when the input does not match the expected XML structure."""
    pass


class XMLTokenizer:
    """Minimal pull-based XML tokenizer over a file-like object.

    Reads one character at a time; API roughly inspired by the stdlib
    "tokenize" module.  tokenize() yields (token_type, value) tuples.
    End of input is represented by the empty-string sentinel "" in self.c.
    """

    def __init__(self, f):
        self.f = f
        self.nextch()

    def curch(self):
        """Return the current character ("" once input is exhausted)."""
        return self.c

    def getch(self):
        """Consume and return the current character."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """True once the underlying stream is exhausted."""
        return self.c == ""

    def nextch(self):
        # Advance to the next character.  At EOF self.c becomes "" instead of
        # raising StopIteration: raising StopIteration inside the tokenize()
        # generator would be converted to RuntimeError by PEP 479 (Py 3.7+).
        self.c = self.f.read(1)
        return self.c

    def skip_ws(self):
        # "" is not whitespace, so this terminates cleanly at EOF.
        while self.c != "" and self.c.isspace():
            self.nextch()

    def isident(self):
        """Skip whitespace; report whether an identifier starts here."""
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        """Consume and return a run of ASCII letters/digits (may be empty)."""
        self.skip_ws()
        ident = ""
        # Explicit EOF guard: '"" in s' is True for any string, so without it
        # the membership test alone would loop forever at end of input.
        while self.c != "" and self.c in (string.ascii_letters + string.digits):
            ident += self.getch()
        return ident

    def match(self, c):
        """If the next non-space character is c, consume it and return True."""
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Consume the next non-space character, which must be c.

        Raises:
            XMLSyntaxError: if the next character is not c.
        """
        if not self.match(c):
            raise XMLSyntaxError("expected %r, got %r" % (c, self.curch()))

    def lex_attrs_till(self):
        """Yield (ATTR, name) / (ATTR_VAL, value) pairs for name="value" runs."""
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = ""
            while self.curch() != '"':
                if self.eof():
                    # Truncated input would otherwise spin forever here.
                    raise XMLSyntaxError("unterminated attribute value")
                val += self.getch()
            yield (ATTR_VAL, val)
            self.expect('"')

    def tokenize(self):
        """Generate (token_type, value) tuples for the whole input.

        Recognizes start tags (with attributes and self-closing "/>" form),
        end tags, processing instructions ("<?...?>"), and text runs.
        """
        while not self.eof():
            if self.match("<"):
                if self.match("/"):
                    yield (END_TAG, self.getident())
                    self.expect(">")
                elif self.match("?"):
                    yield (PI, self.getident())
                    yield from self.lex_attrs_till()
                    self.expect("?")
                    self.expect(">")
                else:
                    tag = self.getident()
                    yield (START_TAG, tag)
                    yield from self.lex_attrs_till()
                    # Self-closing tag emits a synthetic END_TAG immediately.
                    if self.match("/"):
                        yield (END_TAG, tag)
                    self.expect(">")
            else:
                text = ""
                # Stop at the next tag or at EOF (trailing text is allowed).
                while not self.eof() and self.curch() != "<":
                    text += self.getch()
                if text:
                    yield (TEXT, text)


def gfind(gen, pred):
    """Return the first item of gen for which pred(item) is true (None if absent)."""
    for i in gen:
        if pred(i):
            return i


def text_of(gen, tag):
    """Return the text content of a leaf tag.

    Scans the token stream for the first (START_TAG, tag) token and returns
    the value of the token immediately following it, which is asserted to be
    TEXT — i.e. the tag must have no attributes or child elements.
    """
    gfind(gen, lambda i: i == (START_TAG, tag))
    t, val = next(gen)
    assert t == TEXT
    return val


def tokenize(file):
    """Convenience wrapper: tokenize a file-like object into (type, value) tuples."""
    return XMLTokenizer(file).tokenize()