Mirror of https://github.com/micropython/micropython-lib
xmltok: Initial draft version of XML tokenizer.
API roughly inspired by stdlib "tokenize" module.
parent 2bb224b562, commit cebf1973bc
@@ -0,0 +1,115 @@
import string


# Token types produced by the tokenizer. START_TAG_DONE and PI_DONE are
# defined but not yet emitted by this draft.
TEXT = "TEXT"
START_TAG = "START_TAG"
START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
PI_DONE = "PI_DONE"
ATTR = "ATTR"
ATTR_VAL = "ATTR_VAL"

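# Illustrative sketch (not from the original commit): for input like
# '<?xml version="1.0"?><a href="x">click</a>' the tokenizer below yields
# roughly this stream of (type, value) pairs:
#   (PI, "xml"), (ATTR, "version"), (ATTR_VAL, "1.0"),
#   (START_TAG, "a"), (ATTR, "href"), (ATTR_VAL, "x"),
#   (TEXT, "click"), (END_TAG, "a")
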
class XMLSyntaxError(Exception):
    pass

class XMLTokenizer:
    # Scans the underlying stream with a single character of lookahead,
    # kept in self.c.

    def __init__(self, f):
        self.f = f
        self.nextch()

    def curch(self):
        return self.c

    def getch(self):
        c = self.c
        self.nextch()
        return c

    def eof(self):
        return self.c == ""

    def nextch(self):
        self.c = self.f.read(1)
        if not self.c:
            raise StopIteration
        return self.c

    def skip_ws(self):
        while self.curch().isspace():
            self.nextch()

    def isident(self):
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        self.skip_ws()
        ident = ""
        while self.curch() in (string.ascii_letters + string.digits):
            ident += self.getch()
        return ident

    # match(c) consumes c (after optional whitespace) and reports whether it
    # was present; expect(c) is the hard-failing variant.

    def match(self, c):
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        if not self.match(c):
            raise XMLSyntaxError

    def lex_attrs_till(self):
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = ""
            while self.curch() != '"':
                val += self.getch()
            yield (ATTR_VAL, val)
            self.expect('"')

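    # Illustrative sketch (not from the original commit): inside a tag such
    # as <t a="1" b="2">, lex_attrs_till() yields
    #   (ATTR, "a"), (ATTR_VAL, "1"), (ATTR, "b"), (ATTR_VAL, "2")
    # and stops with the lookahead on ">".
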
    def tokenize(self):
        # nextch() signals end of input by raising StopIteration; under
        # PEP 479 (Python 3.7+) that would surface as RuntimeError inside a
        # generator, so it is caught here to terminate the token stream.
        try:
            while not self.eof():
                if self.match("<"):
                    if self.match("/"):
                        yield (END_TAG, self.getident())
                        self.expect(">")
                    elif self.match("?"):
                        yield (PI, self.getident())
                        yield from self.lex_attrs_till()
                        self.expect("?")
                        self.expect(">")
                    else:
                        tag = self.getident()
                        yield (START_TAG, tag)
                        yield from self.lex_attrs_till()
                        if self.match("/"):
                            yield (END_TAG, tag)
                        self.expect(">")
                else:
                    text = ""
                    while self.curch() != "<":
                        text += self.getch()
                    if text:
                        yield (TEXT, text)
        except StopIteration:
            pass


def gfind(gen, pred):
    # Return the first item from gen for which pred(item) is true.
    for i in gen:
        if pred(i):
            return i

def text_of(gen, tag):
    # Return text content of a leaf tag (one with no attributes and plain
    # character data inside).
    gfind(gen, lambda i: i == (START_TAG, tag))
    t, val = next(gen)
    assert t == TEXT
    return val

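# Illustrative sketch (not from the original commit): given a token stream
# for '<feed><title>Hello</title></feed>', text_of(stream, "title") skips
# ahead to (START_TAG, "title") and returns "Hello".
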
def tokenize(file):
    # Convenience entry point, mirroring the "API roughly inspired by stdlib
    # tokenize" noted in the commit message: take an open file-like object,
    # return a token generator.
    return XMLTokenizer(file).tokenize()
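A short usage sketch, not part of the commit itself: it assumes the file is
importable as xmltok and uses io.StringIO to stand in for a real file object.

    import io
    import xmltok

    doc = io.StringIO('<rss version="2.0"><title>Demo</title></rss>')
    for tok in xmltok.tokenize(doc):
        print(tok)
    # Prints, in order:
    #   ('START_TAG', 'rss'), ('ATTR', 'version'), ('ATTR_VAL', '2.0'),
    #   ('START_TAG', 'title'), ('TEXT', 'Demo'),
    #   ('END_TAG', 'title'), ('END_TAG', 'rss')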