import string

# Token type tags: tokenize() yields (type, value) tuples using these markers.
TEXT = "TEXT"
START_TAG = "START_TAG"
START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
PI_DONE = "PI_DONE"
ATTR = "ATTR"
ATTR_VAL = "ATTR_VAL"


class XMLSyntaxError(Exception):
    """Raised when the input does not match the expected XML structure."""
    pass


class XMLTokenizer:
    """Minimal pull-based XML tokenizer over a file-like object.

    Reads one character at a time; API roughly inspired by the stdlib
    "tokenize" module.  tokenize() yields (token_type, value) tuples.
    End of input is represented by the empty-string sentinel "" in self.c.
    """

    def __init__(self, f):
        self.f = f
        self.nextch()

    def curch(self):
        """Return the current character ("" once input is exhausted)."""
        return self.c

    def getch(self):
        """Consume and return the current character."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """True once the underlying stream is exhausted."""
        return self.c == ""

    def nextch(self):
        # Advance to the next character.  At EOF self.c becomes "" instead of
        # raising StopIteration: raising StopIteration inside the tokenize()
        # generator would be converted to RuntimeError by PEP 479 (Py 3.7+).
        self.c = self.f.read(1)
        return self.c

    def skip_ws(self):
        # "" is not whitespace, so this terminates cleanly at EOF.
        while self.c != "" and self.c.isspace():
            self.nextch()

    def isident(self):
        """Skip whitespace; report whether an identifier starts here."""
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        """Consume and return a run of ASCII letters/digits (may be empty)."""
        self.skip_ws()
        ident = ""
        # Explicit EOF guard: '"" in s' is True for any string, so without it
        # the membership test alone would loop forever at end of input.
        while self.c != "" and self.c in (string.ascii_letters + string.digits):
            ident += self.getch()
        return ident

    def match(self, c):
        """If the next non-space character is c, consume it and return True."""
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Consume the next non-space character, which must be c.

        Raises:
            XMLSyntaxError: if the next character is not c.
        """
        if not self.match(c):
            raise XMLSyntaxError("expected %r, got %r" % (c, self.curch()))

    def lex_attrs_till(self):
        """Yield (ATTR, name) / (ATTR_VAL, value) pairs for name="value" runs."""
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = ""
            while self.curch() != '"':
                if self.eof():
                    # Truncated input would otherwise spin forever here.
                    raise XMLSyntaxError("unterminated attribute value")
                val += self.getch()
            yield (ATTR_VAL, val)
            self.expect('"')

    def tokenize(self):
        """Generate (token_type, value) tuples for the whole input.

        Recognizes start tags (with attributes and self-closing "/>" form),
        end tags, processing instructions ("<?...?>"), and text runs.
        """
        while not self.eof():
            if self.match("<"):
                if self.match("/"):
                    yield (END_TAG, self.getident())
                    self.expect(">")
                elif self.match("?"):
                    yield (PI, self.getident())
                    yield from self.lex_attrs_till()
                    self.expect("?")
                    self.expect(">")
                else:
                    tag = self.getident()
                    yield (START_TAG, tag)
                    yield from self.lex_attrs_till()
                    # Self-closing tag emits a synthetic END_TAG immediately.
                    if self.match("/"):
                        yield (END_TAG, tag)
                    self.expect(">")
            else:
                text = ""
                # Stop at the next tag or at EOF (trailing text is allowed).
                while not self.eof() and self.curch() != "<":
                    text += self.getch()
                if text:
                    yield (TEXT, text)


def gfind(gen, pred):
    """Return the first item of gen for which pred(item) is true (None if absent)."""
    for i in gen:
        if pred(i):
            return i


def text_of(gen, tag):
    """Return the text content of a leaf tag.

    Scans the token stream for the first (START_TAG, tag) token and returns
    the value of the token immediately following it, which is asserted to be
    TEXT — i.e. the tag must have no attributes or child elements.
    """
    gfind(gen, lambda i: i == (START_TAG, tag))
    t, val = next(gen)
    assert t == TEXT
    return val


def tokenize(file):
    """Convenience wrapper: tokenize a file-like object into (type, value) tuples."""
    return XMLTokenizer(file).tokenize()