micropython-lib/xmltok/xmltok.py

import string

# Token type tags.  Every token produced by XMLTokenizer.tokenize() is a
# 2-tuple whose first element is one of these strings.

# Character data found between tags; the value is the text itself.
TEXT = "TEXT"
# Opening tag; the value is the tag name.
START_TAG = "START_TAG"
# NOTE(review): not yielded by any code visible in this file.
START_TAG_DONE = "START_TAG_DONE"
# Closing tag; also yielded right after START_TAG/attributes for a
# self-closing tag ("<name/>").  The value is the tag name.
END_TAG = "END_TAG"
# Processing instruction ("<?name ...?>"); the value is the name after "<?".
PI = "PI"
# NOTE(review): not yielded by any code visible in this file.
PI_DONE = "PI_DONE"
# Attribute name inside a start tag or processing instruction.
ATTR = "ATTR"
# Attribute value (text between the double quotes); follows its ATTR token.
ATTR_VAL = "ATTR_VAL"

class XMLSyntaxError(Exception):
    """Signals that the tokenizer met input it could not accept.

    Raised, for example, by ``XMLTokenizer.expect`` when the required
    character is not the next non-whitespace character of the input.
    """

class XMLTokenizer:
    """Streaming tokenizer for a small subset of XML.

    Characters are pulled one at a time from a file-like object and turned
    into ``(token_type, value)`` tuples by :meth:`tokenize`.  Recognised
    constructs are start tags, end tags, self-closing tags, processing
    instructions, double-quoted attributes and text.  Names consist of ASCII
    letters and digits only.  Constructs introduced by ``<!`` (comments,
    DOCTYPE, CDATA) are not recognised and end in ``XMLSyntaxError``.

    End of input is represented by an empty current character instead of an
    exception.  Raising ``StopIteration`` from inside a generator is turned
    into ``RuntimeError`` by CPython 3.7+ (PEP 479), so the token generator
    now finishes normally on a complete document and raises
    ``XMLSyntaxError`` on a truncated one.
    """

    # Characters accepted in tag, attribute and PI-target names; built once
    # rather than on every character test.
    _IDENT_CHARS = string.ascii_letters + string.digits

    def __init__(self, f):
        """Wrap *f* and load its first character.

        *f* is any object whose ``read(1)`` returns the next character, or
        an empty value once the input is exhausted.
        """
        self.f = f
        self.nextch()

    def curch(self):
        """Return the current (not yet consumed) character, "" at EOF."""
        return self.c

    def getch(self):
        """Consume and return the current character."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """Return True once the input is exhausted."""
        return not self.c

    def nextch(self):
        """Advance to the next character and return it ("" at EOF)."""
        self.c = self.f.read(1)
        return self.c

    def skip_ws(self):
        """Advance past any whitespace at the current position."""
        # "".isspace() is False, so this also stops at end of input.
        while self.curch().isspace():
            self.nextch()

    def isident(self):
        """Skip whitespace and report whether a name starts here."""
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        """Skip whitespace, then consume and return a name (may be "")."""
        self.skip_ws()
        chars = []
        # The explicit EOF test is required: "" is contained in every
        # string, so the membership test alone would never fail at EOF.
        while not self.eof() and self.curch() in self._IDENT_CHARS:
            chars.append(self.getch())
        return "".join(chars)

    def match(self, c):
        """Skip whitespace; consume *c* if it is next and return True."""
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Like :meth:`match`, but raise ``XMLSyntaxError`` on a mismatch."""
        if not self.match(c):
            found = self.curch() or "end of input"
            raise XMLSyntaxError("expected %r but found %r" % (c, found))

    def _read_until(self, stop):
        # Collect characters up to, but not including, `stop` or EOF.
        chars = []
        while not self.eof() and self.curch() != stop:
            chars.append(self.getch())
        return "".join(chars)

    def lex_attrs_till(self):
        """Yield ``(ATTR, name)`` / ``(ATTR_VAL, value)`` pairs.

        Stops at the first position where no name starts.  Only
        double-quoted values are accepted.

        Raises:
            XMLSyntaxError: on a missing ``=`` or quote, or when the input
                ends inside a value.
        """
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = self._read_until('"')
            if self.eof():
                # Do not hand out a value that was cut short by EOF.
                raise XMLSyntaxError("unterminated attribute value")
            yield (ATTR_VAL, val)
            self.expect('"')

    def tokenize(self):
        """Generate ``(token_type, value)`` tuples until end of input.

        Whitespace preceding a tag or a text run is skipped (``match`` skips
        it while probing for ``<``); whitespace inside or after text is kept.
        A self-closing tag yields START_TAG, its attributes, then END_TAG.

        Raises:
            XMLSyntaxError: on malformed or truncated markup.
        """
        while not self.eof():
            if self.match("<"):
                if self.match("/"):
                    yield (END_TAG, self.getident())
                    self.expect(">")
                elif self.match("?"):
                    yield (PI, self.getident())
                    yield from self.lex_attrs_till()
                    self.expect("?")
                    self.expect(">")
                else:
                    tag = self.getident()
                    yield (START_TAG, tag)
                    yield from self.lex_attrs_till()
                    if self.match("/"):
                        # "<tag/>": synthesize the matching END_TAG.
                        yield (END_TAG, tag)
                    self.expect(">")
            else:
                # Text runs to the next "<" or to the end of the input.
                text = self._read_until("<")
                if text:
                    yield (TEXT, text)


def gfind(gen, pred):
    """Return the first item of *gen* for which ``pred(item)`` is true.

    Items are consumed from *gen* up to and including the match, so an
    iterator passed in is left positioned just after it.  Returns None when
    no item satisfies *pred*.
    """
    return next(filter(pred, gen), None)

def text_of(gen, tag):
    """Return the text content of a leaf tag.

    Consumes tokens from *gen* through the first ``(START_TAG, tag)`` and
    then requires the very next token to be TEXT.  A tag that carries
    attributes therefore fails the assertion, because its ATTR token comes
    before the text.  If no such start tag exists, *gen* is exhausted and
    the following ``next()`` raises StopIteration.
    """
    wanted = (START_TAG, tag)
    # Only the position of gen matters here; the matched token is discarded.
    gfind(gen, lambda tok: tok == wanted)
    kind, text = next(gen)
    assert kind == TEXT
    return text

def tokenize(file):
    """Return a generator of ``(token_type, value)`` tuples for *file*.

    *file* is handed to XMLTokenizer, which reads it one character at a time
    through ``file.read(1)``.
    """
    tokenizer = XMLTokenizer(file)
    return tokenizer.tokenize()
xmltok: Initial draft version of XML tokenizer. API roughly inspired by stdlib "tokenize" module. 2015-10-10 10:10:11 +00:00			`import string`

			`TEXT = "TEXT"`
			`START_TAG = "START_TAG"`
			`START_TAG_DONE = "START_TAG_DONE"`
			`END_TAG = "END_TAG"`
			`PI = "PI"`
			`PI_DONE = "PI_DONE"`
			`ATTR = "ATTR"`
			`ATTR_VAL = "ATTR_VAL"`

			`class XMLSyntaxError(Exception):`
			`pass`

			`class XMLTokenizer:`

			`def __init__(self, f):`
			`self.f = f`
			`self.nextch()`

			`def curch(self):`
			`return self.c`

			`def getch(self):`
			`c = self.c`
			`self.nextch()`
			`return c`

			`def eof(self):`
			`return self.c == ""`

			`def nextch(self):`
			`self.c = self.f.read(1)`
			`if not self.c:`
			`raise StopIteration`
			`return self.c`

			`def skip_ws(self):`
			`while self.curch().isspace():`
			`self.nextch()`

			`def isident(self):`
			`self.skip_ws()`
			`return self.curch().isalpha()`

			`def getident(self):`
			`self.skip_ws()`
			`ident = ""`
			`while self.curch() in (string.ascii_letters + string.digits):`
			`ident += self.getch()`
			`return ident`

			`def match(self, c):`
			`self.skip_ws()`
			`if self.curch() == c:`
			`self.nextch()`
			`return True`
			`return False`

			`def expect(self, c):`
			`if not self.match(c):`
			`raise XMLSyntaxError`

			`def lex_attrs_till(self):`
			`while self.isident():`
			`attr = self.getident()`
			`yield (ATTR, attr)`
			`self.expect("=")`
			`self.expect('"')`
			`val = ""`
			`while self.curch() != '"':`
			`val += self.getch()`
			`yield (ATTR_VAL, val)`
			`self.expect('"')`

			`def tokenize(self):`
			`while not self.eof():`
			`if self.match("<"):`
			`if self.match("/"):`
			`yield (END_TAG, self.getident())`
			`self.expect(">")`
			`elif self.match("?"):`
			`yield (PI, self.getident())`
			`yield from self.lex_attrs_till()`
			`self.expect("?")`
			`self.expect(">")`
			`else:`
			`tag = self.getident()`
			`yield (START_TAG, tag)`
			`yield from self.lex_attrs_till()`
			`if self.match("/"):`
			`yield (END_TAG, tag)`
			`self.expect(">")`
			`else:`
			`text = ""`
			`while self.curch() != "<":`
			`text += self.getch()`
			`if text:`
			`yield (TEXT, text)`


			`def gfind(gen, pred):`
			`for i in gen:`
			`if pred(i):`
			`return i`

			`def text_of(gen, tag):`
			`# Return text content of a leaf tag`
			`gfind(gen, lambda i: i == (START_TAG, tag))`
			`t, val = next(gen)`
			`assert t == TEXT`
			`return val`

			`def tokenize(file):`
			`return XMLTokenizer(file).tokenize()`