import string

# Token type tags yielded by XMLTokenizer.tokenize().
TEXT = "TEXT"
START_TAG = "START_TAG"
START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
PI_DONE = "PI_DONE"
ATTR = "ATTR"
ATTR_VAL = "ATTR_VAL"

# Characters allowed inside a name after the leading letter.
# Besides letters/digits this includes "_-.:", which are legal XML
# NameChars (namespaced tags like <x:y>, attributes like data-id).
_IDENT_CHARS = string.ascii_letters + string.digits + "_-.:"


class XMLSyntaxError(Exception):
    """Raised when the input does not match the expected XML syntax."""


class XMLTokenizer:
    """Minimal streaming XML tokenizer.

    Reads one character at a time from a file-like object *f* and
    yields ``(token_type, value)`` pairs via :meth:`tokenize`.  Only a
    small subset of XML is handled: start/end tags, self-closing tags,
    double-quoted attributes, processing instructions, and text.
    """

    def __init__(self, f):
        self.f = f
        self.nextch()  # prime self.c with the first character

    def curch(self):
        """Return the current character without consuming it."""
        return self.c

    def getch(self):
        """Consume and return the current character."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """True once the underlying stream is exhausted."""
        return self.c == ""

    def nextch(self):
        # Signals end of input by raising StopIteration; tokenize()
        # converts that into normal generator termination (PEP 479
        # forbids letting it escape a generator body).
        self.c = self.f.read(1)
        if not self.c:
            raise StopIteration
        return self.c

    def skip_ws(self):
        """Advance past any whitespace."""
        while self.curch().isspace():
            self.nextch()

    def isident(self):
        """True if (after whitespace) the next char can start a name."""
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        """Consume and return a name (letters, digits, and ``_-.:``)."""
        self.skip_ws()
        ident = ""
        while self.curch() in _IDENT_CHARS:
            ident += self.getch()
        return ident

    def match(self, c):
        """Consume *c* if it is the next non-space char; report success."""
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Consume *c* or raise XMLSyntaxError with a diagnostic."""
        if not self.match(c):
            raise XMLSyntaxError(
                "expected %r, got %r" % (c, self.curch())
            )

    def lex_attrs_till(self):
        """Yield (ATTR, name) / (ATTR_VAL, value) pairs while names follow."""
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = ""
            # Attribute values are read verbatim up to the closing quote.
            while self.curch() != '"':
                val += self.getch()
            yield (ATTR_VAL, val)
            self.expect('"')

    def tokenize(self):
        """Yield ``(token_type, value)`` pairs until end of input.

        Raises XMLSyntaxError on malformed markup.
        """
        # nextch() raises StopIteration at EOF.  Since PEP 479 a
        # StopIteration escaping a generator becomes RuntimeError, so
        # catch it here and end iteration cleanly instead.
        try:
            while not self.eof():
                if self.match("<"):
                    if self.match("/"):
                        # Closing tag: </name>
                        yield (END_TAG, self.getident())
                        self.expect(">")
                    elif self.match("?"):
                        # Processing instruction: <?name attrs?>
                        yield (PI, self.getident())
                        yield from self.lex_attrs_till()
                        self.expect("?")
                        self.expect(">")
                    else:
                        # Start tag, possibly self-closing: <name attrs/?>
                        tag = self.getident()
                        yield (START_TAG, tag)
                        yield from self.lex_attrs_till()
                        if self.match("/"):
                            yield (END_TAG, tag)
                        self.expect(">")
                else:
                    # Text content up to the next markup character.
                    text = ""
                    while self.curch() != "<":
                        text += self.getch()
                    if text:
                        yield (TEXT, text)
        except StopIteration:
            # End of input reached; stop yielding tokens.
            return


def gfind(gen, pred):
    """Return the first item of *gen* satisfying *pred*, or None."""
    for i in gen:
        if pred(i):
            return i
    return None


def text_of(gen, tag):
    """Return the text content of the first leaf element named *tag*.

    Raises XMLSyntaxError if the tag is absent or has no text child
    (previously this used ``assert``, which vanishes under ``-O`` and
    could leak StopIteration to the caller).
    """
    if gfind(gen, lambda i: i == (START_TAG, tag)) is None:
        raise XMLSyntaxError("tag %r not found" % tag)
    try:
        t, val = next(gen)
    except StopIteration:
        raise XMLSyntaxError("tag %r has no content" % tag) from None
    if t != TEXT:
        raise XMLSyntaxError("tag %r has no text content" % tag)
    return val


def tokenize(file):
    """Tokenize *file* (a file-like object) into (type, value) pairs."""
    return XMLTokenizer(file).tokenize()