kopia lustrzana https://github.com/micropython/micropython-lib
116 wiersze
2.7 KiB
Python
116 wiersze
2.7 KiB
Python
![]() |
import string
|
||
|
|
||
|
TEXT = "TEXT"
|
||
|
START_TAG = "START_TAG"
|
||
|
START_TAG_DONE = "START_TAG_DONE"
|
||
|
END_TAG = "END_TAG"
|
||
|
PI = "PI"
|
||
|
PI_DONE = "PI_DONE"
|
||
|
ATTR = "ATTR"
|
||
|
ATTR_VAL = "ATTR_VAL"
|
||
|
|
||
|
class XMLSyntaxError(Exception):
|
||
|
pass
|
||
|
|
||
|
class XMLTokenizer:
|
||
|
|
||
|
def __init__(self, f):
|
||
|
self.f = f
|
||
|
self.nextch()
|
||
|
|
||
|
def curch(self):
|
||
|
return self.c
|
||
|
|
||
|
def getch(self):
|
||
|
c = self.c
|
||
|
self.nextch()
|
||
|
return c
|
||
|
|
||
|
def eof(self):
|
||
|
return self.c == ""
|
||
|
|
||
|
def nextch(self):
|
||
|
self.c = self.f.read(1)
|
||
|
if not self.c:
|
||
|
raise StopIteration
|
||
|
return self.c
|
||
|
|
||
|
def skip_ws(self):
|
||
|
while self.curch().isspace():
|
||
|
self.nextch()
|
||
|
|
||
|
def isident(self):
|
||
|
self.skip_ws()
|
||
|
return self.curch().isalpha()
|
||
|
|
||
|
def getident(self):
|
||
|
self.skip_ws()
|
||
|
ident = ""
|
||
|
while self.curch() in (string.ascii_letters + string.digits):
|
||
|
ident += self.getch()
|
||
|
return ident
|
||
|
|
||
|
def match(self, c):
|
||
|
self.skip_ws()
|
||
|
if self.curch() == c:
|
||
|
self.nextch()
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
def expect(self, c):
|
||
|
if not self.match(c):
|
||
|
raise XMLSyntaxError
|
||
|
|
||
|
def lex_attrs_till(self):
|
||
|
while self.isident():
|
||
|
attr = self.getident()
|
||
|
yield (ATTR, attr)
|
||
|
self.expect("=")
|
||
|
self.expect('"')
|
||
|
val = ""
|
||
|
while self.curch() != '"':
|
||
|
val += self.getch()
|
||
|
yield (ATTR_VAL, val)
|
||
|
self.expect('"')
|
||
|
|
||
|
def tokenize(self):
|
||
|
while not self.eof():
|
||
|
if self.match("<"):
|
||
|
if self.match("/"):
|
||
|
yield (END_TAG, self.getident())
|
||
|
self.expect(">")
|
||
|
elif self.match("?"):
|
||
|
yield (PI, self.getident())
|
||
|
yield from self.lex_attrs_till()
|
||
|
self.expect("?")
|
||
|
self.expect(">")
|
||
|
else:
|
||
|
tag = self.getident()
|
||
|
yield (START_TAG, tag)
|
||
|
yield from self.lex_attrs_till()
|
||
|
if self.match("/"):
|
||
|
yield (END_TAG, tag)
|
||
|
self.expect(">")
|
||
|
else:
|
||
|
text = ""
|
||
|
while self.curch() != "<":
|
||
|
text += self.getch()
|
||
|
if text:
|
||
|
yield (TEXT, text)
|
||
|
|
||
|
|
||
|
def gfind(gen, pred):
|
||
|
for i in gen:
|
||
|
if pred(i):
|
||
|
return i
|
||
|
|
||
|
def text_of(gen, tag):
|
||
|
# Return text content of a leaf tag
|
||
|
gfind(gen, lambda i: i == (START_TAG, tag))
|
||
|
t, val = next(gen)
|
||
|
assert t == TEXT
|
||
|
return val
|
||
|
|
||
|
def tokenize(file):
|
||
|
return XMLTokenizer(file).tokenize()
|