Mirror of https://github.com/micropython/micropython-lib
xmltok: Initial draft version of XML tokenizer.
API roughly inspired by stdlib "tokenize" module.
parent 2bb224b562, commit cebf1973bc
@@ -0,0 +1,115 @@
import string


# Token types produced by the tokenizer. START_TAG_DONE and PI_DONE are
# defined but not yet emitted by this draft.
TEXT = "TEXT"
START_TAG = "START_TAG"
START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
PI_DONE = "PI_DONE"
ATTR = "ATTR"
ATTR_VAL = "ATTR_VAL"

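# Illustrative sketch (not from the original commit): for input like
# '<?xml version="1.0"?><a href="x">click</a>' the tokenizer below yields
# roughly this stream of (type, value) pairs:
#   (PI, "xml"), (ATTR, "version"), (ATTR_VAL, "1.0"),
#   (START_TAG, "a"), (ATTR, "href"), (ATTR_VAL, "x"),
#   (TEXT, "click"), (END_TAG, "a")
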
class XMLSyntaxError(Exception):
    pass

class XMLTokenizer:
    # Scans the underlying stream with a single character of lookahead,
    # kept in self.c.

    def __init__(self, f):
        self.f = f
        self.nextch()

    def curch(self):
        return self.c

    def getch(self):
        c = self.c
        self.nextch()
        return c

    def eof(self):
        return self.c == ""

    def nextch(self):
        self.c = self.f.read(1)
        if not self.c:
            raise StopIteration
        return self.c

    def skip_ws(self):
        while self.curch().isspace():
            self.nextch()

    def isident(self):
        self.skip_ws()
        return self.curch().isalpha()

    def getident(self):
        self.skip_ws()
        ident = ""
        while self.curch() in (string.ascii_letters + string.digits):
            ident += self.getch()
        return ident

    # match(c) consumes c (after optional whitespace) and reports whether it
    # was present; expect(c) is the hard-failing variant.

    def match(self, c):
        self.skip_ws()
        if self.curch() == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        if not self.match(c):
            raise XMLSyntaxError

    def lex_attrs_till(self):
        while self.isident():
            attr = self.getident()
            yield (ATTR, attr)
            self.expect("=")
            self.expect('"')
            val = ""
            while self.curch() != '"':
                val += self.getch()
            yield (ATTR_VAL, val)
            self.expect('"')

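    # Illustrative sketch (not from the original commit): inside a tag such
    # as <t a="1" b="2">, lex_attrs_till() yields
    #   (ATTR, "a"), (ATTR_VAL, "1"), (ATTR, "b"), (ATTR_VAL, "2")
    # and stops with the lookahead on ">".
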
    def tokenize(self):
        # nextch() signals end of input by raising StopIteration; under
        # PEP 479 (Python 3.7+) that would surface as RuntimeError inside a
        # generator, so it is caught here to terminate the token stream.
        try:
            while not self.eof():
                if self.match("<"):
                    if self.match("/"):
                        yield (END_TAG, self.getident())
                        self.expect(">")
                    elif self.match("?"):
                        yield (PI, self.getident())
                        yield from self.lex_attrs_till()
                        self.expect("?")
                        self.expect(">")
                    else:
                        tag = self.getident()
                        yield (START_TAG, tag)
                        yield from self.lex_attrs_till()
                        if self.match("/"):
                            yield (END_TAG, tag)
                        self.expect(">")
                else:
                    text = ""
                    while self.curch() != "<":
                        text += self.getch()
                    if text:
                        yield (TEXT, text)
        except StopIteration:
            pass


def gfind(gen, pred):
    # Return the first item from gen for which pred(item) is true.
    for i in gen:
        if pred(i):
            return i

def text_of(gen, tag):
    # Return text content of a leaf tag (one with no attributes and plain
    # character data inside).
    gfind(gen, lambda i: i == (START_TAG, tag))
    t, val = next(gen)
    assert t == TEXT
    return val

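# Illustrative sketch (not from the original commit): given a token stream
# for '<feed><title>Hello</title></feed>', text_of(stream, "title") skips
# ahead to (START_TAG, "title") and returns "Hello".
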
def tokenize(file):
    # Convenience entry point, mirroring the "API roughly inspired by stdlib
    # tokenize" noted in the commit message: take an open file-like object,
    # return a token generator.
    return XMLTokenizer(file).tokenize()
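A short usage sketch, not part of the commit itself: it assumes the file is
importable as xmltok and uses io.StringIO to stand in for a real file object.

    import io
    import xmltok

    doc = io.StringIO('<rss version="2.0"><title>Demo</title></rss>')
    for tok in xmltok.tokenize(doc):
        print(tok)
    # Prints, in order:
    #   ('START_TAG', 'rss'), ('ATTR', 'version'), ('ATTR_VAL', '2.0'),
    #   ('START_TAG', 'title'), ('TEXT', 'Demo'),
    #   ('END_TAG', 'title'), ('END_TAG', 'rss')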