From ae436797927c3c9f7ccdc25dd78af3dd279ca7ff Mon Sep 17 00:00:00 2001 From: Damien George Date: Fri, 17 Feb 2017 11:10:35 +1100 Subject: [PATCH] py/lexer: Use strcmp to make keyword searching more efficient. Since the table of keywords is sorted, we can use strcmp to do the search and stop part way through the search if the comparison is less-than. Because all tokens that are names are subject to this search, this optimisation will improve the overall speed of the lexer when processing a script. The change also decreases code size by a little bit because we now use strcmp instead of the custom str_strn_equal function. --- py/lexer.c | 31 +++++++++++-------------------- py/lexer.h | 17 +++++++++-------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/py/lexer.c b/py/lexer.c index 6a3fa656b1..5c942f9344 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -25,6 +25,7 @@ */ #include <stdio.h> +#include <string.h> #include <assert.h> #include "py/mpstate.h" @@ -39,19 +40,6 @@ // TODO seems that CPython allows NULL byte in the input stream // don't know if that's intentional or not, but we don't allow it -// TODO replace with a call to a standard function -STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) { - mp_uint_t i = 0; - - while (i < len && *str == *strn) { - ++i; - ++str; - ++strn; - } - - return i == len && *str == 0; -} - #define MP_LEXER_EOF ((unichar)MP_READER_EOF) #define CUR_CHAR(lex) ((lex)->chr0) @@ -225,10 +213,12 @@ STATIC const uint8_t tok_enc_kind[] = { }; // must have the same order as enum in lexer.h +// must be sorted according to strcmp STATIC const char *const tok_kw[] = { "False", "None", "True", + "__debug__", "and", "as", "assert", @@ -263,7 +253,6 @@ STATIC const char *const tok_kw[] = { "while", "with", "yield", - "__debug__", }; // This is called with CUR_CHAR() before first hex digit, and should return with @@ -531,16 +520,18 @@ void mp_lexer_to_next(mp_lexer_t *lex) { // We also check for __debug__ here and convert it to its value. 
This is // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we // need to check for this special token in many places in the compiler. - // TODO improve speed of these string comparisons + const char *s = vstr_null_terminated_str(&lex->vstr); for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) { - if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) { - if (i == MP_ARRAY_SIZE(tok_kw) - 1) { - // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__" + int cmp = strcmp(s, tok_kw[i]); + if (cmp == 0) { + lex->tok_kind = MP_TOKEN_KW_FALSE + i; + if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) { lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE); - } else { - lex->tok_kind = MP_TOKEN_KW_FALSE + i; } break; + } else if (cmp < 0) { + // Table is sorted and comparison was less-than, so stop searching + break; } } diff --git a/py/lexer.h b/py/lexer.h index 32aef96266..d407192856 100644 --- a/py/lexer.h +++ b/py/lexer.h @@ -61,6 +61,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_KW_FALSE, // 14 MP_TOKEN_KW_NONE, MP_TOKEN_KW_TRUE, + MP_TOKEN_KW___DEBUG__, MP_TOKEN_KW_AND, MP_TOKEN_KW_AS, MP_TOKEN_KW_ASSERT, @@ -71,7 +72,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_KW_BREAK, MP_TOKEN_KW_CLASS, MP_TOKEN_KW_CONTINUE, - MP_TOKEN_KW_DEF, // 23 + MP_TOKEN_KW_DEF, MP_TOKEN_KW_DEL, MP_TOKEN_KW_ELIF, MP_TOKEN_KW_ELSE, @@ -81,7 +82,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_KW_FROM, MP_TOKEN_KW_GLOBAL, MP_TOKEN_KW_IF, - MP_TOKEN_KW_IMPORT, // 33 + MP_TOKEN_KW_IMPORT, MP_TOKEN_KW_IN, MP_TOKEN_KW_IS, MP_TOKEN_KW_LAMBDA, @@ -91,12 +92,12 @@ typedef enum _mp_token_kind_t { MP_TOKEN_KW_PASS, MP_TOKEN_KW_RAISE, MP_TOKEN_KW_RETURN, - MP_TOKEN_KW_TRY, // 43 + MP_TOKEN_KW_TRY, MP_TOKEN_KW_WHILE, MP_TOKEN_KW_WITH, MP_TOKEN_KW_YIELD, - MP_TOKEN_OP_PLUS, // 47 + MP_TOKEN_OP_PLUS, MP_TOKEN_OP_MINUS, MP_TOKEN_OP_STAR, MP_TOKEN_OP_DBL_STAR, @@ -106,7 +107,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_OP_LESS, MP_TOKEN_OP_DBL_LESS, 
MP_TOKEN_OP_MORE, - MP_TOKEN_OP_DBL_MORE, // 57 + MP_TOKEN_OP_DBL_MORE, MP_TOKEN_OP_AMPERSAND, MP_TOKEN_OP_PIPE, MP_TOKEN_OP_CARET, @@ -116,7 +117,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_OP_DBL_EQUAL, MP_TOKEN_OP_NOT_EQUAL, - MP_TOKEN_DEL_PAREN_OPEN, // 66 + MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE, MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE, @@ -126,7 +127,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_PERIOD, MP_TOKEN_DEL_SEMICOLON, - MP_TOKEN_DEL_AT, // 76 + MP_TOKEN_DEL_AT, MP_TOKEN_DEL_EQUAL, MP_TOKEN_DEL_PLUS_EQUAL, MP_TOKEN_DEL_MINUS_EQUAL, @@ -136,7 +137,7 @@ typedef enum _mp_token_kind_t { MP_TOKEN_DEL_PERCENT_EQUAL, MP_TOKEN_DEL_AMPERSAND_EQUAL, MP_TOKEN_DEL_PIPE_EQUAL, - MP_TOKEN_DEL_CARET_EQUAL, // 86 + MP_TOKEN_DEL_CARET_EQUAL, MP_TOKEN_DEL_DBL_MORE_EQUAL, MP_TOKEN_DEL_DBL_LESS_EQUAL, MP_TOKEN_DEL_DBL_STAR_EQUAL,