Rework TokenStream

2025-03-05 16:07:05 +01:00 · 2025-03-05 16:07:05 +01:00 · 0c3ac0a308
commit 0c3ac0a308
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/EcmaScriptTokenStream.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/EcmaScriptTokenStream.java
@ -1,17 +1,28 @@
-package org.schabi.newpipe.extractor.utils.jsextractor;
-
-import org.mozilla.javascript.Context;
-import org.mozilla.javascript.Kit;
-import org.mozilla.javascript.ScriptRuntime;
-import org.schabi.newpipe.extractor.exceptions.ParsingException;
-
-/* Source: Mozilla Rhino, org.mozilla.javascript.Token
+/*
+ * Source: Mozilla Rhino, org.mozilla.javascript.TokenStream
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- * */
-class TokenStream {
+ *
+ */
+package org.schabi.newpipe.extractor.utils.jsextractor;
+
+import org.mozilla.javascript.Kit;
+import org.mozilla.javascript.ScriptRuntime;
+import org.schabi.newpipe.extractor.exceptions.ParsingException;
+
+/**
+ * Based on Mozilla Rhino's (v1.7.14) org.mozilla.javascript.TokenStream
+ * <p>
+ * Changes:
+ * <ul>
+ *     <li>Tailored for {@link Lexer}</li>
+ *     <li>Removed all unneeded code to improve performance</li>
+ *     <li>Optimized for ECMAScript6/2015</li>
+ * </ul>
+ */
+class EcmaScriptTokenStream {
    /*
     * For chars - because we need something out-of-range
     * to check.  (And checking EOF by exception is annoying.)
@ -28,125 +39,17 @@ class TokenStream {
    private static final char BYTE_ORDER_MARK = '\uFEFF';
    private static final char NUMERIC_SEPARATOR = '_';

-    TokenStream(final String sourceString, final int lineno, final int languageVersion) {
+    EcmaScriptTokenStream(final String sourceString, final int lineno, final boolean strictMode) {
        this.sourceString = sourceString;
        this.sourceCursor = 0;
        this.cursor = 0;

        this.lineno = lineno;
-        this.languageVersion = languageVersion;
+        this.strictMode = strictMode;
    }

-    private static Token stringToKeyword(
-            final String name,
-            final int version,
-            final boolean isStrict) {
-        if (version < Context.VERSION_ES6) {
-            return stringToKeywordForJS(name);
-        }
-        return stringToKeywordForES(name, isStrict);
-    }
-
-    /** JavaScript 1.8 and earlier */
-    private static Token stringToKeywordForJS(final String name) {
-        switch (name) {
-            case "break":
-                return Token.BREAK;
-            case "case":
-                return Token.CASE;
-            case "continue":
-                return Token.CONTINUE;
-            case "default":
-                return Token.DEFAULT;
-            case "delete":
-                return Token.DELPROP;
-            case "do":
-                return Token.DO;
-            case "else":
-                return Token.ELSE;
-            case "export":
-                return Token.EXPORT;
-            case "false":
-                return Token.FALSE;
-            case "for":
-                return Token.FOR;
-            case "function":
-                return Token.FUNCTION;
-            case "if":
-                return Token.IF;
-            case "in":
-                return Token.IN;
-            case "let":
-                return Token.LET;
-            case "new":
-                return Token.NEW;
-            case "null":
-                return Token.NULL;
-            case "return":
-                return Token.RETURN;
-            case "switch":
-                return Token.SWITCH;
-            case "this":
-                return Token.THIS;
-            case "true":
-                return Token.TRUE;
-            case "typeof":
-                return Token.TYPEOF;
-            case "var":
-                return Token.VAR;
-            case "void":
-                return Token.VOID;
-            case "while":
-                return Token.WHILE;
-            case "with":
-                return Token.WITH;
-            case "yield":
-                return Token.YIELD;
-            case "throw":
-                return Token.THROW;
-            case "catch":
-                return Token.CATCH;
-            case "const":
-                return Token.CONST;
-            case "debugger":
-                return Token.DEBUGGER;
-            case "finally":
-                return Token.FINALLY;
-            case "instanceof":
-                return Token.INSTANCEOF;
-            case "try":
-                return Token.TRY;
-            case "abstract":
-            case "boolean":
-            case "byte":
-            case "char":
-            case "class":
-            case "double":
-            case "enum":
-            case "extends":
-            case "final":
-            case "float":
-            case "goto":
-            case "implements":
-            case "import":
-            case "int":
-            case "interface":
-            case "long":
-            case "native":
-            case "package":
-            case "private":
-            case "protected":
-            case "public":
-            case "short":
-            case "static":
-            case "super":
-            case "synchronized":
-            case "throws":
-            case "transient":
-            case "volatile":
-                return Token.RESERVED;
-        }
-        return Token.EOF;
+    private Token stringToKeyword(final String name) {
+        return stringToKeywordForES(name, strictMode);
    }

    /** ECMAScript 6. */
@ -346,19 +249,9 @@ class TokenStream {
                    // check if it's a keyword.

                    // Return the corresponding token if it's a keyword
-                    Token result = stringToKeyword(str, languageVersion, STRICT_MODE);
+                    final Token result = stringToKeyword(str);
                    if (result != Token.EOF) {
-                        if ((result == Token.LET || result == Token.YIELD)
-                                && languageVersion < Context.VERSION_1_7) {
-                            result = Token.NAME;
-                        }
-                        // Save the string in case we need to use in
-                        // object literal definitions.
-                        if (result != Token.RESERVED
-                                || languageVersion >= Context.VERSION_ES6
-                                || !IS_RESERVED_KEYWORD_AS_IDENTIFIER) {
-                            return result;
-                        }
+                        return result; // ES6: a recognized keyword is never downgraded to NAME
                    }
                }
                return Token.NAME;
@ -368,7 +261,6 @@ class TokenStream {
            if (isDigit(c) || (c == '.' && isDigit(peekChar()))) {
                stringBufferTop = 0;
                int base = 10;
-                final boolean es6 = languageVersion >= Context.VERSION_ES6;
                boolean isOldOctal = false;

                if (c == '0') {
@ -376,10 +268,10 @@ class TokenStream {
                    if (c == 'x' || c == 'X') {
                        base = 16;
                        c = getChar();
-                    } else if (es6 && (c == 'o' || c == 'O')) {
+                    } else if (c == 'o' || c == 'O') {
                        base = 8;
                        c = getChar();
-                    } else if (es6 && (c == 'b' || c == 'B')) {
+                    } else if (c == 'b' || c == 'B') {
                        base = 2;
                        c = getChar();
                    } else if (isDigit(c)) {
@ -422,7 +314,7 @@ class TokenStream {
                    throw new ParsingException("number format error");
                }

-                if (es6 && c == 'n') {
+                if (c == 'n') {
                    c = getChar();
                } else if (base == 10 && (c == '.' || c == 'e' || c == 'E')) {
                    if (c == '.') {
@ -705,7 +597,7 @@ class TokenStream {
                    return Token.GT;

                case '*':
-                    if (languageVersion >= Context.VERSION_ES6 && matchChar('*')) {
+                    if (matchChar('*')) {
                        if (matchChar('=')) {
                            return Token.ASSIGN_EXP;
                        }
@ -1080,18 +972,16 @@ class TokenStream {

    // sourceCursor is an index into a small buffer that keeps a
    // sliding window of the source stream.
-    int sourceCursor;
+    private int sourceCursor;

    // cursor is a monotonically increasing index into the original
    // source stream, tracking exactly how far scanning has progressed.
    // Its value is the index of the next character to be scanned.
-    int cursor;
+    private int cursor;

    // Record start and end positions of last scanned token.
    int tokenBeg;
    int tokenEnd;

-    private final int languageVersion;
-    private static final boolean IS_RESERVED_KEYWORD_AS_IDENTIFIER = true;
-    private static final boolean STRICT_MODE = false;
+    private final boolean strictMode;
 }
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java
@ -1,6 +1,5 @@
 package org.schabi.newpipe.extractor.utils.jsextractor;

-import org.mozilla.javascript.Context;
 import org.schabi.newpipe.extractor.exceptions.ParsingException;

 import java.util.Stack;
@ -119,7 +118,7 @@ public class Lexer {
        }
    }

-    private final TokenStream stream;
+    private final EcmaScriptTokenStream stream;
    private final LookBehind lastThree;
    private final Stack<Brace> braceStack;
    private final Stack<Paren> parenStack;
@ -128,24 +127,14 @@ public class Lexer {
     * Create a new JavaScript lexer with the given source code
     *
     * @param js JavaScript code
-     * @param languageVersion JavaScript version (from Rhino)
     */
-    public Lexer(final String js, final int languageVersion) {
-        stream = new TokenStream(js, 0, languageVersion);
+    public Lexer(final String js) {
+        stream = new EcmaScriptTokenStream(js, 0, false);
        lastThree = new LookBehind();
        braceStack = new Stack<>();
        parenStack = new Stack<>();
    }

-    /**
-     * Create a new JavaScript lexer with the given source code
-     *
-     * @param js JavaScript code
-     */
-    public Lexer(final String js) {
-        this(js, Context.VERSION_DEFAULT);
-    }
-
    /**
     * Continue parsing and return the next token
     * @return next token