feat: NumberTokenScanner 增加规则约束

2025-06-29 23:56:21 +08:00 · 2025-06-29 23:56:21 +08:00 · 6a247f456c
commit 6a247f456c
parent cb4faf0632
2 changed files with 130 additions and 101 deletions
--- a/src/main/java/org/jcnc/snow/compiler/lexer/core/LexerContext.java
+++ b/src/main/java/org/jcnc/snow/compiler/lexer/core/LexerContext.java
@ -3,60 +3,72 @@ package org.jcnc.snow.compiler.lexer.core;
 import org.jcnc.snow.compiler.lexer.base.TokenScanner;

 /**
- * {@code LexerContext} 是词法分析阶段的上下文状态管理器。
+ * {@code LexerContext} —— 词法分析阶段的上下文状态管理器。<br>
 * <p>
- * 该类提供对源代码字符流的读取访问，追踪当前行号与列号，
- * 并支持字符匹配、回看与指针推进等操作，是 {@link TokenScanner} 实现进行词法识别的重要支撑工具。
+ * 提供对源代码字符流的读取访问、行列号追踪、指针推进与字符匹配等操作，
+ * 是 {@link TokenScanner} 实现进行词法识别的基础设施。
 * </p>
 * <p>
- * 所有源代码输入在构造时统一将 Windows 风格的换行符（\r\n）转换为 Unix 风格（\n），
- * 保证换行行为一致性。
+ * 设计要点：
+ * <ul>
+ *     <li>构造时统一将 Windows 换行符 (<code>\r\n</code>) 转换为 Unix 风格 (<code>\n</code>)。</li>
+ *     <li>所有坐标均以 <strong>1</strong> 为起始行／列号，更贴合人类直觉。</li>
+ *     <li>提供 {@link #peekAhead(int)} 方法以支持“向前多字符查看”而不移动游标。</li>
+ * </ul>
 * </p>
 */
 public class LexerContext {

-    /** 源代码字符串，换行符已标准化为 \n */
+    /* ───────────────────────────────── 私有字段 ───────────────────────────────── */
+
+    /** 源代码字符串（换行符已标准化为 \n） */
    private final String source;

-    /** 当前扫描位置（从 0 开始的偏移） */
+    /** 当前扫描位置（自 0 起算的全局偏移量） */
    private int pos = 0;

-    /** 当前行号，从 1 开始 */
+    /** 当前行号（从 1 开始） */
    private int line = 1;

-    /** 当前列号，从 1 开始 */
+    /** 当前列号（从 1 开始） */
    private int col = 1;

-    /** 上一个字符对应的列号（用于位置精确记录） */
+    /** 上一个字符对应的列号（用于异常定位） */
    private int lastCol = 1;

+    /* ──────────────────────────────── 构造 & 基本信息 ─────────────────────────────── */
+
    /**
-     * Constructs a new {@code LexerContext} instance and normalizes its line endings.
+     * Creates a new {@code LexerContext}, replacing every "\r\n" sequence in the input with "\n".
     *
-     * @param source the raw source code string
+     * @param rawSource the raw source text
     */
-    public LexerContext(String source) {
-        this.source = source.replace("\r\n", "\n");
+    public LexerContext(String rawSource) {
+        this.source = rawSource.replace("\r\n", "\n");
    }

    /**
-     * Checks whether the end of the source code has been read.
+     * Checks whether the end of the source code has been reached.
     *
-     * @return {@code true} if the input is exhausted; {@code false} otherwise
+     * @return {@code true} when the cursor offset is at or beyond the source length
     */
    public boolean isAtEnd() {
        return pos >= source.length();
    }

+    /* ──────────────────────────────── 指针推进与查看 ─────────────────────────────── */
+
    /**
-     * 消费当前字符并前进一个位置，自动更新行列信息。
+     * 消费 <em>当前</em> 字符并前进一个位置，同时更新行列号。
     *
-     * @return 当前字符，若已结束则返回空字符（'\0'）
+     * @return 被消费的字符；若已结束则返回空字符 {@code '\0'}
     */
    public char advance() {
        if (isAtEnd()) return '\0';
+
        char c = source.charAt(pos++);
        lastCol = col;
+
        if (c == '\n') {
            line++;
            col = 1;
@ -67,9 +79,9 @@ public class LexerContext {
    }

    /**
-     * 查看当前位置的字符，但不前进。
+     * 查看当前位置字符，但不移动游标。
     *
-     * @return 当前字符，若结束则返回空字符
+     * @return 当前字符；若越界则返回 {@code '\0'}
     */
    public char peek() {
        return isAtEnd() ? '\0' : source.charAt(pos);
@ -78,17 +90,29 @@ public class LexerContext {
    /**
     * Looks at the character right after the current one without changing the position.
     *
-     * @return the next character, or the null character if the end is reached
+     * @return the next character; {@code '\0'} if that index is out of bounds
     */
    public char peekNext() {
        return pos + 1 >= source.length() ? '\0' : source.charAt(pos + 1);
    }

    /**
-     * 若当前字符与期望字符相同，则前进并返回 {@code true}，否则不动并返回 {@code false}。
+     * 向前查看 <em>offset</em> 个字符（不移动游标）。offset=1 等价于 {@link #peekNext()}。
     *
-     * @param expected 期待匹配的字符
-     * @return 是否匹配成功并消费
+     * @param offset 偏移量 (≥ 1)
+     * @return 指定偏移处的字符；若越界返回 {@code '\0'}
+     */
+    public char peekAhead(int offset) {
+        // A non-positive offset falls back to the current character, same as peek().
+        if (offset <= 0) return peek();
+        // Compare against the remaining length instead of computing pos + offset first:
+        // that sum overflows int for a very large offset, and the resulting negative
+        // index would make charAt throw instead of yielding '\0'.
+        return offset >= source.length() - pos ? '\0' : source.charAt(pos + offset);
+    }
+
+    /**
+     * 若当前位置字符等于 {@code expected}，则消费并返回 {@code true}；否则保持原位返回 {@code false}。
+     *
+     * @param expected 期望匹配的字符
+     * @return 是否匹配并消费
     */
    public boolean match(char expected) {
        if (isAtEnd() || source.charAt(pos) != expected) return false;
@ -96,30 +120,17 @@ public class LexerContext {
        return true;
    }

-    /**
-     * 获取当前位置的行号。
-     *
-     * @return 当前行号（从 1 开始）
-     */
-    public int getLine() {
-        return line;
-    }
+    /* ──────────────────────────────── 坐标查询 ─────────────────────────────── */

-    /**
-     * 获取当前位置的列号。
-     *
-     * @return 当前列号（从 1 开始）
-     */
-    public int getCol() {
-        return col;
-    }
+    /** @return the current line number (1-based) */
+    public int getLine()    { return line; }

-    /**
-     * 获取上一个字符所在的列号。
-     *
-     * @return 上一个字符对应的列位置
-     */
-    public int getLastCol() {
-        return lastCol;
-    }
+    /** @return the current column number (1-based) */
+    public int getCol()     { return col;  }
+
+    /** @return the column number of the previous character */
+    public int getLastCol() { return lastCol; }
+
+    /** @return the cursor's global offset into the source text (0-based) */
+    public int getPos()     { return pos;  }
 }
--- a/src/main/java/org/jcnc/snow/compiler/lexer/scanners/NumberTokenScanner.java
+++ b/src/main/java/org/jcnc/snow/compiler/lexer/scanners/NumberTokenScanner.java
@ -1,92 +1,110 @@
 package org.jcnc.snow.compiler.lexer.scanners;

 import org.jcnc.snow.compiler.lexer.core.LexerContext;
+import org.jcnc.snow.compiler.lexer.core.LexicalException;
 import org.jcnc.snow.compiler.lexer.token.Token;
 import org.jcnc.snow.compiler.lexer.token.TokenType;

 /**
- * Number scanner: recognizes integers, decimals, and numeric literals carrying a <strong>type suffix</strong>.
+ * Number scanner: recognizes integers, decimals, and numeric literals carrying a <strong>type suffix</strong>.<br>
 * <p>
- * Examples of supported formats:
+ * Supported format examples:
 * <ul>
- *     <li>Integers: 123, 0, 45678</li>
- *     <li>Decimals: 3.14, 0.5, 12.0</li>
- *     <li>With a type suffix: 2.0f, 42L, 7s, 255B</li>
+ *     <li>Integers: <code>123</code>, <code>0</code>, <code>45678</code></li>
+ *     <li>Decimals: <code>3.14</code>, <code>0.5</code>, <code>12.0</code></li>
+ *     <li>With a suffix: <code>2.0f</code>, <code>42L</code>, <code>7s</code>, <code>255B</code></li>
 * </ul>
+ * </p>
 * <p>
- * The syntax allows one of the following <strong>single-character suffixes</strong> at the end of a number (integer or decimal) to state the constant's type explicitly:
- * <pre>b | s | l | f | d   // byte, short, long, float, double respectively
- * B | S | L | F | D   // same as above; either case is accepted</pre>
- * The produced token type is always {@code NUMBER_LITERAL}; the token carries the full text (including the suffix, if present).
+ * Single-character type suffixes:
+ * <pre>
+ * b | s | l | f | d   // byte, short, long, float, double
+ * B | S | L | F | D   // same as above (either case is accepted)
+ * </pre>
+ * </p>
+ * <p>
+ * Rule constraints:<br>
+ * A {@link LexicalException} is thrown during lexing when the numeric body is followed by any of:
+ * <ul>
+ *     <li>whitespace followed by a letter (e.g. <code>3&nbsp;L</code>)</li>
+ *     <li>an immediately adjacent letter that is not a known suffix (e.g. <code>3E</code>)</li>
+ *     <li>an immediately adjacent <code>'/'</code> (e.g. <code>3/</code>, <code>3/*</code>)</li>
+ * </ul>
+ * This is intended to keep the compiler from entering an infinite loop.
+ * </p>
 */
 public class NumberTokenScanner extends AbstractTokenScanner {

-    /**
-     * Set of optional type-suffix characters (either case).
-     * Kept consistent with the suffix-parsing logic in {@code ExpressionBuilder}.
-     */
+    /** Set of valid type-suffix characters. */
    private static final String SUFFIX_CHARS = "bslfdBSLFD";

-    /**
-     * Determines whether the character at the current position can be handled.
-     * <p>A digit may be the start of a numeric literal.</p>
-     *
-     * @param c   the current character
-     * @param ctx the current lexer context
-     * @return true if the character is a digit
-     */
    @Override
    public boolean canHandle(char c, LexerContext ctx) {
        return Character.isDigit(c);
    }

-    /**
-     * Performs the number-scanning logic.
-     * <ol>
-     *     <li>Reads consecutive digit characters, allowing <strong>one</strong> decimal point, to recognize an integer or a decimal.</li>
-     *     <li>After the body, checks the next character <strong>once</strong> and absorbs it if it is a valid type suffix.</li>
-     * </ol>
-     * This ensures that something like {@code 2.0f} is treated as a single {@code NUMBER_LITERAL}
-     * rather than being split into the two tokens "2.0" and "f".
-     *
-     * @param ctx  the lexer context
-     * @param line the current line number
-     * @param col  the current column number
-     * @return a Token representing the numeric literal
-     */
    @Override
    protected Token scanToken(LexerContext ctx, int line, int col) {
-        StringBuilder sb = new StringBuilder();
-        boolean hasDot = false; // whether a decimal point has already been seen
+        StringBuilder literal = new StringBuilder();
+        boolean hasDot = false; // whether a decimal point has been seen

-        /*
-         * 1️⃣ Scan the integer or decimal body
-         * One decimal point is allowed; everything else must be a digit.
-         */
+        /* 1. Read the numeric body (integer / decimal) */
        while (!ctx.isAtEnd()) {
            char c = ctx.peek();
            if (c == '.' && !hasDot) {
                hasDot = true;
-                sb.append(ctx.advance());
+                literal.append(ctx.advance());
            } else if (Character.isDigit(c)) {
-                sb.append(ctx.advance());
+                literal.append(ctx.advance());
            } else {
-                break; // a non-digit or a second dot => the body ends
+                break;
            }
        }

-        /*
-         * 2️⃣ Optional type suffix
-         * If the next character is a valid suffix letter, include it in the current Token.
-         */
+        /* 2. Handle a suffix or an illegal following character */
        if (!ctx.isAtEnd()) {
-            char suffix = ctx.peek();
-            if (SUFFIX_CHARS.indexOf(suffix) >= 0) {
-                sb.append(ctx.advance());
+            char next = ctx.peek();
+
+            /* 2-A: valid type suffix, absorb it directly */
+            if (SUFFIX_CHARS.indexOf(next) >= 0) {
+                literal.append(ctx.advance());
            }
+            /* 2-B: adjacent letter that is not a known suffix → throw */
+            else if (Character.isLetter(next)) {
+                throw new LexicalException(
+                        "Unknown numeric suffix '" + next + "'",
+                        line, col
+                );
+            }
+            /* 2-C: whitespace (other than a newline) after the number → throw if a letter follows that whitespace */
+            else if (Character.isWhitespace(next) && next != '\n') {
+                int off = 1;
+                char look;
+                do {
+                    look = ctx.peekAhead(off);
+                    if (look == '\n' || look == '\0') break;
+                    if (!Character.isWhitespace(look)) break;
+                    off++;
+                } while (true);
+
+                if (Character.isLetter(look)) {
+                    throw new LexicalException(
+                            "Whitespace between numeric literal and an alphabetic character is not allowed",
+                            line, col
+                    );
+                }
+            }
+            /* 2-D: the adjacent character is '/' → throw, intended to avoid an infinite loop */
+            else if (next == '/') {
+                throw new LexicalException(
+                        "Unexpected '/' after numeric literal",
+                        line, col
+                );
+            }
+            /* Any other character (operator, separator, etc.) is not consumed here and is left for subsequent scanners */
        }

-        // Build and return the NUMBER_LITERAL Token, whose text looks like "123", "3.14f", etc.
-        return new Token(TokenType.NUMBER_LITERAL, sb.toString(), line, col);
+        /* 3. Return the NUMBER_LITERAL token */
+        return new Token(TokenType.NUMBER_LITERAL, literal.toString(), line, col);
    }
 }