the-super-tiny-compiler 源碼解析

一。目標

通過實現編譯器來支持簡單的表達式轉換，把 Lisp 風格的函數調用轉換成 C 風格的，例如：

              LISP-style              C-style
2 + 2         (add 2 2)               add(2, 2)
4 - 2         (subtract 4 2)          subtract(4, 2)
2 + (4 - 2)   (add 2 (subtract 4 2))  add(2, subtract(4, 2))

即輸入 (add 2 2)，要求輸出 add(2, 2)

二。設計

把編譯工作拆分成 3 部分：

解析：把代碼字符串轉成抽象表示
轉換：把抽象表示修改成想要的樣子
代碼生成：把轉換過的抽象表示轉成新代碼字符串

這裡的解析包括詞法分析及語法分析，由詞法分析器把代碼串轉換成一系列詞法單元（token），再由語法分析器生成能夠描述語法結構（包括語法成分及其關係）的中間表示形式（Intermediate Representation）或抽象語法樹（Abstract Syntax Tree）

從結構上看，詞法單元是一組描述獨立語法成分（比如數值，標籤，標點符號，操作符等）的小對象，抽象語法樹（簡稱 AST）是個深層嵌套的對象，易於處理並且攜帶着語法結構信息，例如：

// 代碼字符串
(add 2 (subtract 4 2))
// 詞法單元
[
  { type: 'paren',  value: '('        },
  { type: 'name',   value: 'add'      },
  { type: 'number', value: '2'        },
  { type: 'paren',  value: '('        },
  { type: 'name',   value: 'subtract' },
  { type: 'number', value: '4'        },
  { type: 'number', value: '2'        },
  { type: 'paren',  value: ')'        },
  { type: 'paren',  value: ')'        },
]
// AST
{
  type: 'Program',
  body: [{
    type: 'CallExpression',
    name: 'add',
    params: [{
      type: 'NumberLiteral',
      value: '2',
    }, {
      type: 'CallExpression',
      name: 'subtract',
      params: [{
        type: 'NumberLiteral',
        value: '4',
      }, {
        type: 'NumberLiteral',
        value: '2',
      }]
    }]
  }]
}

轉換階段要修改 AST，也就是遍歷上面生成的樹結構，進行節點級操作（增/刪/改節點）和屬性級操作（增/刪/改屬性）。當然，也可以創建一棵新樹

實現上，可以深度優先遍歷，沒什麼特別的，逐節點處理倒是有點意思：

var visitor = {
  NumberLiteral: {
    enter(node, parent) {},
    exit(node, parent) {}
  },
  CallExpression: {
    enter(node, parent) {},
    exit(node, parent) {}
  },
  //...
};

把訪問節點的操作提出來作為 visitor 層，遍��過程中按詞法單元類型調用對應的 enter/exit 方法即可，算是個小技巧

改完 AST，就到了最後的代碼生成環節，遍歷收集，把 AST 還原成代碼串就好了

三。實現

詞法分析

// 接受代碼字符串 input
function tokenizer(input) {
  // 當前正在處理的字符索引
  let current = 0;
  // 輸出結果集合，存放詞法單元
  let tokens = [];

  // 遍歷字符串，挑出詞法單元
  while (current < input.length) {
    let char = input[current];

    // 匹配左括號、右括號
    if (char === '(') {
      tokens.push({
        type: 'paren',
        value: '(',
      });
      current++;

      continue;
    }
    if (char === ')') {
      tokens.push({
        type: 'paren',
        value: ')',
      });
      current++;

      continue;
    }

    // 跳過空白字符
    let WHITESPACE = /\s/;
    if (WHITESPACE.test(char)) {
      current++;
      continue;
    }

    // 匹配數值
    let NUMBERS = /[0-9]/;
    if (NUMBERS.test(char)) {
      let value = '';

      // 匹配連續數字，作為數值
      while (NUMBERS.test(char)) {
        value += char;
        char = input[++current];
      }
      tokens.push({ type: 'number', value });

      continue;
    }

    // 匹配形如 "abc" 的字符串
    if (char === '"') {
      let value = '';

      // 吃掉左雙引號
      char = input[++current];
      // 收集兩個雙引號之間的所有內容，作為字符串值
      while (char !== '"') {
        value += char;
        char = input[++current];
      }
      // 吃掉右邊雙引號
      char = input[++current];
      tokens.push({ type: 'string', value });

      continue;
    }

    // 匹配函數名，要求只含大小寫字母
    let LETTERS = /[a-z]/i;
    if (LETTERS.test(char)) {
      let value = '';

      // 收集連續字母
      while (LETTERS.test(char)) {
        value += char;
        char = input[++current];
      }
      tokens.push({ type: 'name', value });

      continue;
    }

    // 無法識別的字符，報錯
    throw new TypeError('I dont know what this character is: ' + char);
  }

  return tokens;
}

遍歷代碼字符串，拆出各個詞素，包成詞法單元的形式

語法分析

function parser(tokens) {
  // 當前正在處理的 token 索引
  let current = 0;

  // 遞歸遍歷（因為函數調用允許嵌套），把 token 轉成 AST 節點
  function walk() {
    let token = tokens[current];

    // 數值
    if (token.type === 'number') {
      current++;

      // 生成一個 AST 節點，表示數值字面量
      return {
        type: 'NumberLiteral',
        value: token.value,
      };
    }

    // 字符串
    if (token.type === 'string') {
      current++;

      return {
        type: 'StringLiteral',
        value: token.value,
      };
    }

    // 函數調用
    if (
      token.type === 'paren' &&
      token.value === '('
    ) {
      // 丟掉左括號，取下一個 token 作為函數名
      token = tokens[++current];

      let node = {
        type: 'CallExpression',
        name: token.value,
        params: [],
      };

      // 看下一個 token
      token = tokens[++current];

      // 右括號之前的所有 token 解析完都是參數
      while (
        (token.type !== 'paren') ||
        (token.type === 'paren' && token.value !== ')')
      ) {
        node.params.push(walk());
        token = tokens[current];
      }
      // 吃掉右括號
      current++;

      return node;
    }

    // 無法識別的 token，報錯
    throw new TypeError(token.type);
  }

  // AST 的根節點
  let ast = {
    type: 'Program',
    body: [],
  };
  // 填充 ast.body，允許多條語句，所以放循環裡
  while (current < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
}

為了應對函數調用嵌套的情況，改用遞歸來遍歷 token 序列

遍歷

function traverser(ast, visitor) {
  // 遍歷 AST 節點數組
  function traverseArray(array, parent) {
    array.forEach(child => {
      traverseNode(child, parent);
    });
  }

  function traverseNode(node, parent) {
    // 從 visitor 取出對應的一組方法
    let methods = visitor[node.type];
    // 通知 visitor 我們正在訪問 node
    if (methods && methods.enter) {
      methods.enter(node, parent);
    }

    switch (node.type) {
      // 根節點
      case 'Program':
        traverseArray(node.body, node);
        break;
      // 函數調用
      case 'CallExpression':
        traverseArray(node.params, node);
        break;
      // 數值和字符串，沒孩子，不用處理
      case 'NumberLiteral':
      case 'StringLiteral':
        break;

      // 無法識別的 AST 節點，報錯
      default:
        throw new TypeError(node.type);
    }

    // 通知 visitor 我們要離開 node 了
    if (methods && methods.exit) {
      methods.exit(node, parent);
    }
  }

  // 開始遍歷
  traverseNode(ast, null);
}

遞歸遍歷 AST 結構，遍歷過程中通知 visitor，有點切面的意思

轉換

// 輸入 Lisp AST，輸出 C AST
function transformer(ast) {
  // 新 AST 的根節點
  let newAst = {
    type: 'Program',
    body: [],
  };

  // 偷懶以簡單粗暴的方式維持新舊 AST 的聯繫，方便在遍歷過程中操作新 AST
  ast._context = newAst.body;

  // 創建 vistor，開始遍歷
  traverser(ast, {
    // 數值和字符串，直接原樣插入新 AST
    NumberLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: 'NumberLiteral',
          value: node.value,
        });
      },
    },
    StringLiteral: {
      enter(node, parent) {
        parent._context.push({
          type: 'StringLiteral',
          value: node.value,
        });
      },
    },

    // 函數調用
    CallExpression: {
      enter(node, parent) {
        // 創建不同的 AST 節點
        let expression = {
          type: 'CallExpression',
          callee: {
            type: 'Identifier',
            name: node.name,
          },
          arguments: [],
        };

        // 函數調用可以有孩子，建立節點對應關係，供子節點使用
        node._context = expression.arguments;

        // 頂層函數調用算是語句，包裝成特殊的 AST 節點
        if (parent.type !== 'CallExpression') {
          expression = {
            type: 'ExpressionStatement',
            expression: expression,
          };
        }

        parent._context.push(expression);
      },
    }
  });

  return newAst;
}

由於遍歷（traverser）與修改（visitor）操作分開成了 2 層，不太容易在遍歷舊 AST 節點的同時明確知道對應的新 AST 父節點，這裡採用了簡單粗暴 的方式，直接通過新增 _context 屬性讓舊 AST 節點的父節點持有待操作的新 AST 節點引用，能用，但污染了舊 AST

代碼生成

// 遞歸遍歷新 AST，輸出代碼字符串
function codeGenerator(node) {
  switch (node.type) {
    // 根節點，把 body 裡的所有內容都生成一遍，按行輸出
    case 'Program':
      return node.body.map(codeGenerator).join('\n');

    // 表達式語句，處理其表達式內容，並添上分號
    case 'ExpressionStatement':
      return (
        codeGenerator(node.expression) + ';'
      );

    // 函數調用，添上括號，參數用逗號分隔
    case 'CallExpression':
      return (
        codeGenerator(node.callee) +
        '(' +
        node.arguments.map(codeGenerator).join(', ') +
        ')'
      );

    // 標識符，數值，原樣輸出
    case 'Identifier':
      return node.name;
    case 'NumberLiteral':
      return node.value;

    // 字符串，用雙引號包起來再輸出
    case 'StringLiteral':
      return '"' + node.value + '"';

    // 無法識別的新 AST 節點，報錯
    default:
      throw new TypeError(node.type);
  }
}

再把 AST 轉回代碼字符串，該加分號的加分號，該添括號的添括號……

流程串接

function compiler(input) {
  let tokens = tokenizer(input);
  let ast    = parser(tokens);
  let newAst = transformer(ast);
  let output = codeGenerator(newAst);

  return output;
}

把上面所有環節串起來，構成簡單的編譯器，確實只有 200 行的樣子，試玩一下：

const input  = '(add 2 (subtract 4 2))';
let output = compiler(input);
// add(2, subtract(4, 2));
console.log(output);

四。優化

2 個可優化點：

詞法分析：逐字符匹配比較囉嗦，效率可接受的話，正則表達式更清晰
轉換：生成新 AST 的方式比較髒，污染了舊 AST，可以通過額外的數據結構記錄新舊 AST 的聯繫，避免影響舊 AST

更清晰的詞法分析

比如各詞素對應的正則表達式：

const REGEX = {
  PAREN: /^\(|^\)/,
  WHITESPACE: /^\s+/,
  NUMBERS: /^\d+/,
  STRING: /^"([^"]+)?"/,
  NAME: /^[a-z]+/i
};

while (rest.length > 0) {
  let type, value;
  // 匹配結果，本次匹配消費掉的串長度
  let matched, span;

  // 匹配左括號、右括號
  if (matched = rest.match(REGEX.PAREN)) {
    type = 'paren';
  }
  //... 匹配其它詞素

  value = value || matched[0];
  tokens.push({type, value});
  rest = rest.slice(span || matched[0].length);
}

比較清晰了，如果想要提升擴展性的話，可以把詞素相關的 type, value, span 進一步配置化：

// 詞素相關配置
const lexemes = [
  {
    regex: /^\(|^\)/,
    type: 'paren',
    value: 0,
    span: 0
  }, {
    regex: /^\d+/,
    type: 'number',
    value: 0,
    span: 0
  }, {
    regex: /^"([^"]+)?"/,
    type: 'string',
    value: 1,
    span: 0
  }, {
    regex: /^[a-z]+/i,
    type: 'name',
    value: 0,
    span: 0
  }
];

然後以統一的方式處理這些詞素：

while (rest.length > 0) {
  // 跳過空白字符
  if (matched = rest.match(/^\s+/)) {
    rest = rest.slice(matched[0].length);
    continue;
  }

  lexemes.forEach(({regex, type : t, value: v, span: s}) => {
    if (matched) return;
    if (matched = rest.match(regex)) {
      type = t;
      value = matched[v];
      span = matched[s].length;
    }
  });

  if (matched) {
    tokens.push({type, value});
    rest = rest.slice(span);
  }
  else {
    // 無法識別的字符，報錯
    throw new TypeError('Unexpected character: ' + rest);
  }
}

更乾淨的轉換

生成新樹要做兩件事：

節點映射
創建樹結構

節點映射好辦，該把新節點往哪掛是個問題。visitor 與 transformer 實現上是獨立的兩層，所以需要手動記錄新舊兩棵樹的聯繫，比如上面轉換部分源碼中的：

// 偷懶以簡單粗暴的方式維持新舊 AST 的聯繫，方便在遍歷過程中操作新 AST
ast._context = newAst.body;
// 函數調用可以有孩子，建立節點對應關係，供子節點使用
node._context = expression.arguments;

這樣就知道當前正在訪問的舊節點對應的新節點應該掛到新樹的哪個位置了，例如：

// 舊樹中父節點身上掛着對應新樹節點的孩子數組，把新節點填進去
parent._context.push({
  type: 'NumberLiteral',
  value: node.value,
});

但這樣實現比較髒（污染了傳入的舊樹 ast）。更合理的做法是以非侵入的方式記錄新樹中當前活躍的節點容器，由於函數調用允許嵌套，需要用棧結構來記錄：

// 用額外的數據結構維持新舊 AST 的聯繫
let stack = [newAst.body];
function peak() {
  return stack[stack.length - 1];
}

並在遞歸遍歷舊樹過程中維持這種聯繫：

// 函數調用
CallExpression: {
  enter(node, parent) {
    // 取出活躍的新樹節點容器
    let newASTHost = peak();

    // 創建新樹節點
    let expression = {
      type: 'CallExpression',
      callee: {
        type: 'Identifier',
        name: node.name,
      },
      arguments: [],
    };

    // 函數調用可以有孩子，建立節點對應關係，供子節點使用
    stack.push(expression.arguments);

    // 填充新樹結構
    newASTHost.push(expression);
  },
  leave(node, parent) {
    // 參數收集結束，回到上一層
    stack.pop();
  }
}

這樣就不會再污染舊樹了，並且代價不很高（一個額外的數據結構 stack）

五。源碼

Github 地址：https://github.com/ayqy/the-super-tiny-compiler

P.S. 包括帶詳細註釋的原版，以及優化之後的實現