Abstract Syntax Trees with a Recursive Descent Parser
Greetings, friends! This is my fifteenth post in the math expression parser series. In this tutorial, we'll make adjustments to the recursive descent parser we made in the previous tutorial, so that we can generate abstract syntax tree (AST) representations of math expressions. Let's begin!
Revisiting our Recursive Descent Parser
Let's revisit the recursive descent parser we developed in the previous tutorial. First, we created a tokenizer in a file named Tokenizer.js
with the following contents.
const TokenTypes = {
NUMBER: 'NUMBER',
IDENTIFIER: 'IDENTIFIER',
ADDITION: '+',
SUBTRACTION: '-',
MULTIPLICATION: '*',
DIVISION: '/',
EXPONENTIATION: '^',
PARENTHESIS_LEFT: '(',
PARENTHESIS_RIGHT: ')'
};
const TokenSpec = [
[/^\s+/, null],
[/^(?:\d+(?:\.\d*)?|\.\d+)/, TokenTypes.NUMBER],
[/^[a-z]+/, TokenTypes.IDENTIFIER],
[/^\+/, TokenTypes.ADDITION],
[/^\-/, TokenTypes.SUBTRACTION],
[/^\*/, TokenTypes.MULTIPLICATION],
[/^\//, TokenTypes.DIVISION],
[/^\^/, TokenTypes.EXPONENTIATION],
[/^\(/, TokenTypes.PARENTHESIS_LEFT],
[/^\)/, TokenTypes.PARENTHESIS_RIGHT]
];
class Tokenizer {
constructor(input) {
this.input = input;
this.cursor = 0;
}
hasMoreTokens() {
return this.cursor < this.input.length;
}
match(regex, inputSlice) {
const matched = regex.exec(inputSlice);
if (matched === null) {
return null;
}
this.cursor += matched[0].length;
return matched[0];
}
getNextToken() {
if (!this.hasMoreTokens()) {
return null;
}
const inputSlice = this.input.slice(this.cursor);
for (let [regex, type] of TokenSpec) {
const tokenValue = this.match(regex, inputSlice);
if (tokenValue === null) {
continue;
}
if (type === null) {
return this.getNextToken();
}
return {
type,
value: tokenValue,
};
}
throw new SyntaxError(`Unexpected token: "${inputSlice[0]}"`);
}
}
module.exports = {
TokenTypes,
Tokenizer
}
Then, we used this tokenizer inside our recursive descent parser. We had created a file called Parser.js
and added a Parser
class that contained methods for evaluating math expressions. By the end of the previous tutorial, our parser had the following contents.
const { TokenTypes, Tokenizer } = require('./Tokenizer')
class Parser {
parse(input) {
this.input = input
this.tokenizer = new Tokenizer(input)
this.lookahead = this.tokenizer.getNextToken()
return this.Expression()
}
eat(tokenType) {
const token = this.lookahead
if (token == null) {
throw new SyntaxError(`Unexpected end of input, expected "${tokenType}"`)
}
if (token.type !== tokenType) {
throw new SyntaxError(
`Unexpected token: "${token.value}", expected "${tokenType}"`
)
}
this.lookahead = this.tokenizer.getNextToken()
return token
}
trig(id, value) {
switch (id) {
case 'sin':
return Math.sin(value)
case 'cos':
return Math.cos(value)
case 'tan':
return Math.tan(value)
default:
throw new Error(`Invalid operation: ${id}`)
}
}
BinaryExpression(leftRule, rightRule, operatorType1, operatorType2) {
let left = leftRule()
while (
this.lookahead &&
(this.lookahead.type === operatorType1 ||
this.lookahead.type === operatorType2)
) {
const operator = this.eat(this.lookahead.type).type
switch (operator) {
case TokenTypes.ADDITION:
left = left + rightRule()
break
case TokenTypes.SUBTRACTION:
left = left - rightRule()
break
case TokenTypes.MULTIPLICATION:
left = left * rightRule()
break
case TokenTypes.DIVISION:
left = left / rightRule()
break
case TokenTypes.EXPONENTIATION:
left = left ** rightRule()
break
}
}
return left
}
/**
* Expression
* = Term (("+" / "-") Term)*
*/
Expression() {
return this.BinaryExpression(
() => this.Term(),
() => this.Term(),
TokenTypes.ADDITION,
TokenTypes.SUBTRACTION
)
}
/**
* Term
* = Factor (("*" / "/") Factor)*
*/
Term() {
return this.BinaryExpression(
() => this.Factor(),
() => this.Factor(),
TokenTypes.MULTIPLICATION,
TokenTypes.DIVISION
)
}
/**
* Factor
* = Primary ("^" Factor)*
*/
Factor() {
return this.BinaryExpression(
() => this.Primary(),
() => this.Factor(),
TokenTypes.EXPONENTIATION
)
}
/**
* Primary
* = ParenthesizedExpression
* / UnaryExpression
* / FunctionExpression
* / NUMBER
*/
Primary() {
if (this.lookahead.type === TokenTypes.PARENTHESIS_LEFT) {
return this.ParenthesizedExpression()
}
if (this.lookahead.type === TokenTypes.SUBTRACTION) {
return this.UnaryExpression()
}
if (this.lookahead.type === TokenTypes.IDENTIFIER) {
return this.FunctionExpression()
}
const token = this.eat(TokenTypes.NUMBER)
return Number(token.value)
}
/**
* ParenthesizedExpression
* = "(" Expression ")"
*/
ParenthesizedExpression() {
this.eat(TokenTypes.PARENTHESIS_LEFT)
const expression = this.Expression()
this.eat(TokenTypes.PARENTHESIS_RIGHT)
return expression
}
/**
* UnaryExpression
* = "-" Factor
*/
UnaryExpression() {
this.eat(TokenTypes.SUBTRACTION)
return -this.Factor()
}
/**
* FunctionExpression
* = IDENTIFIER ParenthesizedExpression
*/
FunctionExpression() {
const id = this.eat(TokenTypes.IDENTIFIER).value
return this.trig(id, this.ParenthesizedExpression())
}
}
Below is an example of how we used our recursive descent parser to evaluate math expressions.
const parser = new Parser()
console.log(parser.parse('1 + 2 * 3')) // 7
Building Abstract Syntax Trees
Now, it's time to change the return value of some of the methods in our recursive descent parser such that they return an AST node instead of a number. Our AST will be identical to the ones we've constructed in Part 8, Part 11, and Part 13 of this tutorial series.
Create a new file named Parser-AST.js
with the following contents:
const { TokenTypes, Tokenizer } = require('./Tokenizer')
class Parser {
parse(input) {
this.input = input
this.tokenizer = new Tokenizer(input)
this.lookahead = this.tokenizer.getNextToken()
return this.Expression()
}
eat(tokenType) {
const token = this.lookahead
if (token == null) {
throw new SyntaxError(`Unexpected end of input, expected "${tokenType}"`)
}
if (token.type !== tokenType) {
throw new SyntaxError(
`Unexpected token: "${token.value}", expected "${tokenType}"`
)
}
this.lookahead = this.tokenizer.getNextToken()
return token
}
BinaryExpression(leftRule, rightRule, operatorType1, operatorType2) {
let left = leftRule()
while (
this.lookahead &&
(this.lookahead.type === operatorType1 ||
this.lookahead.type === operatorType2)
) {
const operator = this.eat(this.lookahead.type).value
left = {
type: 'BinaryExpression',
operator,
left,
right: rightRule()
}
}
return left
}
/**
* Expression
* = Term (("+" / "-") Term)*
*/
Expression() {
return this.BinaryExpression(
() => this.Term(),
() => this.Term(),
TokenTypes.ADDITION,
TokenTypes.SUBTRACTION
)
}
/**
* Term
* = Factor (("*" / "/") Factor)*
*/
Term() {
return this.BinaryExpression(
() => this.Factor(),
() => this.Factor(),
TokenTypes.MULTIPLICATION,
TokenTypes.DIVISION
)
}
/**
* Factor
* = Primary ("^" Factor)*
*/
Factor() {
return this.BinaryExpression(
() => this.Primary(),
() => this.Factor(),
TokenTypes.EXPONENTIATION
)
}
/**
* Primary
* = ParenthesizedExpression
* / UnaryExpression
* / FunctionExpression
* / NUMBER
*/
Primary() {
if (this.lookahead.type === TokenTypes.PARENTHESIS_LEFT) {
return this.ParenthesizedExpression()
}
if (this.lookahead.type === TokenTypes.SUBTRACTION) {
return this.UnaryExpression()
}
if (this.lookahead.type === TokenTypes.IDENTIFIER) {
return this.FunctionExpression()
}
const token = this.eat(TokenTypes.NUMBER)
return {
type: 'Number',
value: Number(token.value)
}
}
/**
* ParenthesizedExpression
* = "(" Expression ")"
*/
ParenthesizedExpression() {
this.eat(TokenTypes.PARENTHESIS_LEFT)
const expression = this.Expression()
this.eat(TokenTypes.PARENTHESIS_RIGHT)
return expression
}
/**
* UnaryExpression
* = "-" Factor
*/
UnaryExpression() {
this.eat(TokenTypes.SUBTRACTION)
return {
type: 'UnaryExpression',
value: this.Factor()
}
}
/**
* FunctionExpression
* = IDENTIFIER ParenthesizedExpression
*/
FunctionExpression() {
const token = this.eat(TokenTypes.IDENTIFIER)
return {
type: 'Function',
name: token.value,
value: this.ParenthesizedExpression()
}
}
}
module.exports = { Parser }
Notice all the changes we made to our parser. Since we drastically cleaned up common code in the previous tutorial using the BinaryExpression
method, it's super easy to return a single AST node that works for all mathematical operators.
We no longer need the trig
method because we'll handle computations in an AST evaluator later. We then had to update the UnaryExpression
and FunctionExpression
methods to return AST nodes. Finally, we updated the Primary
method to return a Number
AST node. We actually didn't have to make too many changes!
We can import this parser into a file named ast-example.js
and insert the following contents:
const { Parser } = require('./Parser-AST')
const parser = new Parser()
const ast = parser.parse('1 + 2 * 3')
console.dir(ast, { depth: null }) // log nested objects
We can execute this code using the following command:
node ast-example
We should see the AST for the math expression, 1 + 2 * 3
, displayed in the terminal.
{
type: 'BinaryExpression',
operator: '+',
left: { type: 'Number', value: 1 },
right: {
type: 'BinaryExpression',
operator: '*',
left: { type: 'Number', value: 2 },
right: { type: 'Number', value: 3 }
}
}
Evaluating and Testing the AST
Since our AST is identical to the one we created in Part 8 of this tutorial series, we can reuse our AST evaluator from that tutorial.
Let's create a new file called NodeVisitor.js
and add the following contents.
class NodeVisitor {
visit(node) {
switch (node.type) {
case 'Number':
return this.visitNumber(node);
case 'BinaryExpression':
return this.visitBinaryExpression(node);
case 'UnaryExpression':
return this.visitUnaryExpression(node);
case 'Function':
return this.visitFunction(node);
}
}
visitNumber(node) {
return node.value;
}
visitBinaryExpression(node) {
switch (node.operator) {
case '+':
return this.visit(node.left) + this.visit(node.right);
case '-':
return this.visit(node.left) - this.visit(node.right);
case '*':
return this.visit(node.left) * this.visit(node.right);
case '/':
return this.visit(node.left) / this.visit(node.right);
case '^':
return this.visit(node.left) ** this.visit(node.right);
default:
throw new Error(`Invalid operation: ${node.operator}`);
}
}
visitUnaryExpression(node) {
return -this.visit(node.value);
}
visitFunction(node) {
switch (node.name) {
case 'sin':
return Math.sin(this.visit(node.value));
case 'cos':
return Math.cos(this.visit(node.value));
case 'tan':
return Math.tan(this.visit(node.value));
default:
throw new Error(`Invalid function: ${node.name}`);
}
}
}
module.exports = { NodeVisitor }
We can test our AST evaluator against a variety of math expressions using the tests.js
file we wrote in the previous tutorial.
Inside the tests.js
file, import the Parser
class from the Parser-AST.js
file and import the NodeVisitor
class from the NodeVisitor.js
file.
Then, we'll update the tests to run against an evaluated AST instead of using the parser from the previous tutorial that directly returned a value.
const assert = require('assert').strict
const { Parser } = require('./Parser-AST')
const { NodeVisitor } = require('./NodeVisitor')
const mathTests = [
['1', 1],
[' 2 ', 2],
['1 + 2', 3],
[' 1 + 2 ', 3],
['1 + 2 * 3', 7],
['(1 + 2) * 3', 9],
['5 - 2', 3],
['5 - 2 - 1', 2],
['12 / 2 / 3', 2],
['2 ^ 3 + 1', 9],
['-2 ^ 2', -4],
['(-2) ^ 2', 4],
['-2 ^ 2 + 1', -3],
['cos(0) + 3 * -4 / -2 ^ 2', 4]
]
const parser = new Parser()
const visitor = new NodeVisitor()
let result
for (const [expression, expected] of mathTests) {
try {
const ast = parser.parse(expression)
result = visitor.visit(ast)
assert.equal(result, expected)
} catch (err) {
const lines = '-'.repeat(process.stdout.columns)
console.log(lines)
console.log(`Expression failed: "${expression}"`)
console.log(`Expected result: ${expected}`)
console.log(`Actual result: ${result}`)
console.log(lines)
throw err
}
}
console.log('All tests passed! 🎉')
We can execute this file using the following command:
node tests
If everything goes well, then all the tests should pass, and we should feel proud of our new recursive descent parser that can build ASTs 😁
Conclusion
In this tutorial, we learned how to make a recursive descent parser that can produce an AST instead of evaluating a result. By generating an AST, it gives the user freedom to choose how they would like to evaluate a math expression. We also learned how to run test cases against various math expressions to make sure the AST was built correctly.
Throughout this math expression parser series, we've learned six different ways of building math expression parsers:
- Reverse Polish Notation (RPN) Evaluator
- Shunting Yard Algorithm
- LR Parser Generator
- LL Parser Generator
- PEG Parser Generator
- Recursive Descent Parser
In the next tutorial, we'll learn the seventh and final way to build a math expression parser. There are definitely other ways, but I only promised seven at the beginning of this tutorial series 😅
Anyways, see you there! Happy coding!