Abstract Syntax Trees with a Pratt Parser
Greetings, friends! This is my seventeenth post in the math expression parser series. In this tutorial, we'll adjust the Pratt parser we created last time so that we can generate abstract syntax tree (AST) representations of math expressions. Let's get started!
Revisiting our Pratt Parser
Let's revisit the Pratt parser we developed in the previous tutorial. We created a file named Tokenizer.js that contained the same tokenizer we used in the recursive descent parser. It should have the following contents.
const TokenTypes = {
NUMBER: 'NUMBER',
IDENTIFIER: 'IDENTIFIER',
ADDITION: '+',
SUBTRACTION: '-',
MULTIPLICATION: '*',
DIVISION: '/',
EXPONENTIATION: '^',
PARENTHESIS_LEFT: '(',
PARENTHESIS_RIGHT: ')'
};
const TokenSpec = [
[/^\s+/, null],
[/^(?:\d+(?:\.\d*)?|\.\d+)/, TokenTypes.NUMBER],
[/^[a-z]+/, TokenTypes.IDENTIFIER],
[/^\+/, TokenTypes.ADDITION],
[/^\-/, TokenTypes.SUBTRACTION],
[/^\*/, TokenTypes.MULTIPLICATION],
[/^\//, TokenTypes.DIVISION],
[/^\^/, TokenTypes.EXPONENTIATION],
[/^\(/, TokenTypes.PARENTHESIS_LEFT],
[/^\)/, TokenTypes.PARENTHESIS_RIGHT]
];
class Tokenizer {
constructor(input) {
this.input = input;
this.cursor = 0;
}
hasMoreTokens() {
return this.cursor < this.input.length;
}
match(regex, inputSlice) {
const matched = regex.exec(inputSlice);
if (matched === null) {
return null;
}
this.cursor += matched[0].length;
return matched[0];
}
getNextToken() {
if (!this.hasMoreTokens()) {
return null;
}
const inputSlice = this.input.slice(this.cursor);
for (let [regex, type] of TokenSpec) {
const tokenValue = this.match(regex, inputSlice);
if (tokenValue === null) {
continue;
}
if (type === null) {
return this.getNextToken();
}
return {
type,
value: tokenValue,
};
}
throw new SyntaxError(`Unexpected token: "${inputSlice[0]}"`);
}
}
module.exports = {
TokenTypes,
Tokenizer
}
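If you'd like to poke at the tokenizer on its own, a small script like this (my own addition, not part of the original tutorial) shows the token objects it produces. Note that operator tokens use the operator character itself as their type:
const { Tokenizer } = require('./Tokenizer')

const tokenizer = new Tokenizer('1 + 20')

console.log(tokenizer.getNextToken()) // { type: 'NUMBER', value: '1' }
console.log(tokenizer.getNextToken()) // { type: '+', value: '+' }
console.log(tokenizer.getNextToken()) // { type: 'NUMBER', value: '20' }
console.log(tokenizer.getNextToken()) // null (end of input)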
Then, we created a file called Parser.js that used the tokenizer and contained our implementation of a Pratt parser. By the end of the previous tutorial, our parser had the following contents.
const { TokenTypes, Tokenizer } = require('./Tokenizer')
class Parser {
parse(input) {
this.input = input
this.tokenizer = new Tokenizer(input)
this.lookahead = this.tokenizer.getNextToken()
this.operators = {
'^': 5,
unary: 4,
'*': 3,
'/': 3,
'+': 2,
'-': 2
}
return this.Expression()
}
eat(tokenType) {
const token = this.lookahead
if (token == null) {
throw new SyntaxError(`Unexpected end of input, expected "${tokenType}"`)
}
if (token.type !== tokenType) {
throw new SyntaxError(
`Unexpected token: "${token.value}", expected "${tokenType}"`
)
}
this.lookahead = this.tokenizer.getNextToken()
return token
}
getPrecedence(token) {
if (token === 'unary') {
return this.operators.unary
}
if (token?.type) {
return this.operators[token.value]
}
return 0
}
trig(id, value) {
switch (id) {
case 'sin':
return Math.sin(value)
case 'cos':
return Math.cos(value)
case 'tan':
return Math.tan(value)
default:
throw new Error(`Invalid operation: ${id}`)
}
}
/**
* Expression
* = Prefix (Infix)*
*/
Expression(prec = 0) {
let left = this.Prefix()
while (prec < this.getPrecedence(this.lookahead)) {
left = this.Infix(left, this.lookahead?.type)
}
return left
}
/**
* Prefix
* = ParenthesizedExpression
* / UnaryExpression
* / FunctionExpression
* / NUMBER
*/
Prefix() {
if (this.lookahead.type === TokenTypes.PARENTHESIS_LEFT) {
return this.ParenthesizedExpression()
}
if (this.lookahead.type === TokenTypes.SUBTRACTION) {
return this.UnaryExpression()
}
if (this.lookahead.type === TokenTypes.IDENTIFIER) {
return this.FunctionExpression()
}
const token = this.eat(TokenTypes.NUMBER)
return Number(token.value)
}
/**
* Infix
* = ("+" / "-" / "*" / "/" / "^") Expression
*/
Infix(left, operatorType) {
let token = this.eat(operatorType)
let newPrec = this.operators[token.value]
switch (token.type) {
case TokenTypes.ADDITION:
return left + this.Expression(newPrec)
case TokenTypes.SUBTRACTION:
return left - this.Expression(newPrec)
case TokenTypes.MULTIPLICATION:
return left * this.Expression(newPrec)
case TokenTypes.DIVISION:
return left / this.Expression(newPrec)
case TokenTypes.EXPONENTIATION:
return left ** this.Expression(newPrec - 1)
}
}
/**
* ParenthesizedExpression
* = "(" Expression ")"
*/
ParenthesizedExpression() {
this.eat(TokenTypes.PARENTHESIS_LEFT)
const expression = this.Expression()
this.eat(TokenTypes.PARENTHESIS_RIGHT)
return expression
}
/**
* UnaryExpression
* = "-" Expression
*/
UnaryExpression() {
this.eat(TokenTypes.SUBTRACTION)
return -this.Expression(this.getPrecedence('unary'))
}
/**
* FunctionExpression
* = IDENTIFIER ParenthesizedExpression
*/
FunctionExpression() {
const id = this.eat(TokenTypes.IDENTIFIER).value
const expression = this.ParenthesizedExpression()
return this.trig(id, expression)
}
}
Below is an example of how we used our Pratt parser to evaluate math expressions.
const parser = new Parser()
console.log(parser.parse('1 + 2 * 3')) // 7
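Here's one more example (not from the previous tutorial) that confirms exponentiation stays right-associative, thanks to the newPrec - 1 adjustment inside the Infix method:
console.log(parser.parse('2 ^ 3 ^ 2')) // 512, i.e. 2 ^ (3 ^ 2)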
Building Abstract Syntax Trees
In Part 15 of this tutorial series, we created abstract syntax trees (ASTs) using a recursive descent parser. Now, we'll build the same ASTs using the Pratt parser from the previous tutorial.
Create a new file named Parser-AST.js with the following contents:
const { TokenTypes, Tokenizer } = require('./Tokenizer')
class Parser {
parse(input) {
this.input = input
this.tokenizer = new Tokenizer(input)
this.lookahead = this.tokenizer.getNextToken()
this.operators = {
'^': 5,
unary: 4,
'*': 3,
'/': 3,
'+': 2,
'-': 2
}
return this.Expression()
}
eat(tokenType) {
const token = this.lookahead
if (token == null) {
throw new SyntaxError(`Unexpected end of input, expected "${tokenType}"`)
}
if (token.type !== tokenType) {
throw new SyntaxError(
`Unexpected token: "${token.value}", expected "${tokenType}"`
)
}
this.lookahead = this.tokenizer.getNextToken()
return token
}
getPrecedence(token) {
if (token === 'unary') {
return this.operators.unary
}
if (token?.type) {
return this.operators[token.value]
}
return 0
}
/**
* Expression
* = Prefix (Infix)*
*/
Expression(prec = 0) {
let left = this.Prefix()
while (prec < this.getPrecedence(this.lookahead)) {
left = this.Infix(left, this.lookahead?.type)
}
return left
}
/**
* Prefix
* = ParenthesizedExpression
* / UnaryExpression
* / FunctionExpression
* / NUMBER
*/
Prefix() {
if (this.lookahead.type === TokenTypes.PARENTHESIS_LEFT) {
return this.ParenthesizedExpression()
}
if (this.lookahead.type === TokenTypes.SUBTRACTION) {
return this.UnaryExpression()
}
if (this.lookahead.type === TokenTypes.IDENTIFIER) {
return this.FunctionExpression()
}
const token = this.eat(TokenTypes.NUMBER)
return {
type: 'Number',
value: Number(token.value)
}
}
/**
* Infix
* = ("+" / "-" / "*" / "/" / "^") Expression
*/
Infix(left, operatorType) {
let token = this.eat(operatorType)
let newPrec = this.operators[token.value]
return {
type: 'BinaryExpression',
operator: token.value,
left,
right: this.Expression(
token.type === TokenTypes.EXPONENTIATION ? newPrec - 1 : newPrec
)
}
}
/**
* ParenthesizedExpression
* = "(" Expression ")"
*/
ParenthesizedExpression() {
this.eat(TokenTypes.PARENTHESIS_LEFT)
const expression = this.Expression()
this.eat(TokenTypes.PARENTHESIS_RIGHT)
return expression
}
/**
* UnaryExpression
* = "-" Expression
*/
UnaryExpression() {
this.eat(TokenTypes.SUBTRACTION)
return {
type: 'UnaryExpression',
value: this.Expression(this.getPrecedence('unary'))
}
}
/**
* FunctionExpression
* = IDENTIFIER ParenthesizedExpression
*/
FunctionExpression() {
const id = this.eat(TokenTypes.IDENTIFIER).value
const expression = this.ParenthesizedExpression()
return {
type: 'Function',
name: id,
value: expression
}
}
}
module.exports = { Parser }
Notice how we didn't have to make too many changes to our original parser. Instead of returning a single value in most of our methods, we return an AST node. For example, the Infix method was modified to output an AST node with a type property of BinaryExpression.
Infix(left, operatorType) {
let token = this.eat(operatorType)
let newPrec = this.operators[token.value]
return {
type: 'BinaryExpression',
operator: token.value,
left,
right: this.Expression(
token.type === TokenTypes.EXPONENTIATION ? newPrec - 1 : newPrec
)
}
}
The operator property will contain the type of mathematical operation we want to perform. The left and right properties will contain AST nodes that may resolve to other nodes, recursively.
We also no longer need the trig method because we'll handle computations in an AST evaluator later.
We can import this parser into a file named ast-example.js with the following contents:
const { Parser } = require('./Parser-AST')
const parser = new Parser()
const ast = parser.parse('1 + 2 * 3')
console.dir(ast, { depth: null }) // log nested objects
We can execute this code using the following command:
node ast-example
We should see the AST for the math expression, 1 + 2 * 3, displayed in the terminal.
{
type: 'BinaryExpression',
operator: '+',
left: { type: 'Number', value: 1 },
right: {
type: 'BinaryExpression',
operator: '*',
left: { type: 'Number', value: 2 },
right: { type: 'Number', value: 3 }
}
}
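Out of curiosity, we can also parse an expression that uses a unary minus, such as -2 ^ 2, by changing the expression passed to parser.parse in ast-example.js. The output below is what the parser should produce in that case: a UnaryExpression node wrapping the exponentiation, which our evaluator will later turn into -4.
{
  type: 'UnaryExpression',
  value: {
    type: 'BinaryExpression',
    operator: '^',
    left: { type: 'Number', value: 2 },
    right: { type: 'Number', value: 2 }
  }
}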
Evaluating and Testing the AST
Since our AST is identical to the one we created in Part 8 of this tutorial series, we can reuse our AST evaluator from that tutorial.
Let's create a new file called NodeVisitor.js and add the following contents.
class NodeVisitor {
visit(node) {
switch (node.type) {
case 'Number':
return this.visitNumber(node);
case 'BinaryExpression':
return this.visitBinaryExpression(node);
case 'UnaryExpression':
return this.visitUnaryExpression(node);
case 'Function':
return this.visitFunction(node);
}
}
visitNumber(node) {
return node.value;
}
visitBinaryExpression(node) {
switch (node.operator) {
case '+':
return this.visit(node.left) + this.visit(node.right);
case '-':
return this.visit(node.left) - this.visit(node.right);
case '*':
return this.visit(node.left) * this.visit(node.right);
case '/':
return this.visit(node.left) / this.visit(node.right);
case '^':
return this.visit(node.left) ** this.visit(node.right);
default:
throw new Error(`Invalid operation: ${node.operator}`);
}
}
visitUnaryExpression(node) {
return -this.visit(node.value);
}
visitFunction(node) {
switch (node.name) {
case 'sin':
return Math.sin(this.visit(node.value));
case 'cos':
return Math.cos(this.visit(node.value));
case 'tan':
return Math.tan(this.visit(node.value));
default:
throw new Error(`Invalid function: ${node.name}`);
}
}
}
module.exports = { NodeVisitor }
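Before wiring everything into the test file, here's a quick smoke test (my own addition, not one of the tutorial files) that combines the AST parser with the visitor:
const { Parser } = require('./Parser-AST')
const { NodeVisitor } = require('./NodeVisitor')

const parser = new Parser()
const visitor = new NodeVisitor()

// Parse the expression into an AST, then walk the AST to compute a number
console.log(visitor.visit(parser.parse('1 + 2 * 3'))) // 7
console.log(visitor.visit(parser.parse('cos(0) + 1'))) // 2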
We can test our AST evaluator against a variety of math expressions using the same tests.js file we wrote in Part 15.
Inside the tests.js file, import the Parser class from the Parser-AST.js file and the NodeVisitor class from the NodeVisitor.js file.
Then, we'll update the tests to evaluate the AST produced by the new parser instead of relying on the previous parser, which returned a value directly.
const assert = require('assert').strict
const { Parser } = require('./Parser-AST')
const { NodeVisitor } = require('./NodeVisitor')
const mathTests = [
['1', 1],
[' 2 ', 2],
['1 + 2', 3],
[' 1 + 2 ', 3],
['1 + 2 * 3', 7],
['(1 + 2) * 3', 9],
['5 - 2', 3],
['5 - 2 - 1', 2],
['12 / 2 / 3', 2],
['2 ^ 3 + 1', 9],
['-2 ^ 2', -4],
['(-2) ^ 2', 4],
['-2 ^ 2 + 1', -3],
['cos(0) + 3 * -4 / -2 ^ 2', 4]
]
const parser = new Parser()
const visitor = new NodeVisitor()
let result
for (const [expression, expected] of mathTests) {
try {
const ast = parser.parse(expression)
result = visitor.visit(ast)
assert.equal(result, expected)
} catch (err) {
const lines = '-'.repeat(process.stdout.columns)
console.log(lines)
console.log(`Expression failed: "${expression}"`)
console.log(`Expected result: ${expected}`)
console.log(`Actual result: ${result}`)
console.log(lines)
throw err
}
}
console.log('All tests passed! 🎉')
We can execute this file using the following command:
node tests
All the tests should pass, and you should feel proud knowing that you created a parser that generates ASTs using an algorithm found in many popular compilers and interpreters! 🥳
Conclusion
In this tutorial, we learned how to make a Pratt parser that can produce an AST instead of evaluating a result. As always, an AST gives the user freedom to choose how they would like to evaluate a math expression. We also learned how to run test cases against various math expressions to make sure the AST was built correctly.
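To make that last point concrete, here's a small sketch (not part of this series' code) of a different visitor that walks the very same AST but, instead of computing a number, prints the expression in Reverse Polish Notation:
class RpnPrinter {
  visit(node) {
    switch (node.type) {
      case 'Number':
        return String(node.value)
      case 'BinaryExpression':
        // Post-order traversal: operands first, operator last
        return `${this.visit(node.left)} ${this.visit(node.right)} ${node.operator}`
      case 'UnaryExpression':
        // 'neg' is just an invented marker for unary minus
        return `${this.visit(node.value)} neg`
      case 'Function':
        return `${this.visit(node.value)} ${node.name}`
      default:
        throw new Error(`Unknown node type: ${node.type}`)
    }
  }
}

// new RpnPrinter().visit(parser.parse('1 + 2 * 3')) returns '1 2 3 * +'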
If you've been following my tutorial series from the beginning, I would like to say thank you! 💚 I see readers from all across the globe, and it feels nice knowing that so many people from different places have found my website to learn about math expression parsers.
Many thanks as well to everyone who read only this tutorial! 😁
As promised in Part 1 of this tutorial series, I have now taught SEVEN different ways of building math expression parsers:
- Reverse Polish Notation (RPN) Evaluator
- Shunting Yard Algorithm
- LR Parser Generator
- LL Parser Generator
- PEG Parser Generator
- Recursive Descent Parser
- Pratt Parser
Congrats, friend! You finally did it! 🎉🎉🎉
As a bonus for sticking with me, I'd like to present a few more articles in this tutorial series! In the next tutorial, we'll learn how to make a tiny programming language called "Tiny Math Language" or TML for short. TML is also an acronym for "thank me later" 😂
We'll learn how to create and store variables, so we can use them in math expressions! See you there! ⭐