I am trying to generate a syntax tree, for a given string with simple math operators (+, -, *, /, and parenthesis). Given the string "1 + 2 * 3":
It should return an array like this:
["+",
[1,
["*",
[2,3]
]
]
]
I made a function to transform "1 + 2 * 3" in [1,"+",2,"*",3].
The problem is: I have no idea to give priority to certain operations.
My code is:
function isNumber(ch){
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
return true;
break;
default:
return false;
break;
}
}
function generateSyntaxTree(text){
if (typeof text != 'string') return [];
var code = text.replace(new RegExp("[ \t\r\n\v\f]", "gm"), "");
var codeArray = [];
var syntaxTree = [];
// Put it in its on scope
(function(){
var lastPos = 0;
var wasNum = false;
for (var i = 0; i < code.length; i++) {
var cChar = code[i];
if (isNumber(cChar)) {
if (!wasNum) {
if (i != 0) {
codeArray.push(code.slice(lastPos, i));
}
lastPos = i;
wasNum = true;
}
} else {
if (wasNum) {
var n = Number(code.slice(lastPos, i));
if (isNaN(n)) {
throw new Error("Invalid Number");
return [];
} else {
codeArray.push(n);
}
wasNum = false;
lastPos = i;
}
}
}
if (wasNum) {
var n = Number(code.slice(lastPos, code.length));
if (isNaN(n)) {
throw new Error("Invalid Number");
return [];
} else {
codeArray.push(n);
}
}
})();
// At this moment, codeArray = [1,"+",2,"*",3]
return syntaxTree;
}
alert('Returned: ' + generateSyntaxTree("1 + 2 * 3"));
I am trying to generate a syntax tree, for a given string with simple math operators (+, -, *, /, and parenthesis). Given the string "1 + 2 * 3":
It should return an array like this:
["+",
[1,
["*",
[2,3]
]
]
]
I made a function to transform "1 + 2 * 3" in [1,"+",2,"*",3].
The problem is: I have no idea to give priority to certain operations.
My code is:
function isNumber(ch){
switch (ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
return true;
break;
default:
return false;
break;
}
}
function generateSyntaxTree(text){
if (typeof text != 'string') return [];
var code = text.replace(new RegExp("[ \t\r\n\v\f]", "gm"), "");
var codeArray = [];
var syntaxTree = [];
// Put it in its on scope
(function(){
var lastPos = 0;
var wasNum = false;
for (var i = 0; i < code.length; i++) {
var cChar = code[i];
if (isNumber(cChar)) {
if (!wasNum) {
if (i != 0) {
codeArray.push(code.slice(lastPos, i));
}
lastPos = i;
wasNum = true;
}
} else {
if (wasNum) {
var n = Number(code.slice(lastPos, i));
if (isNaN(n)) {
throw new Error("Invalid Number");
return [];
} else {
codeArray.push(n);
}
wasNum = false;
lastPos = i;
}
}
}
if (wasNum) {
var n = Number(code.slice(lastPos, code.length));
if (isNaN(n)) {
throw new Error("Invalid Number");
return [];
} else {
codeArray.push(n);
}
}
})();
// At this moment, codeArray = [1,"+",2,"*",3]
return syntaxTree;
}
alert('Returned: ' + generateSyntaxTree("1 + 2 * 3"));
Share
Improve this question
edited Jul 24, 2019 at 9:07
Glorfindel
22.6k13 gold badges89 silver badges118 bronze badges
asked Apr 24, 2010 at 18:51
user216441user216441
3
- Are you required to do this manually for a class? Normally these things are done with a parser generator, like ANTLR or Bison – Michael Mrozek Commented Apr 24, 2010 at 19:01
- I am doing it from the zero, and yes, I am doing it for a parser that I am creating. – user216441 Commented Apr 24, 2010 at 19:03
- If you check my updated answer you will have a skeleton for building a more advanced parser, based on a working implementation for your example. It is good to understand how parsing works from ground up, since then it is easier to use tools like ANTLR, Flex, Bison, yacc etc. – Ernelli Commented Apr 26, 2010 at 14:47
5 Answers
Reset to default 7The way to do a top down parser, if not using FLEX/BISON or any other similar package is to first write a tokenizer that can parse input and serve tokens.
Basically you need a tokenizer that provides getNextToken, peekNextToken and skipNextToken.
Then you work your way down using this structure.
// parser.js
var input, currToken, pos;
var TOK_OPERATOR = 1;
var TOK_NUMBER = 2;
var TOK_EOF = 3;
function nextToken() {
var c, tok = {};
while(pos < input.length) {
c = input.charAt(pos++);
switch(c) {
case '+':
case '-':
case '*':
case '/':
case '(':
case ')':
tok.op = c;
tok.type = TOK_OPERATOR;
return tok;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
tok.value = c;
tok.type = TOK_NUMBER;
return tok;
default:
throw "Unexpected character: " + c;
}
}
tok.type = TOK_EOF;
return tok;
}
function getNextToken() {
var ret;
if(currToken)
ret = currToken;
else
ret = nextToken();
currToken = undefined;
return ret;
}
function peekNextToken() {
if(!currToken)
currToken = nextToken();
return currToken;
}
function skipNextToken() {
if(!currToken)
currToken = nextToken();
currToken = undefined;
}
function parseString(str) {
input = str;
pos = 0;
return expression();
}
function expression() {
return additiveExpression();
}
function additiveExpression() {
var left = multiplicativeExpression();
var tok = peekNextToken();
while(tok.type == TOK_OPERATOR && (tok.op == '+' || tok.op == '-') ) {
skipNextToken();
var node = {};
node.op = tok.op;
node.left = left;
node.right = multiplicativeExpression();
left = node;
tok = peekNextToken();
}
return left;
}
function multiplicativeExpression() {
var left = primaryExpression();
var tok = peekNextToken();
while(tok.type == TOK_OPERATOR && (tok.op == '*' || tok.op == '/') ) {
skipNextToken();
var node = {};
node.op = tok.op;
node.left = left;
node.right = primaryExpression();
left = node;
tok = peekNextToken();
}
return left;
}
function primaryExpression() {
var tok = peekNextToken();
if(tok.type == TOK_NUMBER) {
skipNextToken();
node = {};
node.value = tok.value;
return node;
}
else
if(tok.type == TOK_OPERATOR && tok.op == '(') {
skipNextToken();
var node = expression(); // The beauty of recursion
tok = getNextToken();
if(tok.type != TOK_OPERATOR || tok.op != ')')
throw "Error ) expected";
return node
}
else
throw "Error " + tok + " not exptected";
}
As you can see, you start by requesting the least privileged operation, which requires the next higher privileged operation as its left and right term and so on. Unary operators has a little different structure. The neat thing is the recursion at the end when a parenthesis is encountered.
Here is a demo page that uses the parser and renders the parse-tree (had the code for it laying around...)
<html>
<head>
<title>tree</title>
<script src="parser.js"></script>
</head>
<body onload="testParser()">
<script>
function createTreeNode(x, y, val, color) {
var node = document.createElement("div");
node.style.position = "absolute";
node.style.left = "" + x;
node.style.top = "" + y;
node.style.border= "solid";
node.style.borderWidth= 1;
node.style.backgroundColor= color;
node.appendChild(document.createTextNode(val));
return node;
};
var yStep = 24;
var width = 800;
var height = 600;
var RED = "#ffc0c0";
var BLUE = "#c0c0ff";
container = document.createElement("div");
container.style.width = width;
container.style.height = height;
container.style.border = "solid";
document.body.appendChild(container);
var svgNS = "http://www.w3.org/2000/svg";
function renderLink(x1, y1, x2, y2)
{
var left = Math.min(x1,x2);
var top = Math.min(y1,y2);
var width = 1+Math.abs(x2-x1);
var height = 1+Math.abs(y2-y1);
var svg = document.createElementNS(svgNS, "svg");
svg.setAttribute("x", left);
svg.setAttribute("y", top);
svg.setAttribute("width", width );
svg.setAttribute("height", height );
var line = document.createElementNS(svgNS,"line");
line.setAttribute("x1", (x1 - left) );
line.setAttribute("x2", (x2 - left) );
line.setAttribute("y1", (y1 - top) );
line.setAttribute("y2", (y2 - top) );
line.setAttribute("stroke-width", "1");
line.setAttribute("stroke", "black");
svg.appendChild(line);
var div = document.createElement("div");
div.style.position = "absolute";
div.style.left = left;
div.style.top = top;
div.style.width = width;
div.style.height = height;
div.appendChild(svg);
container.appendChild(div);
}
function getHeight(dom) {
var h = dom.offsetHeight;
return h;
}
function getWidth(dom) {
var w = dom.offsetWidth;
return w;
}
function renderTree(x, y, node, width, height)
{
if(height < 1.5*yStep)
height = 1.5*yStep;
var val;
if(node.op) {
val = node.op;
color = BLUE;
}
else
if(node.value) {
val = node.value;
color = RED;
}
else
val = "?";
var dom = createTreeNode(x, y, val, color);
container.appendChild(dom);
var w = getWidth(dom);
var h = getHeight(dom);
var nx, ny;
var child;
if(node.left) {
nx = x - width/2;
ny = y+height;
var child = renderTree(nx, ny, node.left, width/2, height/2);
renderLink(x+w/2, y+h, nx+getWidth(child)/2, ny);
}
if(node.right) {
nx = x + width/2;
ny = y+height;
child = renderTree(nx, ny, node.right, width/2, height/2);
renderLink(x+w/2, y+h, nx+getWidth(child)/2, ny);
}
return dom;
}
var root;
function testParser()
{
var str = "1+2*5-5*(9+2)";
var exp = document.createElement("div");
exp.appendChild(document.createTextNode(str));
container.appendChild(exp);
var tree = parseString(str);
renderTree(width/2, 20, tree, width/2, 4*yStep);
}
</script>
</body>
</html>
The thing to do is to use a parser generator like flex or ANTLR (searching at google will find one for your language).
But if you are doing this for fun or to learn how parsers work, look up wikipedia for recursive descent parser.
A simple recursive descent parser can be easily made for simple expressions like this. You can define the grammar as:
<expression> ::= <term> | <term> <add_op> <expression>
<term> ::= <factor> | <factor> <mul_op> <term>
<factor> ::= ( <expression> ) | <number>
<add_op> ::= + | -
<mul_op> ::= * | /
Notice that by making the rule for <term>
contain the rule for <factor>
this grammar makes sure all multiplication/division operations occur lower in the parse tree than any addition/subtraction. This ensures those operations are evaluated first.
Similar to approach in other answers, here is another recursive implementation. It has the following distinctive characteristics:
- It produces the nested array structure that is described in the question.
- It supports signed numbers, so that
-1
(without intermediate space) can be interpreted as a literal, not necessarily as an operator. - It supports unary minus, such as the first minus in this example:
-(-1)
. It would also accept the string- -1
or--1
, ...etc. - It supports decimal numbers with a mandatory digit before the decimal point.
- It uses a regular expression to identify tokens. This will match number literals as one token, and any other, single non white space character.
- Throws an error when there is a syntax validation error, with an indication where in the input string the error occurred.
The supported grammar can be described as:
<literal> ::= [ '-' ] <digit> { <digit> } [ '.' { <digit> } ] ; no white space allowed
<operator2> ::= '*' | '/'
<operator1> ::= '+' | '-'
<factor> ::= '-' <factor> | '(' <expression> ')' | <literal>
<term> ::= [ <term> <operator2> ] <factor>
<expression> ::= [ <expression> <operator1> ] <term>
Precedence is given to match the minus sign as part of a <literal>
when possible.
Interactive snippet
function parse(s) {
// Create a closure for the two variables needed to iterate the input:
const
get = ((tokens, match=tokens.next().value) =>
// get: return current token when it is of the required group, and move forward,
// else if it was mandatory, throw an error, otherwise return undefined
(group, mandatory) => {
if (match?.groups[group] !== undefined)
return [match?.groups[group], match = tokens.next().value][0];
if (mandatory)
throw `${s}\n${' '.repeat(match?.index ?? s.length)}^ Expected ${group}`;
}
)( // Get iterator that matches tokens with named capture groups.
s.matchAll(/(?<number>(?:(?<![\d.)]\s*)-)?\d+(?:\.\d*)?)|(?<open>\()|(?<close>\))|(?<add>\+|(?<unary>-))|(?<mul>[*\/])|(?<end>$)|\S/g)
),
// node: Creates a tree node from given operation
node = (operation, ...values) => [operation, values],
// Grammar rules implementation, using names of regex capture groups, returning nodes
factor = (op=get("unary")) =>
op ? node(op, factor()) : get("open") ? expr("close") : +get("number", 1),
term = (arg=factor(), op=get("mul")) =>
op ? term(node(op, arg, factor())) : arg,
expr = (end, arg=term(), op=get("add")) =>
op ? expr(end, node(op, arg, term())) : (get(end, 1), arg);
return expr("end");
}
// I/O Management
const [input, output] = document.querySelectorAll("input, pre");
(input.oninput = () => {
try {
output.textContent = JSON.stringify(parse(input.value), null, 2)
} catch(err) {
output.textContent = err;
}
})();
input { width: 100%; margin-bottom: 10px; }
Math expression: <input value="1 + 2 * 3">
<pre></pre>
Explanations
tokens
is an iterator over the input based on a regular expression. The regex has a look-behind assertion to ensure that the minus -- if present -- is not a binary operator, and can be included in the match of the numerical literal. The regex defines named groups, so that the code can rely on names and doesn't have to refer to literal characters.
get
uses this iterator to get the next token in a shared variable (match
) and return the previous one. get
takes an argument to specify which named group is expected to have a match. If this is indeed the case, the next token be read, otherwise get
checks whether the match was mandatory. If so, an exception is thrown, otherwise the function returns undefined, so the caller can try another grammar rule.
term
, factor
and expr
implement the grammar rules with the corresponding names. They rely on get
(with argument) to decide which way to go in the grammar rules. These functions all return trees (root nodes).
node
constructs a node in the output tree, bottom up. If nodes in the tree should be something different than arrays, or some reduction should be performed (merging nodes) then this is the function to change.
Have you read up on the theory behind parsers? Wikipedia (as always) has some good articles to read:
- LR parser
- Recursive descent parser
I built a fun little calculator once and had the same problem as you, which I solved by building the syntax tree without keeping the order precedence in mind,firstly. Each node has a precedence value, and when eval'ing non-constants, I'd check the left node: if it has lower precedence, I'd rotate the tree clockwise: bring it into evaluation and evaluate that first, likewise for the right node. then I'd just try to evaluate again. It seemed to work well enough for me.