improve tokenizer

This commit is contained in:
Pekka Laiho 2020-05-31 09:56:23 +07:00
parent debda2e602
commit 5869b2f483

View File

@@ -7,8 +7,12 @@ class Tokenizer
{ {
$tokens = []; $tokens = [];
$current = ''; $current = '';
$string = false;
$parens = 0; $isString = false;
$isComment = false;
$parens = [0, 0, 0];
$parenIndexes = ['(' => 0, ')' => 0, '[' => 1, ']' => 1, '{' => 2, '}' => 2];
$addCurrent = function () use (&$tokens, &$current) { $addCurrent = function () use (&$tokens, &$current) {
if ($current !== '') { if ($current !== '') {
@@ -20,39 +24,52 @@ class Tokenizer
for ($i = 0; $i < strlen($a); $i++) { for ($i = 0; $i < strlen($a); $i++) {
$c = substr($a, $i, 1); $c = substr($a, $i, 1);
if ($string) { if ($isString) {
// Inside string, add all characters // Inside string, add all characters
$current .= $c; $current .= $c;
// Stop at " // Stop at double quote
if ($c == '"') { if ($c == '"') {
$addCurrent(); $addCurrent();
$string = false; $isString = false;
}
} elseif ($isComment) {
// Comments stop at first newline
if ($c == "\n" || $c == "\r") {
$isComment = false;
} }
} else { } else {
// Not inside string // Not inside string or comment
if ($c == '"') { if ($c == '"') {
// Start of string // Start of string
$addCurrent(); $addCurrent();
$current .= $c; $current .= $c;
$string = true; $isString = true;
} elseif ($c == ';') {
// Start of comment
$addCurrent();
$isComment = true;
} elseif ($c == ' ' || $c == "\t" || $c == "\n" || $c == "\r") { } elseif ($c == ' ' || $c == "\t" || $c == "\n" || $c == "\r") {
// Whitespace is ignored // Whitespace is ignored
$addCurrent(); $addCurrent();
} elseif ($c == '(') { } elseif ($c == '(' || $c == '[' || $c == '{') {
// Start of list // Start of collection
$addCurrent(); $addCurrent();
$tokens[] = '('; $tokens[] = $c;
$parens++; $parens[$parenIndexes[$c]]++;
} elseif ($c == ')') { } elseif ($c == ')' || $c == ']' || $c == '}') {
// End of list // End of collection
if ($parens == 0) { if ($parens[$parenIndexes[$c]] == 0) {
throw new MadLispException("unexpected closing parenthesis"); throw new MadLispException("unexpected closing $c");
} }
$addCurrent(); $addCurrent();
$tokens[] = ')'; $tokens[] = $c;
$parens--; $parens[$parenIndexes[$c]]--;
} elseif ($c == "'") {
// Other special characters
$addCurrent();
$tokens[] = $c;
} else { } else {
// All other characters // All other characters
$current .= $c; $current .= $c;
@@ -60,13 +77,17 @@ class Tokenizer
} }
} }
// Add last also // Add last token
$addCurrent(); $addCurrent();
// Check for errors // Check for errors
if ($parens != 0) { if ($parens[0] != 0) {
throw new MadLispException("missing closing parenthesis"); throw new MadLispException("missing closing )");
} elseif ($string) { } elseif ($parens[1] != 0) {
throw new MadLispException("missing closing ]");
} elseif ($parens[2] != 0) {
throw new MadLispException("missing closing }");
} elseif ($isString) {
throw new MadLispException("unterminated string"); throw new MadLispException("unterminated string");
} }