2020-05-28 10:10:00 +00:00
|
|
|
<?php
|
2020-12-14 01:49:07 +00:00
|
|
|
/**
|
|
|
|
* MadLisp language
|
|
|
|
* @link http://madlisp.com/
|
|
|
|
* @copyright Copyright (c) 2020 Pekka Laiho
|
|
|
|
*/
|
|
|
|
|
2020-05-28 10:10:00 +00:00
|
|
|
namespace MadLisp;
|
|
|
|
|
|
|
|
/**
 * Splits MadLisp source text into a flat list of string tokens.
 */
class Tokenizer
{
    /**
     * Tokenize the given source text.
     *
     * Tokens produced:
     *  - the delimiters ( ) [ ] { } as single-character tokens,
     *  - the special characters ' ` ~ as single-character tokens, and ~@ as
     *    one token when @ follows a ~ token with no pending text in between,
     *  - string literals, kept with their surrounding double quotes and with
     *    escape sequences already replaced by the characters they denote,
     *  - every other run of characters found between separators.
     *
     * Space, tab, newline, carriage return, vertical tab, NUL and colon only
     * separate tokens and are dropped. A semicolon starts a comment that runs
     * until the next newline or carriage return.
     *
     * @param string $a Source text to tokenize.
     * @return string[] Tokens in source order.
     * @throws MadLispException On an invalid escape sequence, an unexpected
     *                          closing delimiter, an unterminated string or a
     *                          missing closing delimiter.
     */
    public function tokenize(string $a): array
    {
        $tokens = [];
        $current = '';

        $isString = false;
        $isComment = false;
        $isEscape = false;

        // Open-delimiter counters, one slot per delimiter family.
        $parens = [0, 0, 0];
        $parenIndexes = ['(' => 0, ')' => 0, '[' => 1, ']' => 1, '{' => 2, '}' => 2];

        // Character following a backslash inside a string => character it denotes.
        $escapes = [
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            'v' => "\v",
            '0' => "\0",
            '\\' => '\\',
            '"' => '"',
        ];

        // Flush the pending token text, if any, to the token list.
        $addCurrent = static function () use (&$tokens, &$current) {
            if ($current !== '') {
                $tokens[] = $current;
                $current = '';
            }
        };

        // Use mbstring extension if available to support Unicode characters
        if (extension_loaded('mbstring')) {
            $lenfn = 'mb_strlen';
            $subfn = 'mb_substr';
        } else {
            $lenfn = 'strlen';
            $subfn = 'substr';
        }

        // The input never changes, so measure it once instead of on every iteration.
        $length = $lenfn($a);

        for ($i = 0; $i < $length; $i++) {
            $c = $subfn($a, $i, 1);

            if ($isString) {
                if ($isEscape) {
                    // Character right after a backslash
                    if (!isset($escapes[$c])) {
                        throw new MadLispException("invalid escape sequence \\$c");
                    }

                    $current .= $escapes[$c];
                    $isEscape = false;
                } elseif ($c === '\\') {
                    $isEscape = true;
                } else {
                    // Not handling escape sequence
                    $current .= $c;

                    if ($c === '"') {
                        // Unescaped quote ends the string
                        $addCurrent();
                        $isString = false;
                    }
                }
            } elseif ($isComment) {
                // Comments stop at first newline
                if ($c === "\n" || $c === "\r") {
                    $isComment = false;
                }
            } elseif ($c === '"') {
                // Start of string
                $addCurrent();
                $current .= $c;
                $isString = true;
                $isEscape = false;
            } elseif ($c === ';') {
                // Start of comment
                $addCurrent();
                $isComment = true;
            } elseif (
                $c === ' ' || $c === "\t" || $c === "\n" || $c === "\r"
                || $c === "\v" || $c === "\0" || $c === ':'
            ) {
                // Whitespace and colon are ignored
                $addCurrent();
            } elseif ($c === '(' || $c === '[' || $c === '{') {
                // Start of collection
                $addCurrent();
                $tokens[] = $c;
                $parens[$parenIndexes[$c]]++;
            } elseif ($c === ')' || $c === ']' || $c === '}') {
                // End of collection
                if ($parens[$parenIndexes[$c]] === 0) {
                    throw new MadLispException("unexpected closing $c");
                }

                $addCurrent();
                $tokens[] = $c;
                $parens[$parenIndexes[$c]]--;
            } elseif ($c === "'" || $c === '`' || $c === '~') {
                // Other special characters
                $addCurrent();
                $tokens[] = $c;
            } elseif ($c === '@') {
                $last = count($tokens) - 1;

                // Merge into ~@ only when the previous token is ~ and no
                // pending text sits between them; otherwise "~x@" would be
                // split into "~@" and "x".
                if ($current === '' && $last >= 0 && $tokens[$last] === '~') {
                    $tokens[$last] .= $c;
                } else {
                    // Otherwise treat it like normal character
                    $current .= $c;
                }
            } else {
                // All other characters
                $current .= $c;
            }
        }

        // Add last token
        $addCurrent();

        // Check for errors
        if ($isString) {
            throw new MadLispException("unterminated string");
        } elseif ($parens[0] !== 0) {
            throw new MadLispException("missing closing )");
        } elseif ($parens[1] !== 0) {
            throw new MadLispException("missing closing ]");
        } elseif ($parens[2] !== 0) {
            throw new MadLispException("missing closing }");
        }

        return $tokens;
    }
}
|