rewrite Tokenizer to handle strings better

This commit is contained in:
Pekka Laiho 2020-12-15 09:03:21 +07:00
parent 2a20690b17
commit ea7382caa3
2 changed files with 23 additions and 15 deletions

View File

@@ -92,13 +92,8 @@ class Reader
} elseif ($a === 'null') { } elseif ($a === 'null') {
return null; return null;
} elseif (substr($a, 0, 1) === '"') { } elseif (substr($a, 0, 1) === '"') {
// string, handle special characters // remove quotes around string
$a = substr($a, 1, -1); return substr($a, 1, -1);
$a = str_replace("\\\\", chr(0x7f), $a);
$a = str_replace("\\n", "\n", $a);
$a = str_replace("\\r", "\r", $a);
$a = str_replace("\\\"", "\"", $a);
return str_replace(chr(0x7f), "\\", $a);
} elseif (is_numeric($a)) { } elseif (is_numeric($a)) {
if (filter_var($a, FILTER_VALIDATE_INT) !== false) { if (filter_var($a, FILTER_VALIDATE_INT) !== false) {
return intval($a); return intval($a);

View File

@@ -16,6 +16,7 @@ class Tokenizer
$isString = false; $isString = false;
$isComment = false; $isComment = false;
$isEscape = false;
$parens = [0, 0, 0]; $parens = [0, 0, 0];
$parenIndexes = ['(' => 0, ')' => 0, '[' => 1, ']' => 1, '{' => 2, '}' => 2]; $parenIndexes = ['(' => 0, ')' => 0, '[' => 1, ']' => 1, '{' => 2, '}' => 2];
@@ -31,13 +32,25 @@ class Tokenizer
$c = substr($a, $i, 1); $c = substr($a, $i, 1);
if ($isString) { if ($isString) {
// Inside string, add all characters if ($isEscape) {
$current .= $c; if ($c == 'n') {
$current .= "\n";
// Stop at first double quote } elseif ($c == 'r') {
if ($c == '"') { $current .= "\r";
// If previous character is not a backslash } elseif ($c == 't') {
if (strlen($current) < 2 || substr($current, -2, 1) != "\\") { $current .= "\t";
} elseif ($c == "\\" || $c == '"') {
$current .= $c;
} else {
throw new MadLispException("invalid escape sequence \\$c");
}
$isEscape = false;
} elseif ($c == "\\") {
$isEscape = true;
} else {
// Not handling escape sequence
$current .= $c;
if ($c == '"') {
$addCurrent(); $addCurrent();
$isString = false; $isString = false;
} }
@@ -49,12 +62,12 @@ class Tokenizer
} }
} else { } else {
// Not inside string or comment // Not inside string or comment
if ($c == '"') { if ($c == '"') {
// Start of string // Start of string
$addCurrent(); $addCurrent();
$current .= $c; $current .= $c;
$isString = true; $isString = true;
$isEscape = false;
} elseif ($c == ';') { } elseif ($c == ';') {
// Start of comment // Start of comment
$addCurrent(); $addCurrent();