mirror of
https://github.com/peklaiho/madlisp.git
synced 2024-11-22 13:24:46 +00:00
add Unicode support to Tokenizer
This commit is contained in:
parent
ed18cb2fd5
commit
170b8d424f
@ -92,7 +92,11 @@ class Reader
|
|||||||
} elseif ($a === 'null') {
|
} elseif ($a === 'null') {
|
||||||
return null;
|
return null;
|
||||||
} elseif (substr($a, 0, 1) === '"') {
|
} elseif (substr($a, 0, 1) === '"') {
|
||||||
// remove quotes around string
|
// Remove quotes around string.
|
||||||
|
//
|
||||||
|
// This also works for UTF-8 strings: the double quote is a single
|
||||||
|
// ASCII byte, so substr removes exactly one byte from each end and
|
||||||
|
// never splits a multi-byte character; mb_substr is not needed.
|
||||||
return substr($a, 1, -1);
|
return substr($a, 1, -1);
|
||||||
} elseif (is_numeric($a)) {
|
} elseif (is_numeric($a)) {
|
||||||
if (filter_var($a, FILTER_VALIDATE_INT) !== false) {
|
if (filter_var($a, FILTER_VALIDATE_INT) !== false) {
|
||||||
|
@ -28,8 +28,17 @@ class Tokenizer
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for ($i = 0; $i < strlen($a); $i++) {
|
// Use mbstring extension if available to support Unicode characters
|
||||||
$c = substr($a, $i, 1);
|
if (extension_loaded('mbstring')) {
|
||||||
|
$lenfn = 'mb_strlen';
|
||||||
|
$subfn = 'mb_substr';
|
||||||
|
} else {
|
||||||
|
$lenfn = 'strlen';
|
||||||
|
$subfn = 'substr';
|
||||||
|
}
|
||||||
|
|
||||||
|
for ($i = 0; $i < $lenfn($a); $i++) {
|
||||||
|
$c = $subfn($a, $i, 1);
|
||||||
|
|
||||||
if ($isString) {
|
if ($isString) {
|
||||||
if ($isEscape) {
|
if ($isEscape) {
|
||||||
|
@ -103,4 +103,31 @@ class TokenizerTest extends TestCase
|
|||||||
$result = $tokenizer->tokenize($input);
|
$result = $tokenizer->tokenize($input);
|
||||||
$this->assertSame($expected, $result);
|
$this->assertSame($expected, $result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function unicodeProvider(): array
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
["(∫≈♡)", ['(', '∫≈♡', ')']],
|
||||||
|
[
|
||||||
|
"αβγδ\"εζηθ\"ικλμ;νξοπ\nρς[σ τ]υ{\"φ\":\"χ\"}ψω",
|
||||||
|
['αβγδ', "\"εζηθ\"", 'ικλμ', 'ρς', '[', 'σ', 'τ', ']', 'υ', '{', '"φ"', '"χ"', '}', 'ψω']
|
||||||
|
],
|
||||||
|
["[←↑\"→↓\"⇐⇑⇒⇓]", ['[', '←↑', '"→↓"', '⇐⇑⇒⇓', ']']],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test Unicode inputs.
|
||||||
|
* @dataProvider unicodeProvider
|
||||||
|
*/
|
||||||
|
public function testUnicode(string $input, array $expected)
|
||||||
|
{
|
||||||
|
if (!extension_loaded('mbstring')) {
|
||||||
|
$this->markTestSkipped('The mbstring extension is not available.');
|
||||||
|
}
|
||||||
|
|
||||||
|
$tokenizer = new Tokenizer();
|
||||||
|
$result = $tokenizer->tokenize($input);
|
||||||
|
$this->assertSame($expected, $result);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user