add Unicode support to Tokenizer

This commit is contained in:
Pekka Laiho 2020-12-15 18:28:02 +07:00
parent ed18cb2fd5
commit 170b8d424f
3 changed files with 43 additions and 3 deletions

View File

@ -92,7 +92,11 @@ class Reader
} elseif ($a === 'null') { } elseif ($a === 'null') {
return null; return null;
} elseif (substr($a, 0, 1) === '"') { } elseif (substr($a, 0, 1) === '"') {
// remove quotes around string // Remove quotes around string.
//
// This is safe for UTF-8 strings as well: the double quote is a single-byte
// ASCII character, so stripping one byte from each end with substr() cannot
// split a multibyte sequence and mb_substr() is not needed.
return substr($a, 1, -1); return substr($a, 1, -1);
} elseif (is_numeric($a)) { } elseif (is_numeric($a)) {
if (filter_var($a, FILTER_VALIDATE_INT) !== false) { if (filter_var($a, FILTER_VALIDATE_INT) !== false) {

View File

@ -28,8 +28,17 @@ class Tokenizer
} }
}; };
for ($i = 0; $i < strlen($a); $i++) { // Use mbstring extension if available to support Unicode characters
$c = substr($a, $i, 1); if (extension_loaded('mbstring')) {
$lenfn = 'mb_strlen';
$subfn = 'mb_substr';
} else {
$lenfn = 'strlen';
$subfn = 'substr';
}
for ($i = 0; $i < $lenfn($a); $i++) {
$c = $subfn($a, $i, 1);
if ($isString) { if ($isString) {
if ($isEscape) { if ($isEscape) {

View File

@ -103,4 +103,31 @@ class TokenizerTest extends TestCase
$result = $tokenizer->tokenize($input); $result = $tokenizer->tokenize($input);
$this->assertSame($expected, $result); $this->assertSame($expected, $result);
} }
/**
 * Data sets for testUnicode().
 *
 * Each entry pairs a source string containing multibyte characters with
 * the list of tokens the test asserts for it.
 */
public function unicodeProvider(): array
{
    // Mathematical and pictographic symbols inside parentheses.
    $symbols = [
        "(∫≈♡)",
        ['(', '∫≈♡', ')'],
    ];

    // Greek letters mixed with quoted strings and all three bracket types.
    // Note that ';νξοπ' (up to the newline) and ':' do not appear in the
    // expected tokens.
    $greek = [
        "αβγδ\"εζηθ\"ικλμ;νξοπ\nρς[σ τ]υ{\"φ\":\"χ\"}ψω",
        [
            'αβγδ',
            "\"εζηθ\"",
            'ικλμ',
            'ρς',
            '[',
            'σ',
            'τ',
            ']',
            'υ',
            '{',
            '"φ"',
            '"χ"',
            '}',
            'ψω',
        ],
    ];

    // Arrow symbols with a quoted string between two bare symbols.
    $arrows = [
        "[←↑\"→↓\"⇐⇑⇒⇓]",
        ['[', '←↑', '"→↓"', '⇐⇑⇒⇓', ']'],
    ];

    return [$symbols, $greek, $arrows];
}
/**
 * Check that input containing multibyte (Unicode) characters is split
 * into the expected tokens.
 *
 * The test is skipped when the mbstring extension is not loaded.
 *
 * @dataProvider unicodeProvider
 */
public function testUnicode(string $input, array $expected)
{
    // Guard first: without mbstring there is nothing meaningful to assert.
    if (extension_loaded('mbstring') === false) {
        $this->markTestSkipped('The mbstring extension is not available.');
    }

    $this->assertSame($expected, (new Tokenizer())->tokenize($input));
}
} }