From 170b8d424fa4a78724f83cbe4b4807ea25ecf3cb Mon Sep 17 00:00:00 2001 From: Pekka Laiho Date: Tue, 15 Dec 2020 18:28:02 +0700 Subject: [PATCH] add Unicode support to Tokenizer --- src/Reader.php | 6 +++++- src/Tokenizer.php | 13 +++++++++++-- test/TokenizerTest.php | 27 +++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/Reader.php b/src/Reader.php index 72a06b7..1471d3c 100644 --- a/src/Reader.php +++ b/src/Reader.php @@ -92,7 +92,11 @@ class Reader } elseif ($a === 'null') { return null; } elseif (substr($a, 0, 1) === '"') { - // remove quotes around string + // Remove quotes around string. + // + // Byte-wise substr is intentional: it drops exactly one byte from each end, + // and the double quote checked above is a single byte in UTF-8, so multibyte + // characters inside the string stay intact and mb_substr is not needed. return substr($a, 1, -1); } elseif (is_numeric($a)) { if (filter_var($a, FILTER_VALIDATE_INT) !== false) { diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 705a5bf..98d0d36 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -28,8 +28,17 @@ class Tokenizer } }; - for ($i = 0; $i < strlen($a); $i++) { - $c = substr($a, $i, 1); + // With mbstring loaded, step by character (mb_* internal encoding); otherwise fall back to byte-wise strlen/substr + if (extension_loaded('mbstring')) { + $lenfn = 'mb_strlen'; + $subfn = 'mb_substr'; + } else { + $lenfn = 'strlen'; + $subfn = 'substr'; + } + + for ($i = 0; $i < $lenfn($a); $i++) { + $c = $subfn($a, $i, 1); if ($isString) { if ($isEscape) { diff --git a/test/TokenizerTest.php b/test/TokenizerTest.php index af03490..f8e9bbc 100644 --- a/test/TokenizerTest.php +++ b/test/TokenizerTest.php @@ -103,4 +103,31 @@ class TokenizerTest extends TestCase $result = $tokenizer->tokenize($input); $this->assertSame($expected, $result); } + + public function unicodeProvider(): array + { + return [ + ["(∫≈♡)", ['(', '∫≈♡', ')']], + [ + "αβγδ\"εζηθ\"ικλμ;νξοπ\nρς[σ τ]υ{\"φ\":\"χ\"}ψω", + ['αβγδ', "\"εζηθ\"", 'ικλμ', 'ρς', '[', 'σ', 'τ', ']', 'υ', '{', '"φ"', 
'"χ"', '}', 'ψω'] + ], + ["[←↑\"→↓\"⇐⇑⇒⇓]", ['[', '←↑', '"→↓"', '⇐⇑⇒⇓', ']']], + ]; + } + + /** + * Tokenize multibyte (Unicode) input; skipped when the mbstring extension is not loaded. + * @dataProvider unicodeProvider + */ + public function testUnicode(string $input, array $expected) + { + if (!extension_loaded('mbstring')) { + $this->markTestSkipped('The mbstring extension is not available.'); + } + + $tokenizer = new Tokenizer(); + $result = $tokenizer->tokenize($input); + $this->assertSame($expected, $result); + } }