From 295736c9cba714fb5de7d1c3dd31d86e50091cf8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 2 Feb 2023 14:28:32 +0000 Subject: [PATCH] [jsinterp] Improve parsing * support subset `... else if ...` * support `while` * add `RegExp` class * generalise `new` support * limited more debug strings * matching test changes --- test/test_jsinterp.py | 53 +++++++++++++- youtube_dl/jsinterp.py | 156 +++++++++++++++++++++++++++-------------- 2 files changed, 154 insertions(+), 55 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c47def737..b5962356c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -11,8 +11,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math import re -from youtube_dl.compat import compat_re_Pattern - from youtube_dl.jsinterp import JS_Undefined, JSInterpreter @@ -140,15 +138,23 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertTrue(math.isnan(jsi.call_function('x'))) + def test_Date(self): jsi = JSInterpreter(''' function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } ''') self.assertEqual(jsi.call_function('x'), 86000) + jsi = JSInterpreter(''' function x(dt) { return new Date(dt) - 0; } ''') self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + # date format m/d/y + jsi = JSInterpreter(''' + function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } @@ -181,6 +187,15 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x'), 10) """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + + def test_elseif(self): jsi = JSInterpreter(''' function x() { if (0!=0) {return 1} @@ -188,6 +203,16 @@ class TestJSInterpreter(unittest.TestCase): else {return 10} }''') self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + # etc """ def test_for_loop(self): @@ -197,6 +222,13 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 10) + def test_while_loop(self): + # function x() { a=0; while (a<10) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; while (a<10) {a++} return a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + def test_switch(self): jsi = JSInterpreter(''' function x(f) { switch(f){ @@ -415,13 +447,28 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; return a; } ''') - self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) + attrs = set(('findall', 'finditer', 'flags', 'groupindex', + 'groups', 'match', 'pattern', 'scanner', + 'search', 'split', 'sub', 'subn')) + self.assertTrue(set(dir(jsi.call_function('x'))) > attrs) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + jsi = JSInterpreter(r''' + function x() { let a=[/[)\\]/]; return a[0]; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + + """ # fails + jsi = JSInterpreter(r''' + function x() { let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; } + ''') + self.assertEqual(jsi.call_function('x'), 5) + """ + def test_char_code_at(self): jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('x', 0), 116) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9a3b8d7f2..1e7b342ac 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -187,19 +187,6 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 - _RE_FLAGS = { - # special knowledge: Python's re flags are bitmask values, current max 128 - # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags - 'd': 1024, # Generate indices for substring matches - 'g': 2048, # Global search - 'i': re.I, # Case-insensitive search - 'm': re.M, # Multi-line search - 's': re.S, # Allows . to match newline characters - 'u': re.U, # Treat a pattern as a sequence of unicode code points - 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string - } - _OBJ_NAME = '__youtube_dl_jsinterp_obj' OP_CHARS = None @@ -217,9 +204,48 @@ class JSInterpreter(object): msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) + class JS_RegExp(object): + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + def __init__(self, pattern_txt, flags=''): + if isinstance(flags, compat_str): + flags, _ = self.regex_flags(flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern + # First, avoid https://github.com/python/cpython/issues/74534 + self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + for name in dir(self.__self): + # Only these? Obviously __class__, __init__. + # PyPy creates a __weakref__ attribute with value None + # that can't be setattr'd but also can't need to be copied. + if name in ('__class__', '__init__', '__weakref__'): + continue + setattr(self, name, getattr(self.__self, name)) + + @classmethod + def regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + @classmethod def __op_chars(cls): - op_chars = set(';,') + op_chars = set(';,[') for op in cls._all_operators(): for c in op[0]: op_chars.add(c) @@ -231,17 +257,6 @@ class JSInterpreter(object): namespace[name] = obj return name - @classmethod - def _regex_flags(cls, expr): - flags = 0 - if not expr: - return flags, expr - for idx, ch in enumerate(expr): - if ch not in cls._RE_FLAGS: - break - flags |= cls._RE_FLAGS[ch] - return flags, expr[idx + 1:] - @classmethod def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): if not expr: @@ -328,7 +343,7 @@ class JSInterpreter(object): try: return opfunc(left_val, right_val) except Exception as e: - raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e) + raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) def _index(self, obj, idx, allow_undefined=False): if idx == 'length': @@ -338,7 +353,7 @@ class JSInterpreter(object): except Exception as e: if allow_undefined: return JS_Undefined - raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) + raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): try: @@ -352,6 +367,7 @@ class JSInterpreter(object): allow_recursion -= 1 should_return = False + # fails on (eg) if (...) stmt1; else stmt2; sub_statements = list(self._separate(stmt, ';')) or [''] expr = stmt = sub_statements.pop().strip() for sub_stmt in sub_statements: @@ -371,25 +387,30 @@ class JSInterpreter(object): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - flags, outer = self._regex_flags(outer) - inner = re.compile(inner[1:], flags=flags) # , strict=True)) + flags, outer = self.JS_RegExp.regex_flags(outer) + inner = self.JS_RegExp(inner[1:], flags=flags) else: inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: return inner, should_return expr = self._named_object(local_vars, inner) + outer - if expr.startswith('new '): - obj = expr[4:] - if obj.startswith('Date('): - left, right = self._separate_at_paren(obj[4:]) - expr = unified_timestamp( - self.interpret_expression(left, local_vars, allow_recursion), False) + new_kw, _, obj = expr.partition('new ') + if not new_kw: + for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), + ('RegExp', self.JS_RegExp), + ('Error', self.Exception)): + if not obj.startswith(klass + '('): + continue + left, right = self._separate_at_paren(obj[len(klass):]) + argvals = self.interpret_iter(left, local_vars, allow_recursion) + expr = konstr(*argvals) if not expr: - raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) - expr = self._dump(int(expr * 1000), local_vars) + right + raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) + expr = self._dump(expr, local_vars) + right + break else: - raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr) + raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) if expr.startswith('void '): left = self.interpret_expression(expr[5:], local_vars, allow_recursion) @@ -430,24 +451,45 @@ class JSInterpreter(object): (?Ptry)\s*\{| (?Pif)\s*\(| (?Pswitch)\s*\(| - (?Pfor)\s*\( + (?Pfor)\s*\(| + (?Pwhile)\s*\( ''', expr) md = m.groupdict() if m else {} if md.get('if'): cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) - if_expr, expr = self._separate_at_paren(expr.lstrip()) - # TODO: "else if" is not handled + if expr.startswith('{'): + if_expr, expr = self._separate_at_paren(expr) + else: + # may lose ... else ... because of ll.368-374 + if_expr, expr = self._separate_at_paren(expr, delim=';') else_expr = None - m = re.match(r'else\s*{', expr) + m = re.match(r'else\s*(?P\{)?', expr) if m: - else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if m.group('block'): + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + else: + # handle subset ... else if (...) {...} else ... + # TODO: make interpret_statement do this properly, if possible + exprs = list(self._separate(expr[m.end():], delim='}', max_split=2)) + if len(exprs) > 1: + if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]): + else_expr = exprs[0] + '}' + exprs[1] + expr = (exprs[2] + '}') if len(exprs) == 3 else None + else: + else_expr = exprs[0] + exprs.append('') + expr = '}'.join(exprs[1:]) + else: + else_expr = exprs[0] + expr = None + else_expr = else_expr.lstrip() + '}' cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) ret, should_abort = self.interpret_statement( if_expr if cndn else else_expr, local_vars, allow_recursion) if should_abort: return ret, True - if md.get('try'): + elif md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None try: @@ -484,8 +526,8 @@ class JSInterpreter(object): if err: raise err - elif md.get('for'): - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) + elif md.get('for') or md.get('while'): + init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: @@ -496,11 +538,12 @@ class JSInterpreter(object): body = 'switch(%s){%s}' % (switch_val, body) else: body, expr = remaining, '' - start, cndn, increment = self._separate(constructor, ';') - self.interpret_expression(start, local_vars, allow_recursion) - while True: - if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): - break + if md.get('for'): + start, cndn, increment = self._separate(init_or_cond, ';') + self.interpret_expression(start, local_vars, allow_recursion) + else: + cndn, increment = init_or_cond, None + while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: @@ -509,7 +552,8 @@ class JSInterpreter(object): break except JS_Continue: pass - self.interpret_expression(increment, local_vars, allow_recursion) + if increment: + self.interpret_expression(increment, local_vars, allow_recursion) elif md.get('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) @@ -764,6 +808,10 @@ class JSInterpreter(object): if idx >= len(obj): return None return ord(obj[idx]) + elif member == 'replace': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + assertion(len(argvals) == 2, 'takes exactly two arguments') + return re.sub(argvals[0], argvals[1], obj) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) @@ -795,6 +843,10 @@ class JSInterpreter(object): raise self.Exception('Cannot return from an expression', expr) return ret + def interpret_iter(self, list_txt, local_vars, allow_recursion): + for v in self._separate(list_txt): + yield self.interpret_expression(v, local_vars, allow_recursion) + def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {}