Support encoding

ShaharNaveh · ShaharNaveh · commit 327052e71c64 · 2025-11-08T12:50:06.000+02:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
@@ -78,6 +78,7 @@ def test_implicit_newline(self):
         self.assertEqual(tokens[-2].type, tokenize.NEWLINE)
         self.assertEqual(tokens[-1].type, tokenize.ENDMARKER)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -271,6 +272,7 @@ def test_float(self):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_underscore_literals(self):
         def number_token(s):
             f = BytesIO(s.encode('utf-8'))
@@ -295,6 +297,7 @@ def number_token(s):
                 continue
             self.assertNotEqual(number_token(lit), lit)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_string(self):
         # String literals
         self.check_tokenize("x = ''; y = \"\"", """\
@@ -675,6 +678,7 @@ def test_function(self):
     NAME       'pass'        (1, 34) (1, 38)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_comparison(self):
         # Comparison
         self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
@@ -713,6 +717,7 @@ def test_comparison(self):
     NAME       'pass'        (1, 84) (1, 88)
     """)
 
+    @unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
     def test_shift(self):
         # Shift
         self.check_tokenize("x = 1 << 1 >> 5", """\
@@ -748,6 +753,7 @@ def test_additive(self):
     OP         ']'           (1, 36) (1, 37)
     """)
 
+    @unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
     def test_multiplicative(self):
         # Multiplicative
         self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
@@ -768,6 +774,7 @@ def test_multiplicative(self):
     NUMBER     '42'          (1, 21) (1, 23)
     """)
 
+    @unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
     def test_unary(self):
         # Unary
         self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
@@ -805,6 +812,7 @@ def test_unary(self):
     NUMBER     '1'           (1, 22) (1, 23)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_selector(self):
         # Selector
         self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
@@ -827,6 +835,7 @@ def test_selector(self):
     OP         ')'           (2, 29) (2, 30)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_method(self):
         # Methods
         self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
@@ -844,6 +853,7 @@ def test_method(self):
     NAME       'pass'        (2, 14) (2, 18)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_tabs(self):
         # Evil tabs
         self.check_tokenize("def f():\n"
@@ -865,6 +875,7 @@ def test_tabs(self):
     DEDENT     ''            (4, 0) (4, 0)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_non_ascii_identifiers(self):
         # Non-ascii identifiers
         self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
@@ -877,6 +888,7 @@ def test_non_ascii_identifiers(self):
     STRING     "'green'"     (2, 7) (2, 14)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_unicode(self):
         # Legacy unicode literals:
         self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
@@ -1164,6 +1176,7 @@ async def bar(): pass
     DEDENT     ''            (7, 0) (7, 0)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_newline_after_parenthesized_block_with_comment(self):
         self.check_tokenize('''\
 [
@@ -1188,6 +1201,7 @@ def test_closing_parenthesis_from_different_line(self):
     NAME       'x'           (1, 3) (1, 4)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_multiline_non_ascii_fstring(self):
         self.check_tokenize("""\
 a = f'''
@@ -1199,6 +1213,7 @@ def test_multiline_non_ascii_fstring(self):
     FSTRING_END "\'\'\'"         (2, 68) (2, 71)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_multiline_non_ascii_fstring_with_expr(self):
         self.check_tokenize("""\
 f'''
@@ -1781,6 +1796,7 @@ def test_bad_input_order(self):
         # raise if previous column in row
         self.assertRaises(ValueError, u.add_whitespace, (2,1))
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_backslash_continuation(self):
         # The problem is that <whitespace>\<newline> leaves no token
         u = tokenize.Untokenizer()
@@ -1826,6 +1842,7 @@ def contains_ambiguous_backslash(source):
 
 class TestRoundtrip(TestCase):
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def check_roundtrip(self, f):
         """
         Test roundtrip for `untokenize`. `f` is an open file or a string.
@@ -2037,6 +2054,7 @@ def test_indentation_semantics_retained(self):
 
 
 class InvalidPythonTests(TestCase):
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_number_followed_by_name(self):
         # See issue #gh-105549
         source = "2sin(x)"
@@ -2053,6 +2071,7 @@ def test_number_followed_by_name(self):
         tokens = list(tokenize.generate_tokens(StringIO(source).readline))
         self.assertEqual(tokens, expected_tokens)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_number_starting_with_zero(self):
         source = "01234"
         expected_tokens = [
@@ -2206,6 +2225,7 @@ def test_float(self):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_string(self):
 
         self.check_tokenize('x = \'\'; y = ""', """\
@@ -2637,6 +2657,7 @@ def test_unary(self):
     NUMBER     '1'           (1, 22) (1, 23)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_selector(self):
 
         self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
@@ -2659,6 +2680,7 @@ def test_selector(self):
     RPAR       ')'           (2, 29) (2, 30)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_method(self):
 
         self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -2676,6 +2698,7 @@ def test_method(self):
     NAME       'pass'        (2, 14) (2, 18)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_tabs(self):
 
         self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -2693,6 +2716,7 @@ def test_tabs(self):
     NAME       'pass'        (2, 14) (2, 18)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_async(self):
 
         self.check_tokenize('async = 1', """\
@@ -2959,6 +2983,7 @@ async def bar(): pass
     DEDENT     ''            (6, -1) (6, -1)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_unicode(self):
 
         self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
@@ -2971,6 +2996,7 @@ def test_unicode(self):
     STRING     "U'green'"    (2, 7) (2, 15)
     """)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_invalid_syntax(self):
         def get_tokens(string):
             the_string = StringIO(string)
@@ -3016,6 +3042,7 @@ def get_tokens(string):
             with self.subTest(case=case):
                 self.assertRaises(tokenize.TokenError, get_tokens, case)
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_max_indent(self):
         MAXINDENT = 100
 
@@ -3038,6 +3065,7 @@ def generate_source(indents):
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
+    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_continuation_lines_indentation(self):
         def get_tokens(string):
             the_string = StringIO(string)
diff --git a/stdlib/src/tokenize.rs b/stdlib/src/tokenize.rs
@@ -6,7 +6,7 @@ mod _tokenize {
         common::lock::PyRwLock,
         vm::{
             Py, PyPayload, PyResult, VirtualMachine,
-            builtins::{PyStr, PyTypeRef},
+            builtins::{PyBytes, PyStr, PyStrRef, PyTypeRef},
             convert::ToPyObject,
             function::ArgCallable,
             protocol::PyIterReturn,
@@ -19,25 +19,42 @@ mod _tokenize {
     use ruff_text_size::{Ranged, TextRange};
     use std::{cmp::Ordering, fmt};
 
+    /// Cpython `__import__("token").OP`
+    const TOKEN_OP: u8 = 55;
+
     #[pyattr]
     #[pyclass(name = "TokenizerIter")]
     #[derive(PyPayload)]
     pub struct PyTokenizerIter {
-        readline: ArgCallable,
+        readline: ArgCallable, // TODO: This should be PyObject
         extra_tokens: bool,
-        encoding: String,
+        encoding: Option<String>,
         state: PyRwLock<PyTokenizerIterState>,
     }
 
     impl PyTokenizerIter {
         fn readline(&self, vm: &VirtualMachine) -> PyResult<String> {
-            // TODO: Downcast to diffrent type based on encoding.
-            Ok(self
-                .readline
-                .invoke((), vm)?
-                .downcast::<PyStr>()
-                .map(|s| s.as_str().to_owned())
-                .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?)
+            // TODO: When `readline` is PyObject,
+            // we need to check if it's callable and raise a type error if it's not.
+            let raw_line = match self.readline.invoke((), vm) {
+                Ok(v) => v,
+                Err(_) => return Ok(String::new()),
+            };
+            Ok(match &self.encoding {
+                Some(encoding) => {
+                    let bytes = raw_line
+                        .downcast::<PyBytes>()
+                        .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?;
+                    vm.state
+                        .codec_registry
+                        .decode_text(bytes.into(), &encoding, None, vm)
+                        .map(|s| s.as_str().to_owned())?
+                }
+                None => raw_line
+                    .downcast::<PyStr>()
+                    .map(|s| s.as_str().to_owned())
+                    .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?,
+            })
         }
     }
 
@@ -67,7 +84,7 @@ mod _tokenize {
             Self {
                 readline,
                 extra_tokens,
-                encoding,
+                encoding: encoding.map(|s| s.as_str().to_owned()),
                 state: PyRwLock::new(PyTokenizerIterState::default()),
             }
             .into_ref_with_type(vm, cls)
@@ -124,7 +141,7 @@ mod _tokenize {
 
             let token_kind = token.kind();
             let token_value = if zelf.extra_tokens && token_kind.is_operator() {
-                55 // token.OP
+                TOKEN_OP
             } else {
                 token_kind_value(token_kind)
             };
@@ -160,8 +177,8 @@ mod _tokenize {
         readline: ArgCallable,
         #[pyarg(named)]
         extra_tokens: bool,
-        #[pyarg(named, default = String::from("utf-8"))]
-        encoding: String,
+        #[pyarg(named, optional)]
+        encoding: Option<PyStrRef>,
     }
 
     #[derive(Clone, Debug)]
@@ -179,15 +196,6 @@ mod _tokenize {
         eof: bool,
     }
 
-    impl Ranged for PyTokenizerIterState {
-        fn range(&self) -> TextRange {
-            match self.prev_token {
-                Some(token) => token.range(),
-                None => TextRange::default(),
-            }
-        }
-    }
-
     impl PyTokenizerIterState {
         fn push_line(&mut self, line: &str) {
             self.source.push_str(line);
@@ -229,6 +237,14 @@ mod _tokenize {
             None
         }
 
+        #[must_use]
+        fn range(&self) -> TextRange {
+            match self.prev_token {
+                Some(token) => token.range(),
+                None => TextRange::default(),
+            }
+        }
+
         #[must_use]
         fn start(&self) -> (usize, usize) {
             let lc = self