🌐 AI搜索 & 代理 主页
Skip to content

Commit 327052e

Browse files
committed
Support encoding
1 parent d8a76a3 commit 327052e

File tree

3 files changed

+82
-23
lines changed

3 files changed

+82
-23
lines changed

Cargo.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_tokenize.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def test_implicit_newline(self):
7878
self.assertEqual(tokens[-2].type, tokenize.NEWLINE)
7979
self.assertEqual(tokens[-1].type, tokenize.ENDMARKER)
8080

81+
@unittest.expectedFailure # TODO: RUSTPYTHON
8182
def test_basic(self):
8283
self.check_tokenize("1 + 1", """\
8384
NUMBER '1' (1, 0) (1, 1)
@@ -271,6 +272,7 @@ def test_float(self):
271272
NUMBER '3.14e159' (1, 4) (1, 12)
272273
""")
273274

275+
@unittest.expectedFailure # TODO: RUSTPYTHON
274276
def test_underscore_literals(self):
275277
def number_token(s):
276278
f = BytesIO(s.encode('utf-8'))
@@ -295,6 +297,7 @@ def number_token(s):
295297
continue
296298
self.assertNotEqual(number_token(lit), lit)
297299

300+
@unittest.expectedFailure # TODO: RUSTPYTHON
298301
def test_string(self):
299302
# String literals
300303
self.check_tokenize("x = ''; y = \"\"", """\
@@ -675,6 +678,7 @@ def test_function(self):
675678
NAME 'pass' (1, 34) (1, 38)
676679
""")
677680

681+
@unittest.expectedFailure # TODO: RUSTPYTHON
678682
def test_comparison(self):
679683
# Comparison
680684
self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
@@ -713,6 +717,7 @@ def test_comparison(self):
713717
NAME 'pass' (1, 84) (1, 88)
714718
""")
715719

720+
@unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
716721
def test_shift(self):
717722
# Shift
718723
self.check_tokenize("x = 1 << 1 >> 5", """\
@@ -748,6 +753,7 @@ def test_additive(self):
748753
OP ']' (1, 36) (1, 37)
749754
""")
750755

756+
@unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
751757
def test_multiplicative(self):
752758
# Multiplicative
753759
self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
@@ -768,6 +774,7 @@ def test_multiplicative(self):
768774
NUMBER '42' (1, 21) (1, 23)
769775
""")
770776

777+
@unittest.skip('TODO: RUSTPYTHON; This fails but the one in GenerateTokensTest passes')
771778
def test_unary(self):
772779
# Unary
773780
self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
@@ -805,6 +812,7 @@ def test_unary(self):
805812
NUMBER '1' (1, 22) (1, 23)
806813
""")
807814

815+
@unittest.expectedFailure # TODO: RUSTPYTHON
808816
def test_selector(self):
809817
# Selector
810818
self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
@@ -827,6 +835,7 @@ def test_selector(self):
827835
OP ')' (2, 29) (2, 30)
828836
""")
829837

838+
@unittest.expectedFailure # TODO: RUSTPYTHON
830839
def test_method(self):
831840
# Methods
832841
self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
@@ -844,6 +853,7 @@ def test_method(self):
844853
NAME 'pass' (2, 14) (2, 18)
845854
""")
846855

856+
@unittest.expectedFailure # TODO: RUSTPYTHON
847857
def test_tabs(self):
848858
# Evil tabs
849859
self.check_tokenize("def f():\n"
@@ -865,6 +875,7 @@ def test_tabs(self):
865875
DEDENT '' (4, 0) (4, 0)
866876
""")
867877

878+
@unittest.expectedFailure # TODO: RUSTPYTHON
868879
def test_non_ascii_identifiers(self):
869880
# Non-ascii identifiers
870881
self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
@@ -877,6 +888,7 @@ def test_non_ascii_identifiers(self):
877888
STRING "'green'" (2, 7) (2, 14)
878889
""")
879890

891+
@unittest.expectedFailure # TODO: RUSTPYTHON
880892
def test_unicode(self):
881893
# Legacy unicode literals:
882894
self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
@@ -1164,6 +1176,7 @@ async def bar(): pass
11641176
DEDENT '' (7, 0) (7, 0)
11651177
""")
11661178

1179+
@unittest.expectedFailure # TODO: RUSTPYTHON
11671180
def test_newline_after_parenthesized_block_with_comment(self):
11681181
self.check_tokenize('''\
11691182
[
@@ -1188,6 +1201,7 @@ def test_closing_parenthesis_from_different_line(self):
11881201
NAME 'x' (1, 3) (1, 4)
11891202
""")
11901203

1204+
@unittest.expectedFailure # TODO: RUSTPYTHON
11911205
def test_multiline_non_ascii_fstring(self):
11921206
self.check_tokenize("""\
11931207
a = f'''
@@ -1199,6 +1213,7 @@ def test_multiline_non_ascii_fstring(self):
11991213
FSTRING_END "\'\'\'" (2, 68) (2, 71)
12001214
""")
12011215

1216+
@unittest.expectedFailure # TODO: RUSTPYTHON
12021217
def test_multiline_non_ascii_fstring_with_expr(self):
12031218
self.check_tokenize("""\
12041219
f'''
@@ -1781,6 +1796,7 @@ def test_bad_input_order(self):
17811796
# raise if previous column in row
17821797
self.assertRaises(ValueError, u.add_whitespace, (2,1))
17831798

1799+
@unittest.expectedFailure # TODO: RUSTPYTHON
17841800
def test_backslash_continuation(self):
17851801
# The problem is that <whitespace>\<newline> leaves no token
17861802
u = tokenize.Untokenizer()
@@ -1826,6 +1842,7 @@ def contains_ambiguous_backslash(source):
18261842

18271843
class TestRoundtrip(TestCase):
18281844

1845+
@unittest.expectedFailure # TODO: RUSTPYTHON
18291846
def check_roundtrip(self, f):
18301847
"""
18311848
Test roundtrip for `untokenize`. `f` is an open file or a string.
@@ -2037,6 +2054,7 @@ def test_indentation_semantics_retained(self):
20372054

20382055

20392056
class InvalidPythonTests(TestCase):
2057+
@unittest.expectedFailure # TODO: RUSTPYTHON
20402058
def test_number_followed_by_name(self):
20412059
# See issue #gh-105549
20422060
source = "2sin(x)"
@@ -2053,6 +2071,7 @@ def test_number_followed_by_name(self):
20532071
tokens = list(tokenize.generate_tokens(StringIO(source).readline))
20542072
self.assertEqual(tokens, expected_tokens)
20552073

2074+
@unittest.expectedFailure # TODO: RUSTPYTHON
20562075
def test_number_starting_with_zero(self):
20572076
source = "01234"
20582077
expected_tokens = [
@@ -2206,6 +2225,7 @@ def test_float(self):
22062225
NUMBER '3.14e159' (1, 4) (1, 12)
22072226
""")
22082227

2228+
@unittest.expectedFailure # TODO: RUSTPYTHON
22092229
def test_string(self):
22102230

22112231
self.check_tokenize('x = \'\'; y = ""', """\
@@ -2637,6 +2657,7 @@ def test_unary(self):
26372657
NUMBER '1' (1, 22) (1, 23)
26382658
""")
26392659

2660+
@unittest.expectedFailure # TODO: RUSTPYTHON
26402661
def test_selector(self):
26412662

26422663
self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
@@ -2659,6 +2680,7 @@ def test_selector(self):
26592680
RPAR ')' (2, 29) (2, 30)
26602681
""")
26612682

2683+
@unittest.expectedFailure # TODO: RUSTPYTHON
26622684
def test_method(self):
26632685

26642686
self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -2676,6 +2698,7 @@ def test_method(self):
26762698
NAME 'pass' (2, 14) (2, 18)
26772699
""")
26782700

2701+
@unittest.expectedFailure # TODO: RUSTPYTHON
26792702
def test_tabs(self):
26802703

26812704
self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -2693,6 +2716,7 @@ def test_tabs(self):
26932716
NAME 'pass' (2, 14) (2, 18)
26942717
""")
26952718

2719+
@unittest.expectedFailure # TODO: RUSTPYTHON
26962720
def test_async(self):
26972721

26982722
self.check_tokenize('async = 1', """\
@@ -2959,6 +2983,7 @@ async def bar(): pass
29592983
DEDENT '' (6, -1) (6, -1)
29602984
""")
29612985

2986+
@unittest.expectedFailure # TODO: RUSTPYTHON
29622987
def test_unicode(self):
29632988

29642989
self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
@@ -2971,6 +2996,7 @@ def test_unicode(self):
29712996
STRING "U'green'" (2, 7) (2, 15)
29722997
""")
29732998

2999+
@unittest.expectedFailure # TODO: RUSTPYTHON
29743000
def test_invalid_syntax(self):
29753001
def get_tokens(string):
29763002
the_string = StringIO(string)
@@ -3016,6 +3042,7 @@ def get_tokens(string):
30163042
with self.subTest(case=case):
30173043
self.assertRaises(tokenize.TokenError, get_tokens, case)
30183044

3045+
@unittest.expectedFailure # TODO: RUSTPYTHON
30193046
def test_max_indent(self):
30203047
MAXINDENT = 100
30213048

@@ -3038,6 +3065,7 @@ def generate_source(indents):
30383065
IndentationError, compile, invalid, "<string>", "exec"
30393066
)
30403067

3068+
@unittest.expectedFailure # TODO: RUSTPYTHON
30413069
def test_continuation_lines_indentation(self):
30423070
def get_tokens(string):
30433071
the_string = StringIO(string)

stdlib/src/tokenize.rs

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mod _tokenize {
66
common::lock::PyRwLock,
77
vm::{
88
Py, PyPayload, PyResult, VirtualMachine,
9-
builtins::{PyStr, PyTypeRef},
9+
builtins::{PyBytes, PyStr, PyStrRef, PyTypeRef},
1010
convert::ToPyObject,
1111
function::ArgCallable,
1212
protocol::PyIterReturn,
@@ -19,25 +19,42 @@ mod _tokenize {
1919
use ruff_text_size::{Ranged, TextRange};
2020
use std::{cmp::Ordering, fmt};
2121

22+
/// Cpython `__import__("token").OP`
23+
const TOKEN_OP: u8 = 55;
24+
2225
#[pyattr]
2326
#[pyclass(name = "TokenizerIter")]
2427
#[derive(PyPayload)]
2528
pub struct PyTokenizerIter {
26-
readline: ArgCallable,
29+
readline: ArgCallable, // TODO: This should be PyObject
2730
extra_tokens: bool,
28-
encoding: String,
31+
encoding: Option<String>,
2932
state: PyRwLock<PyTokenizerIterState>,
3033
}
3134

3235
impl PyTokenizerIter {
3336
fn readline(&self, vm: &VirtualMachine) -> PyResult<String> {
34-
// TODO: Downcast to diffrent type based on encoding.
35-
Ok(self
36-
.readline
37-
.invoke((), vm)?
38-
.downcast::<PyStr>()
39-
.map(|s| s.as_str().to_owned())
40-
.map_err(|_| vm.new_type_error("readline() returned a non-string object"))?)
37+
// TODO: When `readline` is PyObject,
38+
// we need to check if it's callable and raise a type error if it's not.
39+
let raw_line = match self.readline.invoke((), vm) {
40+
Ok(v) => v,
41+
Err(_) => return Ok(String::new()),
42+
};
43+
Ok(match &self.encoding {
44+
Some(encoding) => {
45+
let bytes = raw_line
46+
.downcast::<PyBytes>()
47+
.map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?;
48+
vm.state
49+
.codec_registry
50+
.decode_text(bytes.into(), &encoding, None, vm)
51+
.map(|s| s.as_str().to_owned())?
52+
}
53+
None => raw_line
54+
.downcast::<PyStr>()
55+
.map(|s| s.as_str().to_owned())
56+
.map_err(|_| vm.new_type_error("readline() returned a non-string object"))?,
57+
})
4158
}
4259
}
4360

@@ -67,7 +84,7 @@ mod _tokenize {
6784
Self {
6885
readline,
6986
extra_tokens,
70-
encoding,
87+
encoding: encoding.map(|s| s.as_str().to_owned()),
7188
state: PyRwLock::new(PyTokenizerIterState::default()),
7289
}
7390
.into_ref_with_type(vm, cls)
@@ -124,7 +141,7 @@ mod _tokenize {
124141

125142
let token_kind = token.kind();
126143
let token_value = if zelf.extra_tokens && token_kind.is_operator() {
127-
55 // token.OP
144+
TOKEN_OP
128145
} else {
129146
token_kind_value(token_kind)
130147
};
@@ -160,8 +177,8 @@ mod _tokenize {
160177
readline: ArgCallable,
161178
#[pyarg(named)]
162179
extra_tokens: bool,
163-
#[pyarg(named, default = String::from("utf-8"))]
164-
encoding: String,
180+
#[pyarg(named, optional)]
181+
encoding: Option<PyStrRef>,
165182
}
166183

167184
#[derive(Clone, Debug)]
@@ -179,15 +196,6 @@ mod _tokenize {
179196
eof: bool,
180197
}
181198

182-
impl Ranged for PyTokenizerIterState {
183-
fn range(&self) -> TextRange {
184-
match self.prev_token {
185-
Some(token) => token.range(),
186-
None => TextRange::default(),
187-
}
188-
}
189-
}
190-
191199
impl PyTokenizerIterState {
192200
fn push_line(&mut self, line: &str) {
193201
self.source.push_str(line);
@@ -229,6 +237,14 @@ mod _tokenize {
229237
None
230238
}
231239

240+
#[must_use]
241+
fn range(&self) -> TextRange {
242+
match self.prev_token {
243+
Some(token) => token.range(),
244+
None => TextRange::default(),
245+
}
246+
}
247+
232248
#[must_use]
233249
fn start(&self) -> (usize, usize) {
234250
let lc = self

0 commit comments

Comments
 (0)