diff --git a/Cargo.lock b/Cargo.lock index 9903fd1933..7a750e5edc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,7 +191,7 @@ version = "0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cexpr", "clang-sys", "itertools 0.13.0", @@ -213,9 +213,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.4" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "blake2" @@ -237,9 +237,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", "regex-automata", @@ -296,9 +296,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.41" +version = "1.2.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +checksum = "739eb0f94557554b3ca9a86d2d37bebd49c5e6d0c1d2bda35ba5bdac830befc2" dependencies = [ "find-msvc-tools", "shlex", @@ -378,18 +378,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.49" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.49" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstyle", "clap_lex", @@ -796,9 +796,9 @@ dependencies = [ [[package]] name = "dns-lookup" -version = "3.0.0" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "853d5bcf0b73bd5e6d945b976288621825c7166e9f06c5a035ae1aaf42d1b64f" +checksum = "6e39034cee21a2f5bbb66ba0e3689819c4bb5d00382a282006e802a7ffa6c41d" dependencies = [ "cfg-if", "libc", @@ -948,9 +948,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "libz-rs-sys", @@ -990,9 +990,9 @@ dependencies = [ [[package]] name = "get-size-derive2" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3814abc7da8ab18d2fd820f5b540b5e39b6af0a32de1bdd7c47576693074843" +checksum = "46b134aa084df7c3a513a1035c52f623e4b3065dfaf3d905a4f28a2e79b5bb3f" dependencies = [ "attribute-derive", "quote", @@ -1001,9 +1001,9 @@ dependencies = [ [[package]] name = "get-size2" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5dfe2cec5b5ce8fb94dcdb16a1708baa4d0609cc3ce305ca5d3f6f2ffb59baed" +checksum = "c0d51c9f2e956a517619ad9e7eaebc7a573f9c49b38152e12eade750f89156f9" dependencies = [ "compact_str", "get-size-derive2", @@ -1124,11 +1124,11 @@ checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" [[package]] name = "home" -version = "0.5.11" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1167,9 +1167,12 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] [[package]] name = "insta" @@ -1202,9 +1205,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -1373,7 +1376,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "libc", ] @@ -1614,7 +1617,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", "libc", @@ -1627,7 +1630,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", "libc", @@ -1683,9 +1686,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a" +checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" dependencies = [ "num_enum_derive", "rustversion", @@ -1693,9 +1696,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d" +checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" dependencies = [ "proc-macro2", "quote", @@ -1710,9 +1713,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ -1726,7 +1729,7 @@ version = "0.10.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "foreign-types", "libc", @@ -1754,9 +1757,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-src" -version = "300.5.3+3.5.4" +version = "300.5.4+3.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6bad8cd0233b63971e232cc9c5e83039375b8586d2312f31fda85db8f888c2" +checksum = "a507b3792995dae9b0df8a1c1e3771e8418b7c2d9f0baeba32e6fe8b06c7cb72" dependencies = [ "cc", ] @@ -1963,9 +1966,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] @@ -2187,7 +2190,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", ] [[package]] @@ -2283,7 +2286,7 @@ version = "0.0.0" source = "git+https://github.com/astral-sh/ruff.git?tag=0.14.1#2bffef59665ce7d2630dfd72ee99846663660db8" dependencies = [ "aho-corasick", - "bitflags 2.9.4", + "bitflags 2.10.0", "compact_str", "get-size2", "is-macro", @@ -2301,7 +2304,7 @@ name = "ruff_python_parser" version = "0.0.0" source = "git+https://github.com/astral-sh/ruff.git?tag=0.14.1#2bffef59665ce7d2630dfd72ee99846663660db8" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "bstr", "compact_str", "get-size2", @@ -2356,7 +2359,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "errno", "libc", "linux-raw-sys", @@ -2391,7 +2394,7 @@ name = "rustpython-codegen" version = "0.4.0" dependencies = [ "ahash", - "bitflags 2.9.4", + "bitflags 2.10.0", "indexmap", "insta", "itertools 0.14.0", @@ -2415,7 +2418,7 @@ name = "rustpython-common" version = "0.4.0" dependencies = [ "ascii", - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "getrandom 0.3.4", "itertools 0.14.0", @@ -2454,7 +2457,7 @@ dependencies = [ name = "rustpython-compiler-core" version = "0.4.0" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "itertools 0.14.0", "lz4_flex", "malachite-bigint", @@ -2536,7 +2539,7 @@ dependencies = [ name = "rustpython-sre_engine" version = "0.4.0" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "criterion", "num_enum", "optional", @@ -2590,6 +2593,10 @@ dependencies = [ "phf", "pymath", "rand_core 0.9.3", + "ruff_python_ast", + "ruff_python_parser", + "ruff_source_file", + "ruff_text_size", "rustix", "rustpython-common", "rustpython-derive", @@ -2626,7 +2633,7 @@ version = "0.4.0" dependencies = [ "ahash", "ascii", - "bitflags 2.9.4", + "bitflags 2.10.0", "bstr", "caseless", "cfg-if", @@ -2737,7 +2744,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "clipboard-win", "fd-lock", @@ -2979,9 +2986,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.107" 
+version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -3005,7 +3012,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "core-foundation", "system-configuration-sys", ] @@ -3337,9 +3344,9 @@ checksum = "061dbb8cc7f108532b6087a0065eff575e892a4bcb503dc57323a197457cc202" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "unicode-normalization" diff --git a/Cargo.toml b/Cargo.toml index 3cdc471dc3..7a958922a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -158,10 +158,11 @@ rustpython-sre_engine = { path = "vm/sre_engine", version = "0.4.0" } rustpython-wtf8 = { path = "wtf8", version = "0.4.0" } rustpython-doc = { git = "https://github.com/RustPython/__doc__", tag = "0.3.0", version = "0.3.0" } -ruff_python_parser = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } ruff_python_ast = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } -ruff_text_size = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } +ruff_python_parser = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } +ruff_python_trivia = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } ruff_source_file = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } +ruff_text_size = { git = "https://github.com/astral-sh/ruff.git", tag = "0.14.1" } ahash = "0.8.12" ascii = "1.1" diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 61929e537f..4d15f8f1d5 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -2457,9 +2457,9 @@ def tarfilecmd_failure(self, *args): return script_helper.assert_python_failure('-m', 'tarfile', *args) def make_simple_tarfile(self, tar_name): - files = [support.findfile('tokenize_tests.txt'), + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata'), support.findfile('tokenize_tests-no-coding-cookie-' - 'and-utf8-bom-sig-only.txt')] + 'and-utf8-bom-sig-only.txt', subdir='tokenizedata')] self.addCleanup(os_helper.unlink, tar_name) with tarfile.open(tar_name, 'w') as tf: for tardata in files: @@ -2542,9 +2542,9 @@ def test_list_command_invalid_file(self): self.assertEqual(rc, 1) def test_create_command(self): - files = [support.findfile('tokenize_tests.txt'), + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata'), support.findfile('tokenize_tests-no-coding-cookie-' - 'and-utf8-bom-sig-only.txt')] + 'and-utf8-bom-sig-only.txt', subdir='tokenizedata')] for opt in '-c', '--create': try: out = self.tarfilecmd(opt, tmpname, *files) @@ -2555,9 +2555,9 @@ def test_create_command(self): os_helper.unlink(tmpname) def test_create_command_verbose(self): - files = [support.findfile('tokenize_tests.txt'), + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata'), support.findfile('tokenize_tests-no-coding-cookie-' - 'and-utf8-bom-sig-only.txt')] + 'and-utf8-bom-sig-only.txt', subdir='tokenizedata')] for opt 
in '-v', '--verbose': try: out = self.tarfilecmd(opt, '-c', tmpname, *files, @@ -2569,7 +2569,7 @@ def test_create_command_verbose(self): os_helper.unlink(tmpname) def test_create_command_dotless_filename(self): - files = [support.findfile('tokenize_tests.txt')] + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata')] try: out = self.tarfilecmd('-c', dotlessname, *files) self.assertEqual(out, b'') @@ -2580,7 +2580,7 @@ def test_create_command_dotless_filename(self): def test_create_command_dot_started_filename(self): tar_name = os.path.join(TEMPDIR, ".testtar") - files = [support.findfile('tokenize_tests.txt')] + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata')] try: out = self.tarfilecmd('-c', tar_name, *files) self.assertEqual(out, b'') @@ -2590,9 +2590,9 @@ def test_create_command_dot_started_filename(self): os_helper.unlink(tar_name) def test_create_command_compressed(self): - files = [support.findfile('tokenize_tests.txt'), + files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata'), support.findfile('tokenize_tests-no-coding-cookie-' - 'and-utf8-bom-sig-only.txt')] + 'and-utf8-bom-sig-only.txt', subdir='tokenizedata')] for filetype in (GzipTest, Bz2Test, LzmaTest): if not filetype.open: continue diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 44ef4e2416..de81a1bfa7 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,17 +1,18 @@ -from test import support -from test.support import os_helper -from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, - STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, - open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE) -from io import BytesIO, StringIO +import os +import re +import token +import tokenize import unittest +from io import BytesIO, StringIO from textwrap import dedent from unittest import TestCase, mock -from test.test_grammar import (VALID_UNDERSCORE_LITERALS, - INVALID_UNDERSCORE_LITERALS) -import os -import token +from test import support +from test.support import os_helper +from test.support.script_helper import run_test_script, make_script, run_python_until_end +from test.support.numbers import ( + VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS, +) # Converts a source string into a list of textual representation @@ -24,12 +25,12 @@ def stringify_tokens_from_source(token_generator, source_string): missing_trailing_nl = source_string[-1] not in '\r\n' for type, token, start, end, line in token_generator: - if type == ENDMARKER: + if type == tokenize.ENDMARKER: break # Ignore the new line on the last line if the input lacks one - if missing_trailing_nl and type == NEWLINE and end[0] == num_lines: + if missing_trailing_nl and type == tokenize.NEWLINE and end[0] == num_lines: continue - type = tok_name[type] + type = tokenize.tok_name[type] result.append(f" {type:10} {token!r:13} {start} {end}") return result @@ -45,19 +46,39 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize(f.readline), s) + result = stringify_tokens_from_source(tokenize.tokenize(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) + def test_invalid_readline(self): + def gen(): + yield "sdfosdg" + yield "sdfosdg" + with self.assertRaises(TypeError): + list(tokenize.tokenize(gen().__next__)) + + def gen(): + yield b"sdfosdg" + yield b"sdfosdg" + with self.assertRaises(TypeError): + list(tokenize.generate_tokens(gen().__next__)) + + def gen(): + yield "sdfosdg" + 1/0 + with self.assertRaises(ZeroDivisionError): + list(tokenize.generate_tokens(gen().__next__)) + def test_implicit_newline(self): # Make sure that the tokenizer puts in an implicit NEWLINE # when the input lacks a trailing new line. f = BytesIO("x".encode('utf-8')) - tokens = list(tokenize(f.readline)) - self.assertEqual(tokens[-2].type, NEWLINE) - self.assertEqual(tokens[-1].type, ENDMARKER) + tokens = list(tokenize.tokenize(f.readline)) + self.assertEqual(tokens[-2].type, tokenize.NEWLINE) + self.assertEqual(tokens[-1].type, tokenize.ENDMARKER) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_basic(self): self.check_tokenize("1 + 1", """\ NUMBER '1' (1, 0) (1, 1) @@ -83,6 +104,32 @@ def test_basic(self): NEWLINE '\\n' (4, 26) (4, 27) DEDENT '' (5, 0) (5, 0) """) + + self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\ + NAME 'if' (1, 0) (1, 2) + NAME 'True' (1, 3) (1, 7) + OP ':' (1, 7) (1, 8) + NEWLINE '\\r\\n' (1, 8) (1, 10) + COMMENT '# NL' (2, 4) (2, 8) + NL '\\r\\n' (2, 8) (2, 10) + INDENT ' ' (3, 0) (3, 4) + NAME 'foo' (3, 4) (3, 7) + OP '=' (3, 7) (3, 8) + STRING "\'bar\'" (3, 8) (3, 13) + NEWLINE '\\r\\n' (3, 13) (3, 15) + NL '\\r\\n' (4, 0) (4, 2) + DEDENT '' (5, 0) (5, 0) + """) + + self.check_tokenize("x = 1 + \\\r\n1\r\n", """\ + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + OP '+' (1, 6) (1, 7) + NUMBER '1' (2, 0) (2, 1) + NEWLINE '\\r\\n' (2, 1) (2, 3) + """) + indent_error_file = b"""\ def k(x): x += 2 @@ -91,9 +138,18 @@ def k(x): readline = BytesIO(indent_error_file).readline with self.assertRaisesRegex(IndentationError, "unindent does not match any " - "outer indentation level"): - for tok in tokenize(readline): + "outer indentation level") as e: + for tok in tokenize.tokenize(readline): pass + self.assertEqual(e.exception.lineno, 3) + self.assertEqual(e.exception.filename, '') + self.assertEqual(e.exception.end_lineno, None) + self.assertEqual(e.exception.end_offset, None) + self.assertEqual( + e.exception.msg, + 'unindent does not match any outer indentation level') + self.assertEqual(e.exception.offset, 9) + self.assertEqual(e.exception.text, ' x += 5') def test_int(self): # Ordinary integers and binary operators @@ -177,7 +233,7 @@ def test_long(self): """) def test_float(self): - # Floating point numbers + # Floating-point numbers self.check_tokenize("x = 3.14159", """\ NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) @@ -219,8 +275,8 @@ def test_float(self): def test_underscore_literals(self): def number_token(s): f = BytesIO(s.encode('utf-8')) - for toktype, token, start, end, line in tokenize(f.readline): - if toktype == NUMBER: + for toktype, token, start, end, line in tokenize.tokenize(f.readline): + if toktype == tokenize.NUMBER: return token return 'invalid token' for lit in VALID_UNDERSCORE_LITERALS: @@ -228,9 +284,19 @@ def number_token(s): # this won't work with compound complex inputs continue 
self.assertEqual(number_token(lit), lit) + # Valid cases with extra underscores in the tokenize module + # See gh-105549 for context + extra_valid_cases = {"0_7", "09_99"} for lit in INVALID_UNDERSCORE_LITERALS: + if lit in extra_valid_cases: + continue + try: + number_token(lit) + except tokenize.TokenError: + continue self.assertNotEqual(number_token(lit), lit) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_string(self): # String literals self.check_tokenize("x = ''; y = \"\"", """\ @@ -380,21 +446,175 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', """\ - STRING 'f"abc"' (1, 0) (1, 6) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END '"' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - STRING 'fR"a{b}c"' (1, 0) (1, 9) + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + OP '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + OP '}' (1, 6) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END '"' (1, 8) (1, 9) + """) + self.check_tokenize('fR"a{{{b!r}}}c"', """\ + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a{' (1, 3) (1, 5) + OP '{' (1, 6) (1, 7) + NAME 'b' (1, 7) (1, 8) + OP '!' (1, 8) (1, 9) + NAME 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 11) + FSTRING_MIDDLE '}' (1, 11) (1, 12) + FSTRING_MIDDLE 'c' (1, 13) (1, 14) + FSTRING_END '"' (1, 14) (1, 15) + """) + self.check_tokenize('f"{{{1+1}}}"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 3) + OP '{' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + OP '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + OP '}' (1, 8) (1, 9) + FSTRING_MIDDLE '}' (1, 9) (1, 10) + FSTRING_END '"' (1, 11) (1, 12) + """) + self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + OP '{' (1, 4) (1, 5) + FSTRING_START "f'''" (1, 5) (1, 9) + OP '{' (1, 9) (1, 10) + FSTRING_START "f'" (1, 10) (1, 12) + OP '{' (1, 12) (1, 13) + FSTRING_START 'f"' (1, 13) (1, 15) + OP '{' (1, 15) (1, 16) + NUMBER '1' (1, 16) (1, 17) + OP '+' (1, 17) (1, 18) + NUMBER '1' (1, 18) (1, 19) + OP '}' (1, 19) (1, 20) + FSTRING_END '"' (1, 20) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_END "'" (1, 22) (1, 23) + OP '}' (1, 23) (1, 24) + FSTRING_END "'''" (1, 24) (1, 27) + OP '}' (1, 27) (1, 28) + FSTRING_END '\"""' (1, 28) (1, 31) + """) + self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19) + OP '{' (2, 19) (2, 20) + NAME 'invalid' (2, 20) (2, 27) + OP '!' 
(2, 27) (2, 28) + NAME 'r' (2, 28) (2, 29) + OP '}' (2, 29) (2, 30) + FSTRING_MIDDLE ')\\n' (2, 30) (3, 0) + FSTRING_END '\"""' (3, 0) (3, 3) + """) + self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9) + OP '{' (2, 9) (2, 10) + NAME 'None' (2, 10) (2, 14) + OP '}' (2, 14) (2, 15) + FSTRING_MIDDLE 'bad' (2, 15) (2, 18) + FSTRING_END '\"""' (2, 18) (2, 21) """) self.check_tokenize('f"""abc"""', """\ - STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) """) self.check_tokenize(r'f"abc\ def"', """\ - STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START 'Rf"' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\ + FSTRING_START "f'" (1, 0) (1, 2) + FSTRING_MIDDLE 'some words ' (1, 2) (1, 13) + OP '{' (1, 13) (1, 14) + NAME 'a' (1, 14) (1, 15) + OP '+' (1, 15) (1, 16) + NAME 'b' (1, 16) (1, 17) + OP ':' (1, 17) (1, 18) + FSTRING_MIDDLE '.3f' (1, 18) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_MIDDLE ' more words ' (1, 22) (1, 34) + OP '{' (1, 34) (1, 35) + NAME 'c' (1, 35) (1, 36) + OP '+' (1, 36) (1, 37) + NAME 'd' (1, 37) (1, 38) + OP '=' (1, 38) (1, 39) + OP '}' (1, 39) (1, 40) + FSTRING_MIDDLE ' final words' (1, 40) (1, 52) + FSTRING_END "'" (1, 52) (1, 53) + """) + self.check_tokenize("""\ +f'''{ +3 +=}'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + OP '{' (1, 4) (1, 5) + NL '\\n' (1, 5) (1, 6) + NUMBER '3' (2, 0) (2, 1) + NL '\\n' (2, 1) (2, 2) + OP '=' (3, 0) (3, 1) + OP '}' (3, 1) (3, 2) + FSTRING_END "'''" (3, 2) (3, 5) + """) + self.check_tokenize("""\ +f'''__{ + x:a +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + OP '{' (1, 6) (1, 7) + NL '\\n' (1, 7) (1, 8) + NAME 'x' (2, 4) (2, 5) + OP ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n' (2, 6) (3, 0) + OP '}' (3, 0) (3, 1) + FSTRING_MIDDLE '__' (3, 1) (3, 3) + FSTRING_END "'''" (3, 3) (3, 6) + """) + self.check_tokenize("""\ +f'''__{ + x:a + b + c + d +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + OP '{' (1, 6) (1, 7) + NL '\\n' (1, 7) (1, 8) + NAME 'x' (2, 4) (2, 5) + OP ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n b\\n c\\n d\\n' (2, 6) (6, 0) + OP '}' (6, 0) (6, 1) + FSTRING_MIDDLE '__' (6, 1) (6, 3) + FSTRING_END "'''" (6, 3) (6, 6) + """) + + self.check_tokenize("""\ + '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli + aktualni pracownicy, obecni pracownicy''' +""", """\ + INDENT ' ' (1, 0) (1, 4) + STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45) + NEWLINE '\\n' (2, 45) (2, 46) + DEDENT '' (3, 0) (3, 0) """) def test_function(self): @@ -457,6 +677,7 @@ def test_function(self): NAME 'pass' (1, 34) (1, 38) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_comparison(self): # Comparison self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " @@ -587,6 +808,7 @@ def test_unary(self): NUMBER '1' (1, 22) (1, 23) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_selector(self): # 
Selector self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ @@ -609,6 +831,7 @@ def test_selector(self): OP ')' (2, 29) (2, 30) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_method(self): # Methods self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\ @@ -626,6 +849,7 @@ def test_method(self): NAME 'pass' (2, 14) (2, 18) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_tabs(self): # Evil tabs self.check_tokenize("def f():\n" @@ -647,6 +871,7 @@ def test_tabs(self): DEDENT '' (4, 0) (4, 0) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_non_ascii_identifiers(self): # Non-ascii identifiers self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\ @@ -659,6 +884,7 @@ def test_non_ascii_identifiers(self): STRING "'green'" (2, 7) (2, 14) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_unicode(self): # Legacy unicode literals: self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ @@ -671,6 +897,7 @@ def test_unicode(self): STRING "U'green'" (2, 7) (2, 15) """) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_async(self): # Async/await extension: self.check_tokenize("async = 1", """\ @@ -945,29 +1172,81 @@ async def bar(): pass DEDENT '' (7, 0) (7, 0) """) + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_newline_after_parenthesized_block_with_comment(self): + self.check_tokenize('''\ +[ + # A comment here + 1 +] +''', """\ + OP '[' (1, 0) (1, 1) + NL '\\n' (1, 1) (1, 2) + COMMENT '# A comment here' (2, 4) (2, 20) + NL '\\n' (2, 20) (2, 21) + NUMBER '1' (3, 4) (3, 5) + NL '\\n' (3, 5) (3, 6) + OP ']' (4, 0) (4, 1) + NEWLINE '\\n' (4, 1) (4, 2) + """) + + def test_closing_parenthesis_from_different_line(self): + self.check_tokenize("); x", """\ + OP ')' (1, 0) (1, 1) + OP ';' (1, 1) (1, 2) + NAME 'x' (1, 3) (1, 4) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_multiline_non_ascii_fstring(self): + self.check_tokenize("""\ +a = f''' + Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli'''""", """\ + NAME 'a' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + FSTRING_START "f\'\'\'" (1, 4) (1, 8) + FSTRING_MIDDLE '\\n Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli' (1, 8) (2, 68) + FSTRING_END "\'\'\'" (2, 68) (2, 71) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_multiline_non_ascii_fstring_with_expr(self): + self.check_tokenize("""\ +f''' + 🔗 This is a test {test_arg1}🔗 +🔗'''""", """\ + FSTRING_START "f\'\'\'" (1, 0) (1, 4) + FSTRING_MIDDLE '\\n 🔗 This is a test ' (1, 4) (2, 21) + OP '{' (2, 21) (2, 22) + NAME 'test_arg1' (2, 22) (2, 31) + OP '}' (2, 31) (2, 32) + FSTRING_MIDDLE '🔗\\n🔗' (2, 32) (3, 1) + FSTRING_END "\'\'\'" (3, 1) (3, 4) + """) + class GenerateTokensTest(TokenizeTest): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. f = StringIO(s) - result = stringify_tokens_from_source(generate_tokens(f.readline), s) + result = stringify_tokens_from_source(tokenize.generate_tokens(f.readline), s) self.assertEqual(result, expected.rstrip().splitlines()) def decistmt(s): result = [] - g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string + g = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: - if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens + if toknum == tokenize.NUMBER and '.' 
in tokval: # replace NUMBER tokens result.extend([ - (NAME, 'Decimal'), - (OP, '('), - (STRING, repr(tokval)), - (OP, ')') + (tokenize.NAME, 'Decimal'), + (tokenize.OP, '('), + (tokenize.STRING, repr(tokval)), + (tokenize.OP, ')') ]) else: result.append((toknum, tokval)) - return untokenize(result).decode('utf-8') + return tokenize.untokenize(result).decode('utf-8').strip() class TestMisc(TestCase): @@ -991,6 +1270,13 @@ def test_decistmt(self): self.assertEqual(eval(decistmt(s)), Decimal('-3.217160342717258261933904529E-7')) + def test___all__(self): + expected = token.__all__ + [ + "TokenInfo", "TokenError", "generate_tokens", + "detect_encoding", "untokenize", "open", "tokenize", + ] + self.assertCountEqual(tokenize.__all__, expected) + class TestTokenizerAdheresToPep0263(TestCase): """ @@ -998,9 +1284,11 @@ class TestTokenizerAdheresToPep0263(TestCase): """ def _testFile(self, filename): - path = os.path.join(os.path.dirname(__file__), filename) - TestRoundtrip.check_roundtrip(self, open(path, 'rb')) + path = os.path.join(os.path.dirname(__file__), 'tokenizedata', filename) + with open(path, 'rb') as f: + TestRoundtrip.check_roundtrip(self, f) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_utf8_coding_cookie_and_no_utf8_bom(self): f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' self._testFile(f) @@ -1016,16 +1304,16 @@ def test_latin1_coding_cookie_and_utf8_bom(self): f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' self.assertRaises(SyntaxError, self._testFile, f) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_no_coding_cookie_and_utf8_bom(self): f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' self._testFile(f) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_utf8_coding_cookie_and_utf8_bom(self): f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' self._testFile(f) - # TODO: RUSTPYTHON - @unittest.expectedFailure # "bad_coding.py" and "bad_coding2.py" make the WASM CI fail def test_bad_coding_cookie(self): self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py') self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py') @@ -1041,33 +1329,18 @@ def readline(): nonlocal first if not first: first = True - return line + yield line else: - return b'' + yield b'' # skip the initial encoding token and the end tokens - tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + tokens = list(tokenize._generate_tokens_from_c_tokenizer(readline().__next__, + encoding='utf-8', + extra_tokens=True))[:-2] + expected_tokens = [tokenize.TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") - def test__tokenize_does_not_decode_with_encoding_none(self): - literal = '"ЉЊЈЁЂ"' - first = False - def readline(): - nonlocal first - if not first: - first = True - return literal - else: - return b'' - - # skip the end tokens - tokens = list(_tokenize(readline, encoding=None))[:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] - self.assertEqual(tokens, expected_tokens, - "string not tokenized when encoding is None") - class TestDetectEncoding(TestCase): @@ -1088,7 +1361,7 @@ def test_no_bom_no_encoding_cookie(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8') 
self.assertEqual(consumed_lines, list(lines[:2])) @@ -1098,7 +1371,7 @@ def test_bom_no_cookie(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'# something\n', b'print(something)\n']) @@ -1109,7 +1382,7 @@ def test_cookie_first_line_no_bom(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso-8859-1') self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) @@ -1119,7 +1392,7 @@ def test_matched_bom_and_cookie_first_line(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) @@ -1130,7 +1403,7 @@ def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): b'do_something(else)\n' ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_cookie_second_line_no_bom(self): lines = ( @@ -1139,7 +1412,7 @@ def test_cookie_second_line_no_bom(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'ascii') expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] self.assertEqual(consumed_lines, expected) @@ -1151,7 +1424,7 @@ def test_matched_bom_and_cookie_second_line(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'#! 
something\n', b'f# coding=utf-8\n']) @@ -1164,7 +1437,7 @@ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): b'do_something(else)\n' ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_cookie_second_line_noncommented_first_line(self): lines = ( @@ -1172,7 +1445,7 @@ def test_cookie_second_line_noncommented_first_line(self): b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8') expected = [b"print('\xc2\xa3')\n"] self.assertEqual(consumed_lines, expected) @@ -1183,7 +1456,7 @@ def test_cookie_second_line_commented_first_line(self): b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso8859-15') expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] self.assertEqual(consumed_lines, expected) @@ -1194,13 +1467,13 @@ def test_cookie_second_line_empty_first_line(self): b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso8859-15') expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] self.assertEqual(consumed_lines, expected) def test_latin1_normalization(self): - # See get_normal_name() in tokenizer.c. + # See get_normal_name() in Parser/tokenizer/helpers.c. encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", "iso-8859-1-unix", "iso-latin-1-mac") for encoding in encodings: @@ -1211,21 +1484,21 @@ def test_latin1_normalization(self): b"print(things)\n", b"do_something += 4\n") rl = self.get_readline(lines) - found, consumed_lines = detect_encoding(rl) + found, consumed_lines = tokenize.detect_encoding(rl) self.assertEqual(found, "iso-8859-1") def test_syntaxerror_latin1(self): - # Issue 14629: need to raise SyntaxError if the first + # Issue 14629: need to raise TokenError if the first # line(s) have non-UTF-8 characters lines = ( b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_utf8_normalization(self): - # See get_normal_name() in tokenizer.c. + # See get_normal_name() in Parser/tokenizer/helpers.c. 
encodings = ("utf-8", "utf-8-mac", "utf-8-unix") for encoding in encodings: for rep in ("-", "_"): @@ -1234,36 +1507,36 @@ def test_utf8_normalization(self): b"# coding: " + enc.encode("ascii") + b"\n", b"1 + 3\n") rl = self.get_readline(lines) - found, consumed_lines = detect_encoding(rl) + found, consumed_lines = tokenize.detect_encoding(rl) self.assertEqual(found, "utf-8") def test_short_files(self): readline = self.get_readline((b'print(something)\n',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, [b'print(something)\n']) - encoding, consumed_lines = detect_encoding(self.get_readline(())) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(())) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, []) readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'print(something)\n']) readline = self.get_readline((b'\xef\xbb\xbf',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, []) readline = self.get_readline((b'# coding: bad\n',)) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_false_encoding(self): # Issue 18873: "Encoding" detected in non-comment lines readline = self.get_readline((b'print("#coding=fake")',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, [b'print("#coding=fake")']) @@ -1276,14 +1549,14 @@ def test_open(self): with open(filename, 'w', encoding=encoding) as fp: print("# coding: %s" % encoding, file=fp) print("print('euro:\u20ac')", file=fp) - with tokenize_open(filename) as fp: + with tokenize.open(filename) as fp: self.assertEqual(fp.encoding, encoding) self.assertEqual(fp.mode, 'r') # test BOM (no coding cookie) with open(filename, 'w', encoding='utf-8-sig') as fp: print("print('euro:\u20ac')", file=fp) - with tokenize_open(filename) as fp: + with tokenize.open(filename) as fp: self.assertEqual(fp.encoding, 'utf-8-sig') self.assertEqual(fp.mode, 'r') @@ -1310,16 +1583,16 @@ def readline(self): ins = Bunk(lines, path) # Make sure lacking a name isn't an issue. 
del ins.name - detect_encoding(ins.readline) + tokenize.detect_encoding(ins.readline) with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)): ins = Bunk(lines, path) - detect_encoding(ins.readline) + tokenize.detect_encoding(ins.readline) def test_open_error(self): # Issue #23840: open() must close the binary file on error m = BytesIO(b'#coding:xxx') with mock.patch('tokenize._builtin_open', return_value=m): - self.assertRaises(SyntaxError, tokenize_open, 'foobar') + self.assertRaises(SyntaxError, tokenize.open, 'foobar') self.assertTrue(m.closed) @@ -1327,17 +1600,20 @@ class TestTokenize(TestCase): def test_tokenize(self): import tokenize as tokenize_module - encoding = object() + encoding = "utf-8" encoding_used = None def mock_detect_encoding(readline): return encoding, [b'first', b'second'] - def mock__tokenize(readline, encoding): + def mock__tokenize(readline, encoding, **kwargs): nonlocal encoding_used encoding_used = encoding out = [] while True: - next_line = readline() + try: + next_line = readline() + except StopIteration: + return out if next_line: out.append(next_line) continue @@ -1352,16 +1628,16 @@ def mock_readline(): return str(counter).encode() orig_detect_encoding = tokenize_module.detect_encoding - orig__tokenize = tokenize_module._tokenize + orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer tokenize_module.detect_encoding = mock_detect_encoding - tokenize_module._tokenize = mock__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize try: - results = tokenize(mock_readline) - self.assertEqual(list(results), + results = tokenize.tokenize(mock_readline) + self.assertEqual(list(results)[1:], [b'first', b'second', b'1', b'2', b'3', b'4']) finally: tokenize_module.detect_encoding = orig_detect_encoding - tokenize_module._tokenize = orig__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token self.assertEqual(encoding_used, encoding) @@ -1373,23 +1649,23 @@ def test_oneline_defs(self): buf = '\n'.join(buf) # Test that 500 consequent, one-line defs is OK - toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline)) + toks = list(tokenize.tokenize(BytesIO(buf.encode('utf-8')).readline)) self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER # [-2] is always NEWLINE def assertExactTypeEqual(self, opstr, *optypes): - tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline)) + tokens = list(tokenize.tokenize(BytesIO(opstr.encode('utf-8')).readline)) num_optypes = len(optypes) self.assertEqual(len(tokens), 3 + num_optypes) - self.assertEqual(tok_name[tokens[0].exact_type], - tok_name[ENCODING]) + self.assertEqual(tokenize.tok_name[tokens[0].exact_type], + tokenize.tok_name[tokenize.ENCODING]) for i in range(num_optypes): - self.assertEqual(tok_name[tokens[i + 1].exact_type], - tok_name[optypes[i]]) - self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type], - tok_name[token.NEWLINE]) - self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type], - tok_name[token.ENDMARKER]) + self.assertEqual(tokenize.tok_name[tokens[i + 1].exact_type], + tokenize.tok_name[optypes[i]]) + self.assertEqual(tokenize.tok_name[tokens[1 + num_optypes].exact_type], + tokenize.tok_name[token.NEWLINE]) + self.assertEqual(tokenize.tok_name[tokens[2 + num_optypes].exact_type], + tokenize.tok_name[token.ENDMARKER]) def test_exact_type(self): self.assertExactTypeEqual('()', token.LPAR, token.RPAR) @@ -1439,11 +1715,11 @@ def test_exact_type(self): self.assertExactTypeEqual('@=', token.ATEQUAL) 
self.assertExactTypeEqual('a**2+b**2==c**2', - NAME, token.DOUBLESTAR, NUMBER, + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER, token.PLUS, - NAME, token.DOUBLESTAR, NUMBER, + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER, token.EQEQUAL, - NAME, token.DOUBLESTAR, NUMBER) + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER) self.assertExactTypeEqual('{1, 2, 3}', token.LBRACE, token.NUMBER, token.COMMA, @@ -1460,22 +1736,58 @@ def test_pathological_trailing_whitespace(self): # See http://bugs.python.org/issue16152 self.assertExactTypeEqual('@ ', token.AT) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_comment_at_the_end_of_the_source_without_newline(self): # See http://bugs.python.org/issue44667 source = 'b = 1\n\n#test' - expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT] + expected_tokens = [ + tokenize.TokenInfo(type=token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''), + tokenize.TokenInfo(type=token.NAME, string='b', start=(1, 0), end=(1, 1), line='b = 1\n'), + tokenize.TokenInfo(type=token.OP, string='=', start=(1, 2), end=(1, 3), line='b = 1\n'), + tokenize.TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'), + tokenize.TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'), + tokenize.TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'), + tokenize.TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'), + tokenize.TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='') + ] + + tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline)) + self.assertEqual(tokens, expected_tokens) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_newline_and_space_at_the_end_of_the_source_without_newline(self): + # See https://github.com/python/cpython/issues/105435 + source = 'a\n ' + expected_tokens = [ + tokenize.TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''), + tokenize.TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'), + tokenize.TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'), + tokenize.TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '), + tokenize.TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='') + ] + + tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline)) + self.assertEqual(tokens, expected_tokens) + + def test_invalid_character_in_fstring_middle(self): + # See gh-103824 + script = b'''F""" + \xe5"""''' + + with os_helper.temp_dir() as temp_dir: + filename = os.path.join(temp_dir, "script.py") + with open(filename, 'wb') as file: + file.write(script) + rs, _ = run_python_until_end(filename) + self.assertIn(b"SyntaxError", rs.err) - tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline)) - self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING]) - for i in range(6): - self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]]) - self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER]) class UntokenizeTest(TestCase): def test_bad_input_order(self): # raise if previous row - u = Untokenizer() + u = tokenize.Untokenizer() u.prev_row = 2 u.prev_col = 2 with self.assertRaises(ValueError) as cm: @@ -1485,9 +1797,10 @@ def test_bad_input_order(self): # raise if previous column in 
row self.assertRaises(ValueError, u.add_whitespace, (2,1)) + @unittest.expectedFailure # TODO: RUSTPYTHON def test_backslash_continuation(self): # The problem is that \ leaves no token - u = Untokenizer() + u = tokenize.Untokenizer() u.prev_row = 1 u.prev_col = 1 u.tokens = [] @@ -1499,21 +1812,38 @@ def test_backslash_continuation(self): TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): - u = Untokenizer() - token = (NAME, 'Hello') - tokens = [(ENCODING, 'utf-8'), token] + u = tokenize.Untokenizer() + token = (tokenize.NAME, 'Hello') + tokens = [(tokenize.ENCODING, 'utf-8'), token] u.compat(token, iter([])) self.assertEqual(u.tokens, ["Hello "]) - u = Untokenizer() + u = tokenize.Untokenizer() self.assertEqual(u.untokenize(iter([token])), 'Hello ') - u = Untokenizer() + u = tokenize.Untokenizer() self.assertEqual(u.untokenize(iter(tokens)), 'Hello ') self.assertEqual(u.encoding, 'utf-8') - self.assertEqual(untokenize(iter(tokens)), b'Hello ') + self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ') + + +def contains_ambiguous_backslash(source): + """Return `True` if the source contains a backslash on a + line by itself. For example: + + a = (1 + \\ + ) + + Code like this cannot be untokenized exactly. This is because + the tokenizer does not produce any tokens for the line containing + the backslash and so there is no way to know its indent. + """ + pattern = re.compile(br'\n\s*\\\r?\n') + return pattern.search(source) is not None class TestRoundtrip(TestCase): + @unittest.expectedFailure # TODO: RUSTPYTHON def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. @@ -1522,6 +1852,9 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. + When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. @@ -1531,21 +1864,39 @@ def check_roundtrip(self, f): code = f.encode('utf-8') else: code = f.read() - f.close() readline = iter(code.splitlines(keepends=True)).__next__ - tokens5 = list(tokenize(readline)) + tokens5 = list(tokenize.tokenize(readline)) tokens2 = [tok[:2] for tok in tokens5] # Reproduce tokens2 from pairs - bytes_from2 = untokenize(tokens2) + bytes_from2 = tokenize.untokenize(tokens2) readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__ - tokens2_from2 = [tok[:2] for tok in tokenize(readline2)] + tokens2_from2 = [tok[:2] for tok in tokenize.tokenize(readline2)] self.assertEqual(tokens2_from2, tokens2) # Reproduce tokens2 from 5-tuples - bytes_from5 = untokenize(tokens5) + bytes_from5 = tokenize.untokenize(tokens5) readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__ - tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] + tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + if not contains_ambiguous_backslash(code): + # The BOM does not produce a token so there is no way to preserve it. 
+ code_without_bom = code.removeprefix(b'\xef\xbb\xbf') + readline = iter(code_without_bom.splitlines(keepends=True)).__next__ + untokenized_code = tokenize.untokenize(tokenize.tokenize(readline)) + self.assertEqual(code_without_bom, untokenized_code) + + def check_line_extraction(self, f): + if isinstance(f, str): + code = f.encode('utf-8') + else: + code = f.read() + readline = iter(code.splitlines(keepends=True)).__next__ + for tok in tokenize.tokenize(readline): + if tok.type in {tokenize.ENCODING, tokenize.ENDMARKER}: + continue + self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]]) + + @unittest.expectedFailure # TODO: RUSTPYTHON def test_roundtrip(self): # There are some standard formatting practices that are easy to get right. @@ -1561,7 +1912,7 @@ def test_roundtrip(self): self.check_roundtrip("if x == 1 : \n" " print(x)\n") - fn = support.findfile("tokenize_tests.txt") + fn = support.findfile("tokenize_tests.txt", subdir="tokenizedata") with open(fn, 'rb') as f: self.check_roundtrip(f) self.check_roundtrip("if x == 1:\n" @@ -1585,6 +1936,64 @@ def test_roundtrip(self): " print('Can not import' # comment2\n)" "else: print('Loaded')\n") + self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{SNAKE}'") + self.check_roundtrip(r"f'\\N{{SNAKE}}'") + self.check_roundtrip(r"f'\N{SNAKE}'") + self.check_roundtrip(r"f'\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'") + + self.check_roundtrip(r"f'\\N{1}'") + self.check_roundtrip(r"f'\\\\N{2}'") + self.check_roundtrip(r"f'\\\\\\N{3}'") + self.check_roundtrip(r"f'\\\\\\\\N{4}'") + + self.check_roundtrip(r"f'\\N{{'") + self.check_roundtrip(r"f'\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\\\N{{'") + + self.check_roundtrip(r"f'\n{{foo}}'") + self.check_roundtrip(r"f'\\n{{foo}}'") + self.check_roundtrip(r"f'\\\n{{foo}}'") + self.check_roundtrip(r"f'\\\\n{{foo}}'") + + self.check_roundtrip(r"f'\t{{foo}}'") + self.check_roundtrip(r"f'\\t{{foo}}'") + self.check_roundtrip(r"f'\\\t{{foo}}'") + self.check_roundtrip(r"f'\\\\t{{foo}}'") + + self.check_roundtrip(r"rf'\t{{foo}}'") + self.check_roundtrip(r"rf'\\t{{foo}}'") + self.check_roundtrip(r"rf'\\\t{{foo}}'") + self.check_roundtrip(r"rf'\\\\t{{foo}}'") + + self.check_roundtrip(r"rf'\{{foo}}'") + self.check_roundtrip(r"f'\\{{foo}}'") + self.check_roundtrip(r"rf'\\\{{foo}}'") + self.check_roundtrip(r"f'\\\\{{foo}}'") + cases = [ + """ +if 1: + "foo" +"bar" +""", + """ +if 1: + ("foo" + "bar") +""", + """ +if 1: + "foo" + "bar" +""" ] + for case in cases: + self.check_roundtrip(case) + + + @unittest.expectedFailure # TODO: RUSTPYTHON def test_continuation(self): # Balancing continuation self.check_roundtrip("a = (3,4, \n" @@ -1598,6 +2007,7 @@ def test_continuation(self): "+ len(z) - z[\n" "'b']\n") + @unittest.expectedFailure # TODO: RUSTPYTHON def test_backslash_continuation(self): # Backslash means line continuation, except for comments self.check_roundtrip("x=1+\\\n" @@ -1611,26 +2021,15 @@ def test_string_concatenation(self): # Two string literals on the same line self.check_roundtrip("'' ''") - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.skip("TODO: RUSTPYTHON; slow and fails") def test_random_files(self): # Test roundtrip on random python modules. # pass the '-ucpu' option to process the full directory. 
import glob, random - fn = support.findfile("tokenize_tests.txt") - tempdir = os.path.dirname(fn) or os.curdir + tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # Tokenize is broken on test_pep3131.py because regular expressions are - # broken on the obscure unicode identifiers in it. *sigh* - # With roundtrip extended to test the 5-tuple mode of untokenize, - # 7 more testfiles fail. Remove them also until the failure is diagnosed. - - testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py")) - for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'): - testfiles.remove(os.path.join(tempdir, "test_%s.py") % f) - if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -1640,13 +2039,15 @@ def test_random_files(self): with open(testfile, 'rb') as f: with self.subTest(file=testfile): self.check_roundtrip(f) + self.check_line_extraction(f) def roundtrip(self, code): if isinstance(code, str): code = code.encode('utf-8') - return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8') + return tokenize.untokenize(tokenize.tokenize(BytesIO(code).readline)).decode('utf-8') + @unittest.expectedFailure # TODO: RUSTPYTHON def test_indentation_semantics_retained(self): """ Ensure that although whitespace might be mutated in a roundtrip, @@ -1658,5 +2059,1134 @@ def test_indentation_semantics_retained(self): self.check_roundtrip(code) +class InvalidPythonTests(TestCase): + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_number_followed_by_name(self): + # See issue #gh-105549 + source = "2sin(x)" + expected_tokens = [ + tokenize.TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'), + tokenize.TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'), + tokenize.TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'), + tokenize.TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'), + tokenize.TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'), + tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(tokenize.generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_number_starting_with_zero(self): + source = "01234" + expected_tokens = [ + tokenize.TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'), + tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(tokenize.generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + +class CTokenizeTest(TestCase): + def check_tokenize(self, s, expected): + # Format the tokens in s in a table format. + # The ENDMARKER and final NEWLINE are omitted. 
+ f = StringIO(s) + with self.subTest(source=s): + result = stringify_tokens_from_source( + tokenize._generate_tokens_from_c_tokenizer(f.readline), s + ) + self.assertEqual(result, expected.rstrip().splitlines()) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_encoding(self): + def readline(encoding): + yield "1+1".encode(encoding) + + expected = [ + tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'), + tokenize.TokenInfo(type=tokenize.OP, string='+', start=(1, 1), end=(1, 2), line='1+1'), + tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'), + tokenize.TokenInfo(type=tokenize.NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'), + tokenize.TokenInfo(type=tokenize.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + for encoding in ["utf-8", "latin-1", "utf-16"]: + with self.subTest(encoding=encoding): + tokens = list(tokenize._generate_tokens_from_c_tokenizer( + readline(encoding).__next__, + extra_tokens=True, + encoding=encoding, + )) + self.assertEqual(tokens, expected) + + def test_int(self): + + self.check_tokenize('0xff <= 255', """\ + NUMBER '0xff' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0b10 <= 255', """\ + NUMBER '0b10' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0o123 <= 0O123', """\ + NUMBER '0o123' (1, 0) (1, 5) + LESSEQUAL '<=' (1, 6) (1, 8) + NUMBER '0O123' (1, 9) (1, 14) + """) + + self.check_tokenize('1234567 > ~0x15', """\ + NUMBER '1234567' (1, 0) (1, 7) + GREATER '>' (1, 8) (1, 9) + TILDE '~' (1, 10) (1, 11) + NUMBER '0x15' (1, 11) (1, 15) + """) + + self.check_tokenize('2134568 != 1231515', """\ + NUMBER '2134568' (1, 0) (1, 7) + NOTEQUAL '!=' (1, 8) (1, 10) + NUMBER '1231515' (1, 11) (1, 18) + """) + + self.check_tokenize('(-124561-1) & 200000000', """\ + LPAR '(' (1, 0) (1, 1) + MINUS '-' (1, 1) (1, 2) + NUMBER '124561' (1, 2) (1, 8) + MINUS '-' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + RPAR ')' (1, 10) (1, 11) + AMPER '&' (1, 12) (1, 13) + NUMBER '200000000' (1, 14) (1, 23) + """) + + self.check_tokenize('0xdeadbeef != -1', """\ + NUMBER '0xdeadbeef' (1, 0) (1, 10) + NOTEQUAL '!=' (1, 11) (1, 13) + MINUS '-' (1, 14) (1, 15) + NUMBER '1' (1, 15) (1, 16) + """) + + self.check_tokenize('0xdeadc0de & 12345', """\ + NUMBER '0xdeadc0de' (1, 0) (1, 10) + AMPER '&' (1, 11) (1, 12) + NUMBER '12345' (1, 13) (1, 18) + """) + + self.check_tokenize('0xFF & 0x15 | 1234', """\ + NUMBER '0xFF' (1, 0) (1, 4) + AMPER '&' (1, 5) (1, 6) + NUMBER '0x15' (1, 7) (1, 11) + VBAR '|' (1, 12) (1, 13) + NUMBER '1234' (1, 14) (1, 18) + """) + + def test_float(self): + + self.check_tokenize('x = 3.14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 314159.', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '314159.' 
(1, 4) (1, 11) + """) + + self.check_tokenize('x = .314159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '.314159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3e14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3e14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3E123', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3E123' (1, 4) (1, 9) + """) + + self.check_tokenize('x+y = 3e-1230', """\ + NAME 'x' (1, 0) (1, 1) + PLUS '+' (1, 1) (1, 2) + NAME 'y' (1, 2) (1, 3) + EQUAL '=' (1, 4) (1, 5) + NUMBER '3e-1230' (1, 6) (1, 13) + """) + + self.check_tokenize('x = 3.14e159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14e159' (1, 4) (1, 12) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_string(self): + + self.check_tokenize('x = \'\'; y = ""', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "''" (1, 4) (1, 6) + SEMI ';' (1, 6) (1, 7) + NAME 'y' (1, 8) (1, 9) + EQUAL '=' (1, 10) (1, 11) + STRING '""' (1, 12) (1, 14) + """) + + self.check_tokenize('x = \'"\'; y = "\'"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '\\'"\\'' (1, 4) (1, 7) + SEMI ';' (1, 7) (1, 8) + NAME 'y' (1, 9) (1, 10) + EQUAL '=' (1, 11) (1, 12) + STRING '"\\'"' (1, 13) (1, 16) + """) + + self.check_tokenize('x = "doesn\'t "shrink", does it"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"doesn\\'t "' (1, 4) (1, 14) + NAME 'shrink' (1, 14) (1, 20) + STRING '", does it"' (1, 20) (1, 31) + """) + + self.check_tokenize("x = 'abc' + 'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "'abc'" (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING "'ABC'" (1, 12) (1, 17) + """) + + self.check_tokenize('y = "ABC" + "ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"ABC"' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING '"ABC"' (1, 12) (1, 17) + """) + + self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "r'abc'" (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING "r'ABC'" (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING "R'ABC'" (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING "R'ABC'" (1, 31) (1, 37) + """) + + self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING 'r"abc"' (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING 'r"ABC"' (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING 'R"ABC"' (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING 'R"ABC"' (1, 31) (1, 37) + """) + + self.check_tokenize("u'abc' + U'abc'", """\ + STRING "u'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "U'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('u"abc" + U"abc"', """\ + STRING 'u"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'U"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("b'abc' + B'abc'", """\ + STRING "b'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "B'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('b"abc" + B"abc"', """\ + STRING 'b"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'B"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\ + STRING "br'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "bR'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Br'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "BR'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('br"abc" + 
bR"abc" + Br"abc" + BR"abc"', """\ + STRING 'br"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'bR"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Br"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'BR"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\ + STRING "rb'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "rB'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Rb'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "RB'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\ + STRING 'rb"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'rB"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Rb"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'RB"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize('"a\\\nde\\\nfg"', """\ + STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3) + """) + + self.check_tokenize('u"a\\\nde"', """\ + STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3) + """) + + self.check_tokenize('rb"a\\\nd"', """\ + STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2) + """) + + self.check_tokenize(r'"""a\ +b"""', """\ + STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'u"""a\ +b"""', """\ + STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'rb"""a\ +b\ +c"""', """\ + STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) + """) + + self.check_tokenize(r'"hola\\\r\ndfgf"', """\ + STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16) + """) + + self.check_tokenize('f"abc"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END '"' (1, 5) (1, 6) + """) + + self.check_tokenize('fR"a{b}c"', """\ + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + LBRACE '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + RBRACE '}' (1, 6) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END '"' (1, 8) (1, 9) + """) + + self.check_tokenize('f"""abc"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) + """) + + self.check_tokenize(r'f"abc\ +def"', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + + self.check_tokenize('''\ +f"{ +a}"''', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + LBRACE '{' (1, 2) (1, 3) + NAME 'a' (2, 0) (2, 1) + RBRACE '}' (2, 1) (2, 2) + FSTRING_END '"' (2, 2) (2, 3) + """) + + self.check_tokenize(r'Rf"abc\ +def"', """\ + FSTRING_START 'Rf"' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + + self.check_tokenize(r'f"hola\\\r\ndfgf"', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16) + FSTRING_END \'"\' (1, 16) (1, 17) + """) + + self.check_tokenize("""\ +f'''__{ + x:a +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + LBRACE '{' (1, 6) (1, 7) + NAME 'x' (2, 4) (2, 5) + COLON ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n' (2, 6) (3, 0) + RBRACE '}' (3, 0) (3, 1) + FSTRING_MIDDLE '__' (3, 1) (3, 3) + FSTRING_END "'''" (3, 3) (3, 6) + """) + + self.check_tokenize("""\ +f'''__{ + x:a + b + c + d +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + LBRACE '{' (1, 6) (1, 7) + NAME 'x' (2, 4) (2, 5) + COLON ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n b\\n c\\n d\\n' (2, 6) (6, 0) + RBRACE '}' (6, 0) (6, 1) + FSTRING_MIDDLE '__' (6, 1) (6, 3) + 
FSTRING_END "'''" (6, 3) (6, 6) + """) + + def test_function(self): + + self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd22' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COMMA ',' (1, 9) (1, 10) + NAME 'b' (1, 11) (1, 12) + COMMA ',' (1, 12) (1, 13) + NAME 'c' (1, 14) (1, 15) + EQUAL '=' (1, 15) (1, 16) + NUMBER '2' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + NAME 'd' (1, 19) (1, 20) + EQUAL '=' (1, 20) (1, 21) + NUMBER '2' (1, 21) (1, 22) + COMMA ',' (1, 22) (1, 23) + STAR '*' (1, 24) (1, 25) + NAME 'k' (1, 25) (1, 26) + RPAR ')' (1, 26) (1, 27) + COLON ':' (1, 27) (1, 28) + NAME 'pass' (1, 29) (1, 33) + """) + + self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd01v_' (1, 4) (1, 9) + LPAR '(' (1, 9) (1, 10) + NAME 'a' (1, 10) (1, 11) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + COMMA ',' (1, 13) (1, 14) + STAR '*' (1, 15) (1, 16) + NAME 'k' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + DOUBLESTAR '**' (1, 19) (1, 21) + NAME 'w' (1, 21) (1, 22) + RPAR ')' (1, 22) (1, 23) + COLON ':' (1, 23) (1, 24) + NAME 'pass' (1, 25) (1, 29) + """) + + self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd23' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COLON ':' (1, 9) (1, 10) + NAME 'str' (1, 11) (1, 14) + COMMA ',' (1, 14) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'int' (1, 19) (1, 22) + EQUAL '=' (1, 22) (1, 23) + NUMBER '3' (1, 23) (1, 24) + RPAR ')' (1, 24) (1, 25) + RARROW '->' (1, 26) (1, 28) + NAME 'int' (1, 29) (1, 32) + COLON ':' (1, 32) (1, 33) + NAME 'pass' (1, 34) (1, 38) + """) + + def test_comparison(self): + + self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\ + NAME 'if' (1, 0) (1, 2) + NUMBER '1' (1, 3) (1, 4) + LESS '<' (1, 5) (1, 6) + NUMBER '1' (1, 7) (1, 8) + GREATER '>' (1, 9) (1, 10) + NUMBER '1' (1, 11) (1, 12) + EQEQUAL '==' (1, 13) (1, 15) + NUMBER '1' (1, 16) (1, 17) + GREATEREQUAL '>=' (1, 18) (1, 20) + NUMBER '5' (1, 21) (1, 22) + LESSEQUAL '<=' (1, 23) (1, 25) + NUMBER '0x15' (1, 26) (1, 30) + LESSEQUAL '<=' (1, 31) (1, 33) + NUMBER '0x12' (1, 34) (1, 38) + NOTEQUAL '!=' (1, 39) (1, 41) + NUMBER '1' (1, 42) (1, 43) + NAME 'and' (1, 44) (1, 47) + NUMBER '5' (1, 48) (1, 49) + NAME 'in' (1, 50) (1, 52) + NUMBER '1' (1, 53) (1, 54) + NAME 'not' (1, 55) (1, 58) + NAME 'in' (1, 59) (1, 61) + NUMBER '1' (1, 62) (1, 63) + NAME 'is' (1, 64) (1, 66) + NUMBER '1' (1, 67) (1, 68) + NAME 'or' (1, 69) (1, 71) + NUMBER '5' (1, 72) (1, 73) + NAME 'is' (1, 74) (1, 76) + NAME 'not' (1, 77) (1, 80) + NUMBER '1' (1, 81) (1, 82) + COLON ':' (1, 82) (1, 83) + NAME 'pass' (1, 84) (1, 88) + """) + + def test_additive(self): + + self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + MINUS '-' (1, 6) (1, 7) + NAME 'y' (1, 8) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '15' (1, 12) (1, 14) + MINUS '-' (1, 15) (1, 16) + NUMBER '1' (1, 17) (1, 18) + PLUS '+' (1, 19) (1, 20) + NUMBER '0x124' (1, 21) (1, 26) + PLUS '+' (1, 27) (1, 28) + NAME 'z' (1, 29) (1, 30) + PLUS '+' (1, 31) (1, 32) + NAME 'a' (1, 33) (1, 34) + LSQB '[' (1, 34) (1, 35) + NUMBER '5' (1, 35) (1, 36) + RSQB ']' (1, 36) (1, 37) + """) + + def test_multiplicative(self): + + self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\ + NAME 'x' (1, 0) (1, 1) + 
EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + DOUBLESLASH '//' (1, 5) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + SLASH '/' (1, 10) (1, 11) + NUMBER '5' (1, 11) (1, 12) + STAR '*' (1, 12) (1, 13) + NUMBER '12' (1, 13) (1, 15) + PERCENT '%' (1, 15) (1, 16) + NUMBER '0x12' (1, 16) (1, 20) + AT '@' (1, 20) (1, 21) + NUMBER '42' (1, 21) (1, 23) + """) + + def test_unary(self): + + self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\ + TILDE '~' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + CIRCUMFLEX '^' (1, 3) (1, 4) + NUMBER '1' (1, 5) (1, 6) + AMPER '&' (1, 7) (1, 8) + NUMBER '1' (1, 9) (1, 10) + VBAR '|' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + CIRCUMFLEX '^' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + NUMBER '1' (1, 17) (1, 18) + """) + + self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\ + MINUS '-' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + STAR '*' (1, 2) (1, 3) + NUMBER '1' (1, 3) (1, 4) + SLASH '/' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + PLUS '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + DOUBLESLASH '//' (1, 10) (1, 12) + NUMBER '1' (1, 12) (1, 13) + MINUS '-' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + MINUS '-' (1, 17) (1, 18) + MINUS '-' (1, 18) (1, 19) + NUMBER '1' (1, 19) (1, 20) + DOUBLESTAR '**' (1, 20) (1, 22) + NUMBER '1' (1, 22) (1, 23) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_selector(self): + + self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ + NAME 'import' (1, 0) (1, 6) + NAME 'sys' (1, 7) (1, 10) + COMMA ',' (1, 10) (1, 11) + NAME 'time' (1, 12) (1, 16) + NEWLINE '' (1, 16) (1, 16) + NAME 'x' (2, 0) (2, 1) + EQUAL '=' (2, 2) (2, 3) + NAME 'sys' (2, 4) (2, 7) + DOT '.' (2, 7) (2, 8) + NAME 'modules' (2, 8) (2, 15) + LSQB '[' (2, 15) (2, 16) + STRING "'time'" (2, 16) (2, 22) + RSQB ']' (2, 22) (2, 23) + DOT '.' 
(2, 23) (2, 24) + NAME 'time' (2, 24) (2, 28) + LPAR '(' (2, 28) (2, 29) + RPAR ')' (2, 29) (2, 30) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_method(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_tabs(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_async(self): + + self.check_tokenize('async = 1', """\ + NAME 'async' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('a = (async = 1)', """\ + NAME 'a' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + LPAR '(' (1, 4) (1, 5) + NAME 'async' (1, 5) (1, 10) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + """) + + self.check_tokenize('async()', """\ + NAME 'async' (1, 0) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + """) + + self.check_tokenize('class async(Bar):pass', """\ + NAME 'class' (1, 0) (1, 5) + NAME 'async' (1, 6) (1, 11) + LPAR '(' (1, 11) (1, 12) + NAME 'Bar' (1, 12) (1, 15) + RPAR ')' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('class async:pass', """\ + NAME 'class' (1, 0) (1, 5) + NAME 'async' (1, 6) (1, 11) + COLON ':' (1, 11) (1, 12) + NAME 'pass' (1, 12) (1, 16) + """) + + self.check_tokenize('await = 1', """\ + NAME 'await' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('foo.async', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' (1, 3) (1, 4) + NAME 'async' (1, 4) (1, 9) + """) + + self.check_tokenize('async for a in b: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'for' (1, 6) (1, 9) + NAME 'a' (1, 10) (1, 11) + NAME 'in' (1, 12) (1, 14) + NAME 'b' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 18) (1, 22) + """) + + self.check_tokenize('async with a as b: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'with' (1, 6) (1, 10) + NAME 'a' (1, 11) (1, 12) + NAME 'as' (1, 13) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'pass' (1, 19) (1, 23) + """) + + self.check_tokenize('async.foo', """\ + NAME 'async' (1, 0) (1, 5) + DOT '.' (1, 5) (1, 6) + NAME 'foo' (1, 6) (1, 9) + """) + + self.check_tokenize('async', """\ + NAME 'async' (1, 0) (1, 5) + """) + + self.check_tokenize('async\n#comment\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + NAME 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\n...\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + ELLIPSIS '...' 
(2, 0) (2, 3) + NEWLINE '' (2, 3) (2, 3) + NAME 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + NAME 'await' (2, 0) (2, 5) + """) + + self.check_tokenize('foo.async + 1', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' (1, 3) (1, 4) + NAME 'async' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '1' (1, 12) (1, 13) + """) + + self.check_tokenize('async def foo(): pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('''\ +async def foo(): + def foo(await): + await = 1 + if 1: + await +async += 1 +''', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + NAME 'def' (2, 2) (2, 5) + NAME 'foo' (2, 6) (2, 9) + LPAR '(' (2, 9) (2, 10) + NAME 'await' (2, 10) (2, 15) + RPAR ')' (2, 15) (2, 16) + COLON ':' (2, 16) (2, 17) + NEWLINE '' (2, 17) (2, 17) + INDENT '' (3, -1) (3, -1) + NAME 'await' (3, 4) (3, 9) + EQUAL '=' (3, 10) (3, 11) + NUMBER '1' (3, 12) (3, 13) + NEWLINE '' (3, 13) (3, 13) + DEDENT '' (4, -1) (4, -1) + NAME 'if' (4, 2) (4, 4) + NUMBER '1' (4, 5) (4, 6) + COLON ':' (4, 6) (4, 7) + NEWLINE '' (4, 7) (4, 7) + INDENT '' (5, -1) (5, -1) + NAME 'await' (5, 4) (5, 9) + NEWLINE '' (5, 9) (5, 9) + DEDENT '' (6, -1) (6, -1) + DEDENT '' (6, -1) (6, -1) + NAME 'async' (6, 0) (6, 5) + PLUSEQUAL '+=' (6, 6) (6, 8) + NUMBER '1' (6, 9) (6, 10) + NEWLINE '' (6, 10) (6, 10) + """) + + self.check_tokenize('async def foo():\n async for i in 1: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + NAME 'async' (2, 2) (2, 7) + NAME 'for' (2, 8) (2, 11) + NAME 'i' (2, 12) (2, 13) + NAME 'in' (2, 14) (2, 16) + NUMBER '1' (2, 17) (2, 18) + COLON ':' (2, 18) (2, 19) + NAME 'pass' (2, 20) (2, 24) + DEDENT '' (2, -1) (2, -1) + """) + + self.check_tokenize('async def foo(async): await', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + NAME 'async' (1, 14) (1, 19) + RPAR ')' (1, 19) (1, 20) + COLON ':' (1, 20) (1, 21) + NAME 'await' (1, 22) (1, 27) + """) + + self.check_tokenize('''\ +def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'f' (1, 4) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + COLON ':' (1, 7) (1, 8) + NEWLINE '' (1, 8) (1, 8) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + NAME 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + NAME 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + self.check_tokenize('''\ +async def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) 
(1, 9) + NAME 'f' (1, 10) (1, 11) + LPAR '(' (1, 11) (1, 12) + RPAR ')' (1, 12) (1, 13) + COLON ':' (1, 13) (1, 14) + NEWLINE '' (1, 14) (1, 14) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + NAME 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + NAME 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_unicode(self): + + self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ + NAME 'Örter' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + STRING "u'places'" (1, 8) (1, 17) + NEWLINE '' (1, 17) (1, 17) + NAME 'grün' (2, 0) (2, 4) + EQUAL '=' (2, 5) (2, 6) + STRING "U'green'" (2, 7) (2, 15) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_invalid_syntax(self): + def get_tokens(string): + the_string = StringIO(string) + return list(tokenize._generate_tokens_from_c_tokenizer(the_string.readline)) + + for case in [ + "(1+2]", + "(1+2}", + "{1+2]", + "1_", + "1.2_", + "1e2_", + "1e+", + + "\xa0", + "€", + "0b12", + "0b1_2", + "0b2", + "0b1_", + "0b", + "0o18", + "0o1_8", + "0o8", + "0o1_", + "0o", + "0x1_", + "0x", + "1_", + "012", + "1.2_", + "1e2_", + "1e+", + "'sdfsdf", + "'''sdfsdf''", + "("*1000+"a"+")"*1000, + "]", + """\ + f'__{ + x:d + }__'""", + ]: + with self.subTest(case=case): + self.assertRaises(tokenize.TokenError, get_tokens, case) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_max_indent(self): + MAXINDENT = 100 + + def generate_source(indents): + source = ''.join((' ' * x) + 'if True:\n' for x in range(indents)) + source += ' ' * indents + 'pass\n' + return source + + valid = generate_source(MAXINDENT - 1) + the_input = StringIO(valid) + tokens = list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline)) + self.assertEqual(tokens[-2].type, tokenize.DEDENT) + self.assertEqual(tokens[-1].type, tokenize.ENDMARKER) + compile(valid, "", "exec") + + invalid = generate_source(MAXINDENT) + the_input = StringIO(invalid) + self.assertRaises(IndentationError, lambda: list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline))) + self.assertRaises( + IndentationError, compile, invalid, "", "exec" + ) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_continuation_lines_indentation(self): + def get_tokens(string): + the_string = StringIO(string) + return [(kind, string) for (kind, string, *_) + in tokenize._generate_tokens_from_c_tokenizer(the_string.readline)] + + code = dedent(""" + def fib(n): + \\ + '''Print a Fibonacci series up to n.''' + \\ + a, b = 0, 1 + """) + + self.check_tokenize(code, """\ + NAME 'def' (2, 0) (2, 3) + NAME 'fib' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'n' (2, 8) (2, 9) + RPAR ')' (2, 9) (2, 10) + COLON ':' (2, 10) (2, 11) + NEWLINE '' (2, 11) (2, 11) + INDENT '' (4, -1) (4, -1) + STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39) + NEWLINE '' (4, 39) (4, 39) + NAME 'a' (6, 0) (6, 1) + COMMA ',' (6, 1) (6, 2) + NAME 'b' (6, 3) (6, 4) + EQUAL '=' (6, 5) (6, 6) + NUMBER '0' (6, 7) (6, 8) + COMMA ',' (6, 8) (6, 9) + NUMBER '1' (6, 10) (6, 11) + NEWLINE '' (6, 11) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + code_no_cont = 
dedent(""" + def fib(n): + '''Print a Fibonacci series up to n.''' + a, b = 0, 1 + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + pass + \\ + + pass + """) + + self.check_tokenize(code, """\ + NAME 'pass' (2, 0) (2, 4) + NEWLINE '' (2, 4) (2, 4) + NAME 'pass' (5, 0) (5, 4) + NEWLINE '' (5, 4) (5, 4) + """) + + code_no_cont = dedent(""" + pass + pass + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + if x: + y = 1 + \\ + \\ + \\ + \\ + foo = 1 + """) + + self.check_tokenize(code, """\ + NAME 'if' (2, 0) (2, 2) + NAME 'x' (2, 3) (2, 4) + COLON ':' (2, 4) (2, 5) + NEWLINE '' (2, 5) (2, 5) + INDENT '' (3, -1) (3, -1) + NAME 'y' (3, 4) (3, 5) + EQUAL '=' (3, 6) (3, 7) + NUMBER '1' (3, 8) (3, 9) + NEWLINE '' (3, 9) (3, 9) + NAME 'foo' (8, 4) (8, 7) + EQUAL '=' (8, 8) (8, 9) + NUMBER '1' (8, 10) (8, 11) + NEWLINE '' (8, 11) (8, 11) + DEDENT '' (8, -1) (8, -1) + """) + + code_no_cont = dedent(""" + if x: + y = 1 + foo = 1 + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + +class CTokenizerBufferTests(unittest.TestCase): + def test_newline_at_the_end_of_buffer(self): + # See issue 99581: Make sure that if we need to add a new line at the + # end of the buffer, we have enough space in the buffer, specially when + # the current line is as long as the buffer space available. + test_script = f"""\ + #coding: latin-1 + #{"a"*10000} + #{"a"*10002}""" + with os_helper.temp_dir() as temp_dir: + file_name = make_script(temp_dir, 'foo', test_script) + run_test_script(file_name) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_unittest/testmock/testhelpers.py b/Lib/test/test_unittest/testmock/testhelpers.py index 040e28517f..5facac685f 100644 --- a/Lib/test/test_unittest/testmock/testhelpers.py +++ b/Lib/test/test_unittest/testmock/testhelpers.py @@ -930,8 +930,6 @@ def check_data_descriptor(mock_attr): check_data_descriptor(foo.desc) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_autospec_on_bound_builtin_function(self): meth = types.MethodType(time.ctime, time.time()) self.assertIsInstance(meth(), str) diff --git a/Lib/test/tokenizedata/__init__.py b/Lib/test/tokenizedata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Lib/test/tokenizedata/bad_coding.py b/Lib/test/tokenizedata/bad_coding.py new file mode 100644 index 0000000000..971b0a8f3d --- /dev/null +++ b/Lib/test/tokenizedata/bad_coding.py @@ -0,0 +1 @@ +# -*- coding: uft-8 -*- diff --git a/Lib/test/tokenizedata/bad_coding2.py b/Lib/test/tokenizedata/bad_coding2.py new file mode 100644 index 0000000000..bb2bb7e1e7 --- /dev/null +++ b/Lib/test/tokenizedata/bad_coding2.py @@ -0,0 +1,2 @@ +#coding: utf8 +print('我') diff --git a/Lib/test/tokenizedata/badsyntax_3131.py b/Lib/test/tokenizedata/badsyntax_3131.py new file mode 100644 index 0000000000..901d3744ca --- /dev/null +++ b/Lib/test/tokenizedata/badsyntax_3131.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +€ = 2 diff --git a/Lib/test/tokenizedata/badsyntax_pep3120.py b/Lib/test/tokenizedata/badsyntax_pep3120.py new file mode 100644 index 0000000000..d14b4c96ed --- /dev/null +++ b/Lib/test/tokenizedata/badsyntax_pep3120.py @@ -0,0 +1 @@ +print("bse") diff --git a/Lib/test/tokenizedata/coding20731.py b/Lib/test/tokenizedata/coding20731.py new file mode 100644 index 0000000000..b0e227ad11 --- /dev/null +++ b/Lib/test/tokenizedata/coding20731.py @@ -0,0 +1,4 @@ +#coding:latin1 + + + diff --git 
a/Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt similarity index 100% rename from Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt rename to Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt diff --git a/Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt b/Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt similarity index 100% rename from Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt rename to Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt diff --git a/Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt similarity index 100% rename from Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt rename to Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt diff --git a/Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt similarity index 100% rename from Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt rename to Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt diff --git a/Lib/test/tokenize_tests.txt b/Lib/test/tokenizedata/tokenize_tests.txt similarity index 100% rename from Lib/test/tokenize_tests.txt rename to Lib/test/tokenizedata/tokenize_tests.txt diff --git a/Lib/token.py b/Lib/token.py index 493bf04265..54d7cdccad 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -1,7 +1,8 @@ """Token constants.""" -# Auto-generated by Tools/scripts/generate_token.py +# Auto-generated by Tools/build/generate_token.py -__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF'] +__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF', + 'EXACT_TOKEN_TYPES'] ENDMARKER = 0 NAME = 1 @@ -57,17 +58,20 @@ RARROW = 51 ELLIPSIS = 52 COLONEQUAL = 53 -OP = 54 -AWAIT = 55 -ASYNC = 56 -TYPE_IGNORE = 57 -TYPE_COMMENT = 58 +EXCLAMATION = 54 +OP = 55 +TYPE_IGNORE = 56 +TYPE_COMMENT = 57 +SOFT_KEYWORD = 58 +FSTRING_START = 59 +FSTRING_MIDDLE = 60 +FSTRING_END = 61 +COMMENT = 62 +NL = 63 # These aren't used by the C tokenizer but are needed for tokenize.py -ERRORTOKEN = 59 -COMMENT = 60 -NL = 61 -ENCODING = 62 -N_TOKENS = 63 +ERRORTOKEN = 64 +ENCODING = 65 +N_TOKENS = 66 # Special definitions for cooperation with parser NT_OFFSET = 256 @@ -77,6 +81,7 @@ __all__.extend(tok_name.values()) EXACT_TOKEN_TYPES = { + '!': EXCLAMATION, '!=': NOTEQUAL, '%': PERCENT, '%=': PERCENTEQUAL, diff --git a/Lib/tokenize.py b/Lib/tokenize.py index d72968e425..7ca552c4fc 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -24,10 +24,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 'Michael Foord') -try: - from builtins import open as _builtin_open -except ImportError: - pass +from builtins import open as _builtin_open from codecs import lookup, BOM_UTF8 import collections import functools @@ -37,13 +34,14 @@ import sys from token import * from token import EXACT_TOKEN_TYPES +import _tokenize cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", - "untokenize", "TokenInfo"] + 
"untokenize", "TokenInfo", "open", "TokenError"] del token class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): @@ -146,6 +144,7 @@ def _compile(expr): endpats[_prefix + '"'] = Double endpats[_prefix + "'''"] = Single3 endpats[_prefix + '"""'] = Double3 +del _prefix # A set of all of the single and triple quoted string prefixes, # including the opening quotes. @@ -156,13 +155,12 @@ def _compile(expr): single_quoted.add(u) for u in (t + '"""', t + "'''"): triple_quoted.add(u) +del t, u tabsize = 8 class TokenError(Exception): pass -class StopTokenizing(Exception): pass - class Untokenizer: @@ -170,6 +168,8 @@ def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.prev_type = None + self.prev_line = "" self.encoding = None def add_whitespace(self, start): @@ -177,14 +177,51 @@ def add_whitespace(self, start): if row < self.prev_row or row == self.prev_row and col < self.prev_col: raise ValueError("start ({},{}) precedes previous end ({},{})" .format(row, col, self.prev_row, self.prev_col)) - row_offset = row - self.prev_row - if row_offset: - self.tokens.append("\\\n" * row_offset) - self.prev_col = 0 + self.add_backslash_continuation(start) col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + def add_backslash_continuation(self, start): + """Add backslash continuation characters if the row has increased + without encountering a newline token. + + This also inserts the correct amount of whitespace before the backslash. + """ + row = start[0] + row_offset = row - self.prev_row + if row_offset == 0: + return + + newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n' + line = self.prev_line.rstrip('\\\r\n') + ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) + self.tokens.append(ws + f"\\{newline}" * row_offset) + self.prev_col = 0 + + def escape_brackets(self, token): + characters = [] + consume_until_next_bracket = False + for character in token: + if character == "}": + if consume_until_next_bracket: + consume_until_next_bracket = False + else: + characters.append(character) + if character == "{": + n_backslashes = sum( + 1 for char in _itertools.takewhile( + "\\".__eq__, + characters[-2::-1] + ) + ) + if n_backslashes % 2 == 0 or characters[-1] != "N": + characters.append(character) + else: + consume_until_next_bracket = True + characters.append(character) + return "".join(characters) + def untokenize(self, iterable): it = iter(iterable) indents = [] @@ -214,12 +251,22 @@ def untokenize(self, iterable): self.tokens.append(indent) self.prev_col = len(indent) startline = False + elif tok_type == FSTRING_MIDDLE: + if '{' in token or '}' in token: + token = self.escape_brackets(token) + last_line = token.splitlines()[-1] + end_line, end_col = end + extra_chars = last_line.count("{{") + last_line.count("}}") + end = (end_line, end_col + extra_chars) + self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + self.prev_type = tok_type + self.prev_line = line return "".join(self.tokens) def compat(self, token, iterable): @@ -227,6 +274,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False + in_fstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -245,6 +293,10 @@ def compat(self, token, iterable): else: prevstring = False + if toknum == FSTRING_START: + in_fstring += 1 + elif toknum 
== FSTRING_END: + in_fstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -256,7 +308,19 @@ def compat(self, token, iterable): elif startline and indents: toks_append(indents[-1]) startline = False + elif toknum == FSTRING_MIDDLE: + tokval = self.escape_brackets(tokval) + + # Insert a space between two consecutive brackets if we are in an f-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: + tokval = ' ' + tokval + + # Insert a space between two consecutive f-strings + if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") + toks_append(tokval) + self.prev_type = toknum def untokenize(iterable): @@ -268,16 +332,10 @@ def untokenize(iterable): with at least two elements, a token number and token value. If only two tokens are passed, the resulting output is poor. - Round-trip invariant for full input: - Untokenized source will match input source exactly - - Round-trip invariant for limited input: - # Output bytes will tokenize back to the input - t1 = [tok[:2] for tok in tokenize(f.readline)] - newcode = untokenize(t1) - readline = BytesIO(newcode).readline - t2 = [tok[:2] for tok in tokenize(readline)] - assert t1 == t2 + The result is guaranteed to tokenize back to match the input so + that the conversion is lossless and round-trips are assured. + The guarantee applies only to the token type and token string as + the spacing between tokens (column positions) may change. """ ut = Untokenizer() out = ut.untokenize(iterable) @@ -287,7 +345,7 @@ def untokenize(iterable): def _get_normal_name(orig_enc): - """Imitates get_normal_name in tokenizer.c.""" + """Imitates get_normal_name in Parser/tokenizer/helpers.c.""" # Only care about the first 12 characters. enc = orig_enc[:12].lower().replace("_", "-") if enc == "utf-8" or enc.startswith("utf-8-"): @@ -405,7 +463,6 @@ def open(filename): buffer.close() raise - def tokenize(readline): """ The tokenize() generator requires one argument, readline, which @@ -426,193 +483,13 @@ def tokenize(readline): which tells you which encoding was used to decode the bytes stream. """ encoding, consumed = detect_encoding(readline) - empty = _itertools.repeat(b"") - rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) - return _tokenize(rl_gen.__next__, encoding) - - -def _tokenize(readline, encoding): - lnum = parenlev = continued = 0 - numchars = '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - + rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - last_line = b'' - line = b'' - while True: # loop over lines in stream - try: - # We capture the value of the line variable here because - # readline uses the empty string '' to signal end of input, - # hence `line` itself will always be overwritten at the end - # of this loop. 
- last_line = line - line = readline() - except StopIteration: - line = b'' - - if encoding is not None: - line = line.decode(encoding) - lnum += 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield TokenInfo(STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield TokenInfo(ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': - column += 1 - elif line[pos] == '\t': - column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': - column = 0 - else: - break - pos += 1 - if pos == max: - break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - yield TokenInfo(COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - pos += len(comment_token) - - yield TokenInfo(NL, line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line)) - indents = indents[:-1] - - yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = _compile(PseudoToken).match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - if start == end: - continue - token, initial = line[start:end], line[start] - - if (initial in numchars or # ordinary number - (initial == '.' and token != '.' and token != '...')): - yield TokenInfo(NUMBER, token, spos, epos, line) - elif initial in '\r\n': - if parenlev > 0: - yield TokenInfo(NL, token, spos, epos, line) - else: - yield TokenInfo(NEWLINE, token, spos, epos, line) - - elif initial == '#': - assert not token.endswith("\n") - yield TokenInfo(COMMENT, token, spos, epos, line) - - elif token in triple_quoted: - endprog = _compile(endpats[token]) - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield TokenInfo(STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). 
- elif (initial in single_quoted or - token[:2] in single_quoted or - token[:3] in single_quoted): - if token[-1] == '\n': # continued string - strstart = (lnum, start) - # Again, using the first 3 chars of the - # token. This is looking for the matching end - # regex for the correct type of quote - # character. So it's really looking for - # endpats["'"] or endpats['"'], by trying to - # skip string prefix characters, if any. - endprog = _compile(endpats.get(initial) or - endpats.get(token[1]) or - endpats.get(token[2])) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield TokenInfo(STRING, token, spos, epos, line) - - elif initial.isidentifier(): # ordinary name - yield TokenInfo(NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - continued = 1 - else: - if initial in '([{': - parenlev += 1 - elif initial in ')]}': - parenlev -= 1 - yield TokenInfo(OP, token, spos, epos, line) - else: - yield TokenInfo(ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos += 1 - - # Add an implicit NEWLINE if the input doesn't end in one - if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): - yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') - for indent in indents[1:]: # pop remaining indent levels - yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') - yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') - + yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True) def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -620,7 +497,7 @@ def generate_tokens(readline): This has the same API as tokenize(), except that it expects the *readline* callable to return str objects instead of bytes. """ - return _tokenize(readline, None) + return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True) def main(): import argparse @@ -657,7 +534,9 @@ def error(message, filename=None, location=None): tokens = list(tokenize(f.readline)) else: filename = "" - tokens = _tokenize(sys.stdin.readline, None) + tokens = _generate_tokens_from_c_tokenizer( + sys.stdin.readline, extra_tokens=True) + # Output the tokenization for token in tokens: @@ -683,5 +562,31 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise +def _transform_msg(msg): + """Transform error messages from the C tokenizer into the Python tokenize + + The C tokenizer is more picky than the Python one, so we need to massage + the error messages a bit for backwards compatibility. 
+ """ + if "unterminated triple-quoted string literal" in msg: + return "EOF in multi-line string" + return msg + +def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False): + """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" + if encoding is None: + it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens) + else: + it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens) + try: + for info in it: + yield TokenInfo._make(info) + except SyntaxError as e: + if type(e) != SyntaxError: + raise e from None + msg = _transform_msg(e.msg) + raise TokenError(msg, (e.lineno, e.offset)) from None + + if __name__ == "__main__": main() diff --git a/stdlib/Cargo.toml b/stdlib/Cargo.toml index 7f64802d35..4a065bc407 100644 --- a/stdlib/Cargo.toml +++ b/stdlib/Cargo.toml @@ -49,6 +49,12 @@ dyn-clone = "1.0.10" pymath = { workspace = true } xml = "1.0" +# tokenize +ruff_python_ast = { workspace = true } +ruff_python_parser = { workspace = true } +ruff_source_file = { workspace = true } +ruff_text_size = { workspace = true } + # random rand_core = { workspace = true } mt19937 = "3.1" diff --git a/stdlib/src/lib.rs b/stdlib/src/lib.rs index 706ce0ef21..7c0118b177 100644 --- a/stdlib/src/lib.rs +++ b/stdlib/src/lib.rs @@ -50,6 +50,7 @@ mod suggestions; pub mod socket; #[cfg(all(unix, not(target_os = "redox")))] mod syslog; +mod tokenize; mod unicodedata; mod faulthandler; @@ -140,6 +141,7 @@ pub fn get_module_inits() -> impl Iterator, StdlibInit "_random" => random::make_module, "_statistics" => statistics::make_module, "_struct" => pystruct::make_module, + "_tokenize" => tokenize::make_module, "unicodedata" => unicodedata::make_module, "zlib" => zlib::make_module, "_statistics" => statistics::make_module, diff --git a/stdlib/src/tokenize.rs b/stdlib/src/tokenize.rs new file mode 100644 index 0000000000..78102fbece --- /dev/null +++ b/stdlib/src/tokenize.rs @@ -0,0 +1,392 @@ +pub(crate) use _tokenize::make_module; + +#[pymodule] +mod _tokenize { + use crate::{ + common::lock::PyRwLock, + vm::{ + AsObject, Py, PyPayload, PyResult, VirtualMachine, + builtins::{PyBytes, PyStr, PyStrRef, PyTypeRef}, + convert::ToPyObject, + function::ArgCallable, + protocol::PyIterReturn, + types::{Constructor, IterNext, Iterable, SelfIter}, + }, + }; + use ruff_python_ast::PySourceType; + use ruff_python_parser::{ParseError, Token, TokenKind, Tokens, parse_unchecked_source}; + use ruff_source_file::{LineIndex, LineRanges}; + use ruff_text_size::{Ranged, TextRange}; + use std::{cmp::Ordering, fmt}; + + /// Cpython `__import__("token").OP` + const TOKEN_OP: u8 = 55; + + #[pyattr] + #[pyclass(name = "TokenizerIter")] + #[derive(PyPayload)] + pub struct PyTokenizerIter { + readline: ArgCallable, // TODO: This should be PyObject + extra_tokens: bool, + encoding: Option, + state: PyRwLock, + } + + impl PyTokenizerIter { + fn readline(&self, vm: &VirtualMachine) -> PyResult { + // TODO: When `readline` is PyObject, + // we need to check if it's callable and raise a type error if it's not. 
+ let raw_line = match self.readline.invoke((), vm) { + Ok(v) => v, + Err(err) => { + if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) { + return Ok(String::new()); + } + return Err(err); + } + }; + Ok(match &self.encoding { + Some(encoding) => { + let bytes = raw_line + .downcast::() + .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?; + vm.state + .codec_registry + .decode_text(bytes.into(), encoding, None, vm) + .map(|s| s.as_str().to_owned())? + } + None => raw_line + .downcast::() + .map(|s| s.as_str().to_owned()) + .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?, + }) + } + } + + impl fmt::Debug for PyTokenizerIter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PyTokenizerIter") + .field("readline", &self.readline) + .field("encoding", &self.encoding) + .field("extra_tokens", &self.extra_tokens) + .finish() + } + } + + #[pyclass(with(Constructor, Iterable, IterNext))] + impl PyTokenizerIter {} + + impl Constructor for PyTokenizerIter { + type Args = PyTokenizerIterArgs; + + fn py_new(cls: PyTypeRef, args: Self::Args, vm: &VirtualMachine) -> PyResult { + let Self::Args { + readline, + extra_tokens, + encoding, + } = args; + + Self { + readline, + extra_tokens, + encoding: encoding.map(|s| s.as_str().to_owned()), + state: PyRwLock::new(PyTokenizerIterState::default()), + } + .into_ref_with_type(vm, cls) + .map(Into::into) + } + } + + impl SelfIter for PyTokenizerIter {} + + impl IterNext for PyTokenizerIter { + fn next(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut state = { + let guard = zelf.state.read(); + guard.clone() + }; + + if state.eof { + return Ok(PyIterReturn::StopIteration(None)); + } + + let token = loop { + // TODO: Check here for errors. Raise SyntaxError if needed + + if let Some(tok) = state.next_token() { + break tok; + } + + let nline = zelf.readline(vm)?; + if nline.is_empty() { + state.eof = true; + *zelf.state.write() = state.clone(); + + let line_num = &state.start().0; + let out = vm + .ctx + .new_tuple(vec![ + token_kind_value(TokenKind::EndOfFile).to_pyobject(vm), + vm.ctx.new_str("").into(), + vm.ctx + .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) + .into(), + vm.ctx + .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) + .into(), + vm.ctx.new_str(state.current_line()).into(), + ]) + .into(); + return Ok(PyIterReturn::Return(out)); + } + state.push_line(&nline); + }; + + *zelf.state.write() = state.clone(); + + let token_kind = token.kind(); + let token_value = if zelf.extra_tokens && token_kind.is_operator() { + TOKEN_OP + } else { + token_kind_value(token_kind) + }; + let (start_x, start_y) = &state.start(); + let (end_x, end_y) = &state.end(); + + let mut token_repr = &state.source[state.range()]; + if !zelf.extra_tokens { + token_repr = token_repr.trim(); + } + + let out = vm + .ctx + .new_tuple(vec![ + token_value.to_pyobject(vm), + vm.ctx.new_str(token_repr).into(), + vm.ctx + .new_tuple(vec![start_x.to_pyobject(vm), start_y.to_pyobject(vm)]) + .into(), + vm.ctx + .new_tuple(vec![end_x.to_pyobject(vm), end_y.to_pyobject(vm)]) + .into(), + vm.ctx.new_str(state.current_line()).into(), + ]) + .into(); + Ok(PyIterReturn::Return(out)) + } + } + + #[derive(FromArgs)] + pub struct PyTokenizerIterArgs { + #[pyarg(positional)] + readline: ArgCallable, + #[pyarg(named)] + extra_tokens: bool, + #[pyarg(named, optional)] + encoding: Option, + } + + #[derive(Clone, Debug)] + struct PyTokenizerIterState { + /// Source code. 
+ source: String, + prev_token: Option, + /// Tokens of `source`. + tokens: Tokens, + /// Errors of `source` + errors: Vec, + /// LineIndex of `source`. + line_index: LineIndex, + /// Marker that says we already emitted EOF, and needs to stop iterating. + eof: bool, + } + + impl PyTokenizerIterState { + fn push_line(&mut self, line: &str) { + self.source.push_str(line); + + let parsed = parse_unchecked_source(&self.source, PySourceType::Python); + self.tokens = parsed.tokens().clone(); + self.errors = parsed.errors().to_vec(); + self.line_index = LineIndex::from_source_text(&self.source); + } + + #[must_use] + fn current_line(&self) -> &str { + let (kind, range) = match self.prev_token { + Some(token) => token.as_tuple(), + None => (TokenKind::Unknown, TextRange::default()), + }; + + match kind { + TokenKind::Newline => self.source.full_line_str(range.start()), + _ => self.source.full_lines_str(range), + } + } + + #[must_use] + fn next_token(&mut self) -> Option { + for token in self.tokens.iter() { + let (kind, range) = token.as_tuple(); + + if matches!(kind, TokenKind::NonLogicalNewline) { + continue; + } + + if matches!(range.ordering(self.range()), Ordering::Greater) { + self.prev_token = Some(*token); + return self.prev_token; + } + } + + None + } + + #[must_use] + fn range(&self) -> TextRange { + match self.prev_token { + Some(token) => token.range(), + None => TextRange::default(), + } + } + + #[must_use] + fn start(&self) -> (usize, usize) { + let lc = self + .line_index + .line_column(self.range().start(), &self.source); + (lc.line.get(), lc.column.to_zero_indexed()) + } + + #[must_use] + fn end(&self) -> (usize, usize) { + let lc = self + .line_index + .line_column(self.range().end(), &self.source); + (lc.line.get(), lc.column.to_zero_indexed()) + } + } + + impl Default for PyTokenizerIterState { + fn default() -> Self { + const SOURCE: &str = ""; + let parsed = parse_unchecked_source(SOURCE, PySourceType::Python); + + Self { + source: SOURCE.to_owned(), + prev_token: None, + tokens: parsed.tokens().clone(), + errors: parsed.errors().to_vec(), + line_index: LineIndex::from_source_text(SOURCE), + eof: false, + } + } + } + + const fn token_kind_value(kind: TokenKind) -> u8 { + match kind { + TokenKind::EndOfFile => 0, + TokenKind::Name + | TokenKind::For + | TokenKind::In + | TokenKind::Pass + | TokenKind::Class + | TokenKind::And + | TokenKind::Is + | TokenKind::Raise + | TokenKind::True + | TokenKind::False + | TokenKind::Assert + | TokenKind::Try + | TokenKind::While + | TokenKind::Yield + | TokenKind::Lambda + | TokenKind::None + | TokenKind::Not + | TokenKind::Or + | TokenKind::Break + | TokenKind::Continue + | TokenKind::Global + | TokenKind::Nonlocal + | TokenKind::Return + | TokenKind::Except + | TokenKind::Import + | TokenKind::Case + | TokenKind::Match + | TokenKind::Type + | TokenKind::Await + | TokenKind::With + | TokenKind::Del + | TokenKind::Finally + | TokenKind::From + | TokenKind::Def + | TokenKind::If + | TokenKind::Else + | TokenKind::Elif + | TokenKind::As + | TokenKind::Async => 1, + TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2, + TokenKind::String => 3, + TokenKind::Newline | TokenKind::NonLogicalNewline => 4, + TokenKind::Indent => 5, + TokenKind::Dedent => 6, + TokenKind::Lpar => 7, + TokenKind::Rpar => 8, + TokenKind::Lsqb => 9, + TokenKind::Rsqb => 10, + TokenKind::Colon => 11, + TokenKind::Comma => 12, + TokenKind::Semi => 13, + TokenKind::Plus => 14, + TokenKind::Minus => 15, + TokenKind::Star => 16, + TokenKind::Slash => 17, + 
TokenKind::Vbar => 18, + TokenKind::Amper => 19, + TokenKind::Less => 20, + TokenKind::Greater => 21, + TokenKind::Equal => 22, + TokenKind::Dot => 23, + TokenKind::Percent => 24, + TokenKind::Lbrace => 25, + TokenKind::Rbrace => 26, + TokenKind::EqEqual => 27, + TokenKind::NotEqual => 28, + TokenKind::LessEqual => 29, + TokenKind::GreaterEqual => 30, + TokenKind::Tilde => 31, + TokenKind::CircumFlex => 32, + TokenKind::LeftShift => 33, + TokenKind::RightShift => 34, + TokenKind::DoubleStar => 35, + TokenKind::PlusEqual => 36, + TokenKind::MinusEqual => 37, + TokenKind::StarEqual => 38, + TokenKind::SlashEqual => 39, + TokenKind::PercentEqual => 40, + TokenKind::AmperEqual => 41, + TokenKind::VbarEqual => 42, + TokenKind::CircumflexEqual => 43, + TokenKind::LeftShiftEqual => 44, + TokenKind::RightShiftEqual => 45, + TokenKind::DoubleStarEqual => 46, + TokenKind::DoubleSlash => 47, + TokenKind::DoubleSlashEqual => 48, + TokenKind::At => 49, + TokenKind::AtEqual => 50, + TokenKind::Rarrow => 51, + TokenKind::Ellipsis => 52, + TokenKind::ColonEqual => 53, + TokenKind::Exclamation => 54, + TokenKind::FStringStart => 59, + TokenKind::FStringMiddle => 60, + TokenKind::FStringEnd => 61, + TokenKind::Comment => 62, + TokenKind::TStringStart => 62, // 3.14 compatible + TokenKind::TStringMiddle => 63, // 3.14 compatible + TokenKind::TStringEnd => 64, // 3.14 compatible + TokenKind::IpyEscapeCommand | TokenKind::Question => 0, // Ruff's specific + TokenKind::Unknown => 0, + } + } +}
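
As context for the rewritten untokenize() docstring in this patch (round-trips are guaranteed only for token type and token string, not for column positions), the following is a minimal sketch of that invariant using just the public tokenize API. The sample source string is arbitrary and not taken from the patch; behaviour is as documented for CPython's tokenize module, which this diff tracks.

# Sketch of the round-trip invariant stated in the new untokenize() docstring:
# token types and strings survive the round trip, spacing/columns may not.
import io
import tokenize

source = b"x = 1 +    2\n"          # arbitrary sample input (extra spaces on purpose)

t1 = [tok[:2] for tok in tokenize.tokenize(io.BytesIO(source).readline)]
rebuilt = tokenize.untokenize(t1)   # 2-tuples take the compat() path; returns bytes
t2 = [tok[:2] for tok in tokenize.tokenize(io.BytesIO(rebuilt).readline)]

assert t1 == t2                     # types and strings match even though spacing changed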
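
And a small, hypothetical usage sketch of the private entry point that the new Rust `_tokenize.TokenizerIter` backs, mirroring how the CTokenizeTest cases above drive it. `_generate_tokens_from_c_tokenizer` is internal API; the call shape (readline callable plus `extra_tokens`) is assumed to follow the tokenize.py changes in this diff rather than any stable public contract.

# Hypothetical driver for the internal C-tokenizer path added in this patch.
# Mirrors CTokenizeTest.check_tokenize: feed a readline callable, get TokenInfo tuples.
from io import StringIO
import tokenize

src = StringIO("0xff <= 255\n")
for tok in tokenize._generate_tokens_from_c_tokenizer(src.readline, extra_tokens=True):
    # tok is a TokenInfo: (type, string, start, end, line)
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)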