diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..9ffaa00773 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,5 @@ +[submodule "submodules/cpython-v3.13.9"] + path = submodules/cpython-v3.13.9 + url = https://github.com/python/cpython.git + branch = 3.13 + shallow = true diff --git a/crates/compiler-core/generate.py b/crates/compiler-core/generate.py new file mode 100644 index 0000000000..5fe002ff10 --- /dev/null +++ b/crates/compiler-core/generate.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python +from __future__ import annotations + +import dataclasses +import enum +import functools +import io +import pathlib +import subprocess +import sys +import typing + +import tomllib + +if typing.TYPE_CHECKING: + from collections.abc import Iterator + +CPYTHON_VERSION = "v3.13.9" + + +CRATE_ROOT = pathlib.Path(__file__).parent +CONF_FILE = CRATE_ROOT / "instructions.toml" +OUT_FILE = CRATE_ROOT / "src" / "bytecode" / "instruction.rs" + +ROOT = CRATE_ROOT.parents[1] +SUBMODULES = ROOT / "submodules" +CPYTHON_DIR = SUBMODULES / f"cpython-{CPYTHON_VERSION}" +CPYTHON_TOOLS_DIR = CPYTHON_DIR / "Tools" / "cases_generator" +DIS_DOC = CPYTHON_DIR / "Doc" / "library" / "dis.rst" + +sys.path.append(CPYTHON_TOOLS_DIR.as_posix()) + +import analyzer +from generators_common import DEFAULT_INPUT + +U8_MAX = 255 + + +@dataclasses.dataclass(frozen=True, slots=True) +class OpargMetadata: + name: str | None = None + typ: str | None = None + + +@dataclasses.dataclass(slots=True) +class InstructionOverride: + enabled: bool = True + name: str | None = None + oparg: OpargMetadata = dataclasses.field(default_factory=OpargMetadata) + properties: analyzer.Properties | None = None + + def __post_init__(self): + if isinstance(self.oparg, dict): + self.oparg = OpargMetadata(**self.oparg) + + if isinstance(self.properties, dict): + self.properties = dataclasses.replace( + analyzer.SKIP_PROPERTIES, **self.properties + ) + + +@dataclasses.dataclass(slots=True) +class Instruction: + # TODO: Maybe add a post_init hook to show warning incase of oparg being set for + # instructions with no oparg? + instruction: analyzer.Instruction | analyzer.PseudoInstruction + override: InstructionOverride = dataclasses.field( + default_factory=InstructionOverride + ) + + @property + def rust_name(self) -> str: + return self.override.name or snake_case_to_pascal_case(self.instruction.name) + + @property + def rust_enum_variant(self) -> str: + if self.properties.oparg: + fields = f"{{ {self.oparg_name}: Arg<{self.oparg_typ}> }}" + else: + fields = "" + + return f"{self.rust_name} {fields} = {self.instruction.opcode}" + + @property + def properties(self) -> analyzer.Properties: + return self.override.properties or self.instruction.properties + + @property + def oparg_name(self) -> str | None: + if name := self.override.oparg.name: + return name + + if not self.properties.oparg: + return None + + oparg_names_map = build_oparg_names_map() + if name := oparg_names_map.get(self.instruction.name): + return name + + return self._oparg.field_name + + @property + def oparg_typ(self) -> str | None: + if typ := self.override.oparg.typ: + return typ + + properties = self.properties + if not properties.oparg: + return None + + try: + return self._oparg.name + except ValueError: + return "u32" # Fallback + + @property + def _oparg(self) -> Oparg: + try: + return Oparg.try_from_properties(self.properties) + except ValueError as err: + err.add_note(self.instruction.name) + raise err + + @classmethod + def from_analysis( + cls, analysis: analyzer.Analysis, overrides: dict[str, dict] + ) -> Iterator[typing.Self]: + insts = {} + for name, inst in analysis.instructions.items(): + override = InstructionOverride(**overrides.get(name, {})) + if not override.enabled: + continue + + opcode = inst.opcode + insts[opcode] = cls(inst, override) + + # Because we are treating pseudos like real opcodes, + # we need to find an alternative opcode for them (they go over u8::MAX) + occupied = set(insts) + for opcode, inst in insts.items(): + if opcode <= U8_MAX: + continue + + # Preserve `HAVE_ARG` semantics. + if inst.properties.oparg: + rang = range(analysis.have_arg, U8_MAX + 1) + else: + rang = range(0, analysis.have_arg) + + new_opcode = next(i for i in rang if i not in occupied) + occupied.add(new_opcode) + inst.instruction.opcode = new_opcode + + yield from insts.values() + + +@enum.unique +class Oparg(enum.Enum): + Label = enum.auto() + NameIdx = enum.auto() + + @property + def field_name(self) -> str: + match self: + case self.Label: + return "target" + case self.NameIdx: + return "namei" + + @classmethod + def try_from_properties(cls, properties: analyzer.Properties) -> typing.Self: + # TODO: `properties.uses_co_consts` -> `ConstIdx` + # TODO: `properties.uses_locals` -> `LocalIdx` + + if properties.uses_co_names: + return cls.NameIdx + elif properties.jumps: + return cls.Label + else: + raise ValueError(f"Could not detect oparg type of {properties}") + + +@functools.cache +def build_oparg_names_map() -> dict[str, str]: + doc = DIS_DOC.read_text() + + out = {} + for line in doc.splitlines(): + if not line.startswith(".. opcode:: "): + continue + + # At this point `line` would look something like: + # + # `.. opcode:: OPCODE_NAME` + # or + # `.. opcode:: OPCODE_NAME (oparg_name)` + # + # We only care about the later. + + parts = line.split() + if len(parts) != 4: + continue + + _, _, cpython_name, oparg = parts + out[cpython_name] = oparg.removeprefix("(").removesuffix(")") + + return out + + +def snake_case_to_pascal_case(name: str) -> str: + return name.title().replace("_", "") + + +def rustfmt(code: str) -> str: + return subprocess.check_output(["rustfmt", "--emit=stdout"], input=code, text=True) + + +def get_analysis() -> analyser.Analysis: + analysis = analyzer.analyze_files([DEFAULT_INPUT]) + + # We don't differentiate between real and pseudos yet + analysis.instructions |= analysis.pseudos + return analysis + + +def write_enum(outfile: typing.IO, instructions: list[Instruction]) -> None: + variants = ",\n".join(inst.rust_enum_variant for inst in instructions) + outfile.write( + f""" + /// A Single bytecode instruction. + #[repr(u8)] + #[derive(Clone, Copy, Debug, Eq, PartialEq)] + pub enum Instruction {{ + {variants} + }} + """ + ) + + +def main(): + analysis = get_analysis() + conf = tomllib.loads(CONF_FILE.read_text()) + overrides = conf["overrides"] + + instructions = sorted( + Instruction.from_analysis(analysis, overrides), key=lambda inst: inst.rust_name + ) + + outfile = io.StringIO() + write_enum(outfile, instructions) + + generated = outfile.getvalue() + + imports = ",".join( + { + inst.oparg_typ + for inst in instructions + if ((inst.oparg_typ is not None) and (inst.oparg_typ != "u32")) + } + ) + script_path = pathlib.Path(__file__).resolve().relative_to(ROOT).as_posix() + output = rustfmt( + f""" + // This file is generated by {script_path} + // Do not edit! + + use crate::bytecode::{{Arg, {imports}}}; + + {generated} + """ + ) + OUT_FILE.write_text(output) + + +if __name__ == "__main__": + main() diff --git a/crates/compiler-core/instructions.toml b/crates/compiler-core/instructions.toml new file mode 100644 index 0000000000..aa2ba26ccf --- /dev/null +++ b/crates/compiler-core/instructions.toml @@ -0,0 +1,128 @@ +# This file is used by `generate.py` to autogenerate CPython opcodes. +# +# The script will try to detect the properties for each opcode, but it can be useful to override some proerties. +# So Ideally, we want to have this file as empty as possible, +# because doing so means that we are getting closer to be fully aligned with CPython's opcodes. +# +# enabled : bool +# When false the opcode will not appear as a variant. +# name : str +# Sets the variant name. +# oparg : dict +# Sets oparg metadata. +# name : str +# Field name for the variant's oparg (`Foo { name: ... }`). +# typ : str +# Type of the variant's oparg (`Foo { i: typ }`). +# properties : dict +# Dict corresponding to https://github.com/python/cpython/blob/8183fa5e3f78ca6ab862de7fb8b14f3d929421e0/Tools/cases_generator/analyzer.py#L8-L29 + +[overrides] +BINARY_OP = { oparg = { typ = "BinaryOperator" } } +CALL_INTRINSIC_1 = { oparg = { name = "func", typ = "IntrinsicFunction1" } } +CALL_INTRINSIC_2 = { oparg = { name = "func", typ = "IntrinsicFunction2" } } +CONTAINS_OP = { oparg = { typ = "Invert" } } +GET_AWAITABLE = { oparg = { name = "r#where" } } # `where` is a rust keyword +IS_OP = { oparg = { typ = "Invert" } } +RAISE_VARARGS = { name = "RaiseVarArgs" } +RESUME = { oparg = { typ = "ResumeKind" } } +YIELD_VALUE = { properties = { oparg = false } } # 3.13 changed this to have an oparg + +# Disabled (not implemented) + +BINARY_OP_ADD_FLOAT = { enabled = false } +BINARY_OP_ADD_INT = { enabled = false } +BINARY_OP_ADD_UNICODE = { enabled = false } +BINARY_OP_INPLACE_ADD_UNICODE = { enabled = false } +BINARY_OP_MULTIPLY_FLOAT = { enabled = false } +BINARY_OP_MULTIPLY_INT = { enabled = false } +BINARY_OP_SUBTRACT_FLOAT = { enabled = false } +BINARY_OP_SUBTRACT_INT = { enabled = false } +BINARY_SUBSCR_DICT = { enabled = false } +BINARY_SUBSCR_GETITEM = { enabled = false } +BINARY_SUBSCR_LIST_INT = { enabled = false } +BINARY_SUBSCR_STR_INT = { enabled = false } +BINARY_SUBSCR_TUPLE_INT = { enabled = false } +BUILD_CONST_KEY_MAP = { enabled = false } +CACHE = { enabled = false } +CALL_ALLOC_AND_ENTER_INIT = { enabled = false } +CALL_BOUND_METHOD_EXACT_ARGS = { enabled = false } +CALL_BOUND_METHOD_GENERAL = { enabled = false } +CALL_BUILTIN_CLASS = { enabled = false } +CALL_BUILTIN_FAST = { enabled = false } +CALL_BUILTIN_FAST_WITH_KEYWORDS = { enabled = false } +CALL_BUILTIN_O = { enabled = false } +CALL_ISINSTANCE = { enabled = false } +CALL_LEN = { enabled = false } +CALL_LIST_APPEND = { enabled = false } +CALL_METHOD_DESCRIPTOR_FAST = { enabled = false } +CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS = { enabled = false } +CALL_METHOD_DESCRIPTOR_NOARGS = { enabled = false } +CALL_METHOD_DESCRIPTOR_O = { enabled = false } +CALL_NON_PY_GENERAL = { enabled = false } +CALL_PY_EXACT_ARGS = { enabled = false } +CALL_PY_GENERAL = { enabled = false } +CALL_STR_1 = { enabled = false } +CALL_TUPLE_1 = { enabled = false } +CALL_TYPE_1 = { enabled = false } +COMPARE_OP_FLOAT = { enabled = false } +COMPARE_OP_INT = { enabled = false } +COMPARE_OP_STR = { enabled = false } +CONTAINS_OP_DICT = { enabled = false } +CONTAINS_OP_SET = { enabled = false } +ENTER_EXECUTOR = { enabled = false } +FOR_ITER_GEN = { enabled = false } +FOR_ITER_LIST = { enabled = false } +FOR_ITER_RANGE = { enabled = false } +FOR_ITER_TUPLE = { enabled = false } +INSTRUMENTED_CALL = { enabled = false } +INSTRUMENTED_CALL_FUNCTION_EX = { enabled = false } +INSTRUMENTED_CALL_KW = { enabled = false } +INSTRUMENTED_END_FOR = { enabled = false } +INSTRUMENTED_END_SEND = { enabled = false } +INSTRUMENTED_FOR_ITER = { enabled = false } +INSTRUMENTED_INSTRUCTION = { enabled = false } +INSTRUMENTED_JUMP_BACKWARD = { enabled = false } +INSTRUMENTED_JUMP_FORWARD = { enabled = false } +INSTRUMENTED_LINE = { enabled = false } +INSTRUMENTED_LOAD_SUPER_ATTR = { enabled = false } +INSTRUMENTED_POP_JUMP_IF_FALSE = { enabled = false } +INSTRUMENTED_POP_JUMP_IF_NONE = { enabled = false } +INSTRUMENTED_POP_JUMP_IF_NOT_NONE = { enabled = false } +INSTRUMENTED_POP_JUMP_IF_TRUE = { enabled = false } +INSTRUMENTED_RESUME = { enabled = false } +INSTRUMENTED_RETURN_CONST = { enabled = false } +INSTRUMENTED_RETURN_VALUE = { enabled = false } +INSTRUMENTED_YIELD_VALUE = { enabled = false } +LOAD_ASSERTION_ERROR = { enabled = false } +LOAD_ATTR_CLASS = { enabled = false } +LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN = { enabled = false } +LOAD_ATTR_INSTANCE_VALUE = { enabled = false } +LOAD_ATTR_METHOD_LAZY_DICT = { enabled = false } +LOAD_ATTR_METHOD_NO_DICT = { enabled = false } +LOAD_ATTR_METHOD_WITH_VALUES = { enabled = false } +LOAD_ATTR_MODULE = { enabled = false } +LOAD_ATTR_NONDESCRIPTOR_NO_DICT = { enabled = false } +LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES = { enabled = false } +LOAD_ATTR_PROPERTY = { enabled = false } +LOAD_ATTR_SLOT = { enabled = false } +LOAD_ATTR_WITH_HINT = { enabled = false } +LOAD_GLOBAL_BUILTIN = { enabled = false } +LOAD_GLOBAL_MODULE = { enabled = false } +LOAD_SUPER_ATTR_ATTR = { enabled = false } +LOAD_SUPER_ATTR_METHOD = { enabled = false } +PUSH_EXC_INFO = { enabled = false } +PUSH_NULL = { enabled = false } +RERAISE = { enabled = false } +RESUME_CHECK = { enabled = false } +SEND_GEN = { enabled = false } +STORE_ATTR_INSTANCE_VALUE = { enabled = false } +STORE_ATTR_SLOT = { enabled = false } +STORE_ATTR_WITH_HINT = { enabled = false } +STORE_FAST_MAYBE_NULL = { enabled = false } +STORE_SUBSCR_DICT = { enabled = false } +STORE_SUBSCR_LIST_INT = { enabled = false } +UNPACK_SEQUENCE_LIST = { enabled = false } +UNPACK_SEQUENCE_TUPLE = { enabled = false } +UNPACK_SEQUENCE_TWO_TUPLE = { enabled = false } +WITH_EXCEPT_START = { enabled = false } diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index 8a095de611..491f95c0f2 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -12,6 +12,10 @@ use num_complex::Complex64; use rustpython_wtf8::{Wtf8, Wtf8Buf}; use std::{collections::BTreeSet, fmt, hash, marker::PhantomData, mem, num::NonZeroU8, ops::Deref}; +pub use crate::bytecode::instruction::Instruction; + +mod instruction; + #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] #[repr(i8)] #[allow(clippy::cast_possible_wrap)] @@ -545,290 +549,6 @@ op_arg_enum!( pub type NameIdx = u32; -/// A Single bytecode instruction. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[repr(u8)] -pub enum Instruction { - Nop, - /// Importing by name - ImportName { - idx: Arg, - }, - /// Importing without name - ImportNameless, - /// from ... import ... - ImportFrom { - idx: Arg, - }, - LoadFast(Arg), - LoadNameAny(Arg), - LoadGlobal(Arg), - LoadDeref(Arg), - LoadClassDeref(Arg), - StoreFast(Arg), - StoreLocal(Arg), - StoreGlobal(Arg), - StoreDeref(Arg), - DeleteFast(Arg), - DeleteLocal(Arg), - DeleteGlobal(Arg), - DeleteDeref(Arg), - LoadClosure(Arg), - Subscript, - StoreSubscript, - DeleteSubscript, - /// Performs `is` comparison, or `is not` if `invert` is 1. - IsOp(Arg), - /// Performs `in` comparison, or `not in` if `invert` is 1. - ContainsOp(Arg), - StoreAttr { - idx: Arg, - }, - DeleteAttr { - idx: Arg, - }, - LoadConst { - /// index into constants vec - idx: Arg, - }, - UnaryOperation { - op: Arg, - }, - BinaryOperation { - op: Arg, - }, - BinaryOperationInplace { - op: Arg, - }, - BinarySubscript, - LoadAttr { - idx: Arg, - }, - CompareOperation { - op: Arg, - }, - CopyItem { - index: Arg, - }, - Pop, - Swap { - index: Arg, - }, - ToBool, - GetIter, - GetLen, - CallIntrinsic1 { - func: Arg, - }, - CallIntrinsic2 { - func: Arg, - }, - Continue { - target: Arg