🌐 AI搜索 & 代理 主页
Skip to content

Commit 572c780

Browse files
authored
gh-138122: Implement frame caching in RemoteUnwinder to reduce memory reads (#142137)
This PR implements frame caching in the RemoteUnwinder class to significantly reduce memory reads when profiling remote processes with deep call stacks. When cache_frames=True, the unwinder stores the frame chain from each sample and reuses unchanged portions in subsequent samples. Since most profiling samples capture similar call stacks (especially the parent frames), this optimization avoids repeatedly reading the same frame data from the target process. The implementation adds a last_profiled_frame field to the thread state that tracks where the previous sample stopped. On the next sample, if the current frame chain reaches this marker, the cached frames from that point onward are reused instead of being re-read from remote memory. The sampling profiler now enables frame caching by default.
1 parent 332da62 commit 572c780

24 files changed

+1855
-142
lines changed

Include/cpython/pystate.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ struct _ts {
135135
/* Pointer to currently executing frame. */
136136
struct _PyInterpreterFrame *current_frame;
137137

138+
struct _PyInterpreterFrame *last_profiled_frame;
139+
138140
Py_tracefunc c_profilefunc;
139141
Py_tracefunc c_tracefunc;
140142
PyObject *c_profileobj;

Include/internal/pycore_debug_offsets.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
102102
uint64_t next;
103103
uint64_t interp;
104104
uint64_t current_frame;
105+
uint64_t last_profiled_frame;
105106
uint64_t thread_id;
106107
uint64_t native_thread_id;
107108
uint64_t datastack_chunk;
@@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets {
272273
.next = offsetof(PyThreadState, next), \
273274
.interp = offsetof(PyThreadState, interp), \
274275
.current_frame = offsetof(PyThreadState, current_frame), \
276+
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
275277
.thread_id = offsetof(PyThreadState, thread_id), \
276278
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
277279
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_global_strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ struct _Py_global_strings {
332332
STRUCT_FOR_ID(c_parameter_type)
333333
STRUCT_FOR_ID(c_return)
334334
STRUCT_FOR_ID(c_stack)
335+
STRUCT_FOR_ID(cache_frames)
335336
STRUCT_FOR_ID(cached_datetime_module)
336337
STRUCT_FOR_ID(cached_statements)
337338
STRUCT_FOR_ID(cadata)
@@ -776,6 +777,7 @@ struct _Py_global_strings {
776777
STRUCT_FOR_ID(stacklevel)
777778
STRUCT_FOR_ID(start)
778779
STRUCT_FOR_ID(statement)
780+
STRUCT_FOR_ID(stats)
779781
STRUCT_FOR_ID(status)
780782
STRUCT_FOR_ID(stderr)
781783
STRUCT_FOR_ID(stdin)

Include/internal/pycore_runtime_init_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

InternalDocs/frames.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,26 @@ The shim frame points to a special code object containing the `INTERPRETER_EXIT`
111111
instruction which cleans up the shim frame and returns.
112112

113113

114+
### Remote Profiling Frame Cache
115+
116+
The `last_profiled_frame` field in `PyThreadState` supports an optimization for
117+
remote profilers that sample call stacks from external processes. When a remote
118+
profiler reads the call stack, it writes the current frame address to this field.
119+
The eval loop then keeps this pointer valid by updating it to the parent frame
120+
whenever a frame returns (in `_PyEval_FrameClearAndPop`).
121+
122+
This creates a "high-water mark" that always points to a frame still on the stack.
123+
On subsequent samples, the profiler can walk from `current_frame` until it reaches
124+
`last_profiled_frame`, knowing that frames from that point downward are unchanged
125+
and can be retrieved from a cache. This significantly reduces the amount of remote
126+
memory reads needed when call stacks are deep and stable at their base.
127+
128+
The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
129+
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
130+
prevents transient frames (called and returned between profiler samples) from
131+
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
132+
133+
114134
### The Instruction Pointer
115135

116136
`_PyInterpreterFrame` has two fields which are used to maintain the instruction

Lib/profiling/sampling/sample.py

Lines changed: 102 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,24 @@
2727

2828

2929
class SampleProfiler:
30-
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True):
30+
def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False):
3131
self.pid = pid
3232
self.sample_interval_usec = sample_interval_usec
3333
self.all_threads = all_threads
3434
self.mode = mode # Store mode for later use
35+
self.collect_stats = collect_stats
3536
if _FREE_THREADED_BUILD:
3637
self.unwinder = _remote_debugging.RemoteUnwinder(
3738
self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc,
38-
skip_non_matching_threads=skip_non_matching_threads
39+
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
40+
stats=collect_stats
3941
)
4042
else:
4143
only_active_threads = bool(self.all_threads)
4244
self.unwinder = _remote_debugging.RemoteUnwinder(
4345
self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc,
44-
skip_non_matching_threads=skip_non_matching_threads
46+
skip_non_matching_threads=skip_non_matching_threads, cache_frames=True,
47+
stats=collect_stats
4548
)
4649
# Track sample intervals and total sample count
4750
self.sample_intervals = deque(maxlen=100)
@@ -129,6 +132,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False):
129132
print(f"Sample rate: {sample_rate:.2f} samples/sec")
130133
print(f"Error rate: {error_rate:.2f}%")
131134

135+
# Print unwinder stats if stats collection is enabled
136+
if self.collect_stats:
137+
self._print_unwinder_stats()
138+
132139
# Pass stats to flamegraph collector if it's the right type
133140
if hasattr(collector, 'set_stats'):
134141
collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@@ -176,17 +183,100 @@ def _print_realtime_stats(self):
176183
(1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0
177184
) # Max time = Min Hz
178185

186+
# Build cache stats string if stats collection is enabled
187+
cache_stats_str = ""
188+
if self.collect_stats:
189+
try:
190+
stats = self.unwinder.get_stats()
191+
hits = stats.get('frame_cache_hits', 0)
192+
partial = stats.get('frame_cache_partial_hits', 0)
193+
misses = stats.get('frame_cache_misses', 0)
194+
total = hits + partial + misses
195+
if total > 0:
196+
hit_pct = (hits + partial) / total * 100
197+
cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}"
198+
except RuntimeError:
199+
pass
200+
179201
# Clear line and print stats
180202
print(
181-
f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} "
182-
f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} "
183-
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} "
184-
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} "
185-
f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}",
203+
f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} "
204+
f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} "
205+
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} "
206+
f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} "
207+
f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}"
208+
f"{cache_stats_str}",
186209
end="",
187210
flush=True,
188211
)
189212

213+
def _print_unwinder_stats(self):
214+
"""Print unwinder statistics including cache performance."""
215+
try:
216+
stats = self.unwinder.get_stats()
217+
except RuntimeError:
218+
return # Stats not enabled
219+
220+
print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}")
221+
print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}")
222+
223+
# Frame cache stats
224+
total_samples = stats.get('total_samples', 0)
225+
frame_cache_hits = stats.get('frame_cache_hits', 0)
226+
frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0)
227+
frame_cache_misses = stats.get('frame_cache_misses', 0)
228+
total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses
229+
230+
# Calculate percentages
231+
hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0
232+
partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0
233+
misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0
234+
235+
print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}")
236+
print(f" Total samples: {total_samples:,}")
237+
print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})")
238+
print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})")
239+
print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})")
240+
241+
# Frame read stats
242+
frames_from_cache = stats.get('frames_read_from_cache', 0)
243+
frames_from_memory = stats.get('frames_read_from_memory', 0)
244+
total_frames = frames_from_cache + frames_from_memory
245+
cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0
246+
memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0
247+
248+
print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}")
249+
print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})")
250+
print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})")
251+
252+
# Code object cache stats
253+
code_hits = stats.get('code_object_cache_hits', 0)
254+
code_misses = stats.get('code_object_cache_misses', 0)
255+
total_code = code_hits + code_misses
256+
code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0
257+
code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0
258+
259+
print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}")
260+
print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})")
261+
print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})")
262+
263+
# Memory operations
264+
memory_reads = stats.get('memory_reads', 0)
265+
memory_bytes = stats.get('memory_bytes_read', 0)
266+
if memory_bytes >= 1024 * 1024:
267+
memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB"
268+
elif memory_bytes >= 1024:
269+
memory_str = f"{memory_bytes / 1024:.1f} KB"
270+
else:
271+
memory_str = f"{memory_bytes} B"
272+
print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}")
273+
print(f" Read operations: {memory_reads:,} ({memory_str})")
274+
275+
# Stale invalidations
276+
stale_invalidations = stats.get('stale_cache_invalidations', 0)
277+
if stale_invalidations > 0:
278+
print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
279+
190280

191281
def sample(
192282
pid,
@@ -234,7 +324,8 @@ def sample(
234324
mode=mode,
235325
native=native,
236326
gc=gc,
237-
skip_non_matching_threads=skip_non_matching_threads
327+
skip_non_matching_threads=skip_non_matching_threads,
328+
collect_stats=realtime_stats,
238329
)
239330
profiler.realtime_stats = realtime_stats
240331

@@ -290,7 +381,8 @@ def sample_live(
290381
mode=mode,
291382
native=native,
292383
gc=gc,
293-
skip_non_matching_threads=skip_non_matching_threads
384+
skip_non_matching_threads=skip_non_matching_threads,
385+
collect_stats=realtime_stats,
294386
)
295387
profiler.realtime_stats = realtime_stats
296388

0 commit comments

Comments
 (0)