diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index a9d97e47e005df..25c8ec13f5d562 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -55,14 +55,14 @@ typedef struct _err_stackitem { } _PyErr_StackItem; typedef struct _stack_chunk { - struct _stack_chunk *previous; size_t size; - size_t top; - PyObject * data[1]; /* Variable sized */ + struct _stack_chunk *previous; + PyObject *data[1]; /* Variable sized */ } _PyStackChunk; -/* Minimum size of data stack chunk */ -#define _PY_DATA_STACK_CHUNK_SIZE (16*1024) +#define _PY_STACK_CHUNK_MIN_SIZE 4096 +#define _PY_STACK_CHUNK_OVERHEADS (offsetof(_PyStackChunk, data)) + struct _ts { /* See Python/ceval.c for comments explaining most fields */ @@ -195,10 +195,9 @@ struct _ts { /* Unique thread state id. */ uint64_t id; - _PyStackChunk *datastack_chunk; - PyObject **datastack_top; - PyObject **datastack_limit; - _PyStackChunk *datastack_cached_chunk; + _PyStackChunk *stack_chunk_list; + PyObject **stack_top; + PyObject **stack_limit; /* XXX signal handlers should also be here */ /* The following fields are here to avoid allocation during init. diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 18490f98a918a7..81ed4f0273fe20 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -106,7 +106,7 @@ typedef struct _Py_DebugOffsets { uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; - uint64_t datastack_chunk; + uint64_t stack_chunk_list; uint64_t status; uint64_t holds_gil; uint64_t gil_requested; @@ -296,7 +296,7 @@ typedef struct _Py_DebugOffsets { .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ - .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ + .stack_chunk_list = offsetof(PyThreadState, stack_chunk_list), \ .status = offsetof(PyThreadState, _status), \ .holds_gil = offsetof(PyThreadState, holds_gil), \ .gil_requested = offsetof(PyThreadState, gil_requested), \ diff --git a/Include/internal/pycore_interpframe.h b/Include/internal/pycore_interpframe.h index 28370ababc47b9..567e7402f89dd5 100644 --- a/Include/internal/pycore_interpframe.h +++ b/Include/internal/pycore_interpframe.h @@ -333,12 +333,12 @@ static inline bool _PyThreadState_HasStackSpace(PyThreadState *tstate, int size) { assert( - (tstate->datastack_top == NULL && tstate->datastack_limit == NULL) + (tstate->stack_top == NULL && tstate->stack_limit == NULL) || - (tstate->datastack_top != NULL && tstate->datastack_limit != NULL) + (tstate->stack_top != NULL && tstate->stack_limit != NULL) ); - return tstate->datastack_top != NULL && - size < tstate->datastack_limit - tstate->datastack_top; + return tstate->stack_top != NULL && + size < tstate->stack_limit - tstate->stack_top; } // Exported for external JIT support @@ -356,9 +356,9 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ CALL_STAT_INC(frames_pushed); PyFunctionObject *func_obj = (PyFunctionObject *)PyStackRef_AsPyObjectBorrow(func); PyCodeObject *code = (PyCodeObject *)func_obj->func_code; - _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += code->co_framesize; - assert(tstate->datastack_top < tstate->datastack_limit); + _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += code->co_framesize; + assert(tstate->stack_top < tstate->stack_limit); _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, previous); return new_frame; @@ -370,9 +370,9 @@ static inline _PyInterpreterFrame * _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int stackdepth, _PyInterpreterFrame * previous) { CALL_STAT_INC(frames_pushed); - _PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += code->co_framesize; - assert(tstate->datastack_top < tstate->datastack_limit); + _PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += code->co_framesize; + assert(tstate->stack_top < tstate->stack_limit); frame->previous = previous; frame->f_funcobj = PyStackRef_None; frame->f_executable = PyStackRef_FromPyObjectNew(code); @@ -408,6 +408,54 @@ PyAPI_FUNC(_PyInterpreterFrame *) _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, _PyStackRef func, PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs, _PyInterpreterFrame *previous); +static inline bool +ptr_in_chunk(const char *ptr, const _PyStackChunk *chunk) +{ + assert(chunk != NULL); + const char *start = (char *)&chunk->data[0]; + const intptr_t offset = ptr - start; + const intptr_t usable_size = (intptr_t)(chunk->size - _PY_STACK_CHUNK_OVERHEADS); + return offset >= 0 && offset < usable_size && start + offset == ptr; +} + +static inline uintptr_t +get_offset_in_chunk(const char *ptr, const _PyStackChunk *chunk) +{ + assert(chunk != NULL); + assert(chunk->data != NULL); + assert(ptr_in_chunk(ptr, chunk)); + + return ptr - (char *)chunk; +} + +static inline uintptr_t +get_offset_in_chunk_list(char *base, _PyStackChunk *stack_chunk_list) +{ + assert(stack_chunk_list != NULL); + assert(base != NULL); + _PyStackChunk *chunk = stack_chunk_list; + do { + if (ptr_in_chunk(base, chunk)) { + return get_offset_in_chunk(base, chunk); + } + chunk = chunk->previous; + } while (chunk); + assert(false); // did not find correct chunk + Py_UNREACHABLE(); +} + +static inline void * +_Py_ensure_frame_in_current_stack_chunk(PyThreadState *tstate, char *frame) +{ + assert(tstate != NULL); + assert(frame != NULL); + if (ptr_in_chunk(frame, tstate->stack_chunk_list)) { + return frame; + } + uintptr_t offset = get_offset_in_chunk_list(frame, tstate->stack_chunk_list->previous); + return ((char *)tstate->stack_chunk_list) + offset; +} + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h index 6c48ac0dccfcb1..315c7130e91ab4 100644 --- a/Include/internal/pycore_runtime_init.h +++ b/Include/internal/pycore_runtime_init.h @@ -173,6 +173,9 @@ extern PyTypeObject _PyExc_MemoryError; ._whence = _PyThreadState_WHENCE_NOTSET, \ .py_recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \ .context_ver = 1, \ + .stack_chunk_list = NULL, \ + .stack_limit = NULL, \ + .stack_top = NULL, \ } diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 60ab2055afa7b1..93954df9df1e95 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -18,12 +18,52 @@ The definition of the `_PyInterpreterFrame` struct is in Python semantics allows frames to outlive the activation, so they need to be allocated outside the C call stack. To reduce overhead and improve locality of reference, most frames are allocated contiguously in a per-thread stack -(see `_PyThreadState_PushFrame` in [Python/pystate.c](../Python/pystate.c)). +chunk (see `_PyThreadState_PushFrame` in +[Python/pystate.c](../Python/pystate.c)). Frames of generators and coroutines are embedded in the generator and coroutine objects, so are not allocated in the per-thread stack. See `_PyGenObject` in [Include/internal/pycore_interpframe_structs.h](../Include/internal/pycore_interpframe_structs.h). +## Stack allocation + +The per-thread stack is a resizable array backed by `_PyStackChunk` +allocations. Each `PyThreadState` stores: + +* `stack_chunk_list`: the newest stack chunk, with older chunks linked through + `_PyStackChunk.previous`; +* `stack_top`: the next free slot in the newest chunk; and +* `stack_limit`: the end of the newest chunk. + +The first frame allocation creates a chunk of at least +`_PY_STACK_CHUNK_MIN_SIZE=4096` bytes. If a subsequent frame does not fit, +`resize_stack()` allocates a larger chunk, twice the size of the +previous one and large enough for the requested frame. The old chunk is +retained in `stack_chunk_list->previous`, instead of being copied or +immediately freed. + +The newest chunk is aligned with the previous logical stack by setting its +`stack_top` to the same offset that was used in the old chunk. New frames are +then placed in the newest chunk. Existing frame records remain where they were, +so the `previous` links in the frame chain can cross from the newest chunk into +older retained chunks. + +![Resizable stack after a resize](images/stack-resize.png) + +When a frame is popped, `_PyThreadState_PopFrame()` ensures that +`tstate->stack_top` keeps pointing into the new stack chunk even if the popped +frame resides in an older chunk. This ensures that when new frames are pushed, +they are created in the new chunk. + +Starting from the situation depicted above, the picture below shows what happens +after resizing and a number of pops remove one frame in the older chunk, then a +new frame is created and placed in the newest chunk. + +![Stack after a resize, various pops and one push](images/stack-resize-pop-push.png) + +All retained stack chunks are freed when the thread state is deleted by +`clear_stack_chunk_list()`. + ## Layout Each activation record is laid out as: diff --git a/InternalDocs/images/stack-resize-pop-push.dot b/InternalDocs/images/stack-resize-pop-push.dot new file mode 100644 index 00000000000000..f7690b6c5866cc --- /dev/null +++ b/InternalDocs/images/stack-resize-pop-push.dot @@ -0,0 +1,146 @@ +digraph stack_resize { + graph [ + fontname="Monaco", + fontsize=16, + bgcolor="white", + compound=true, + newrank=true, + splines=false + ]; + + node [ + fontname="Monaco", + fontsize=16, + shape=box, + style="rounded,filled", + color="#2563eb", + fillcolor="white", + penwidth=2, + fixedsize=true + ]; + + edge [ + color="#111111", + fontname="Monaco", + penwidth=2, + arrowsize=0.75 + ]; + + subgraph cluster_older_chunk { + label="older chunk"; + labelloc=b; + fontcolor="#0f172a"; + style="rounded,filled"; + color="#334155"; + fillcolor="#f1f5f9"; + penwidth=2; + margin=24; + + node [ + color="#64748b", + width=2.5, + ]; + + old_frame_hidden [style="invis", label=""]; + old_frame_1 [style="invis", label=""]; + subgraph cluster_older_frames { + label=""; + penwidth=0; + old_frame_2 [label="older frame"]; + old_frame_3 [label="older frame"]; + } + old_previous [ + color="invis", + fillcolor="invis", + width=2.15, + height=0.58, + label="previous = NULL" + ]; + + old_frame_hidden:s -> old_frame_1:n [style="invis"]; + old_frame_1:s -> old_frame_2:n [style="invis"]; + old_frame_2:s -> old_frame_3:n; + old_frame_3:s -> old_previous:n [style="invis"]; + } + + subgraph cluster_newest_chunk { + label="newest chunk"; + labelloc=b; + fontcolor="#0f172a"; + style="rounded,filled"; + color="#1e3a8a"; + fillcolor="#eaf4ff"; + penwidth=2; + margin=24; + + newer_frame_hidden [width=2.5, style="invis"]; + top_frame [width=2.5, label="top frame", style="invis"]; + newer_frame_1 [width=2.5, label="newer frame", style="invis"]; + newer_frame_2 [width=2.5, label="top frame"]; + + subgraph cluster_untouched_memory { + label=""; + color="#f59e0b"; + fillcolor="#ffec99"; + style="filled,dashed"; + penwidth=2; + margin=24; + + node [ + width=2.5, + height=0.5, + ]; + + untouched_memory_top [style="invis", label=""]; + untouched_memory_bottom [style="invis", label=""]; + } + + new_previous [ + color="invis", + fillcolor="invis", + width=1.15, + height=0.58, + label="previous" + ]; + + newer_frame_hidden:s -> top_frame:n [style="invis"]; + top_frame:s -> newer_frame_1:n [style="invis"]; + newer_frame_1:s -> newer_frame_2:n [style="invis"]; + newer_frame_2:s -> untouched_memory_top:n [ + lhead=cluster_untouched_memory, + style="invis", + ]; + untouched_memory_bottom:s -> new_previous:n [ + ltail=cluster_untouched_memory, + style="invis", + ]; + } + + stack_top_label [ + shape=plaintext, + style="", + fixedsize=false, + label="stack_top", + fontcolor="#0f172a" + ]; + + { rank=same; old_frame_1; newer_frame_2; } + { rank=same; old_frame_2; untouched_memory_top; } + { rank=same; old_frame_3; untouched_memory_bottom; } + { rank=same; old_previous; new_previous; } + { rank=same; newer_frame_2; stack_top_label; } + + newer_frame_2 -> stack_top_label [style=invis, weight=100, constraint=false]; + old_frame_1 -> newer_frame_2 [style=invis, weight=100, constraint=false]; + + newer_frame_2:w -> old_frame_2:e [ + constraint=false, + ]; + stack_top_label:w -> newer_frame_2:e [ + constraint=false + ]; + new_previous:w -> old_previous:e [ + lhead=cluster_older_chunk, + constraint=false + ]; +} diff --git a/InternalDocs/images/stack-resize-pop-push.png b/InternalDocs/images/stack-resize-pop-push.png new file mode 100644 index 00000000000000..a739442f4acff6 Binary files /dev/null and b/InternalDocs/images/stack-resize-pop-push.png differ diff --git a/InternalDocs/images/stack-resize.dot b/InternalDocs/images/stack-resize.dot new file mode 100644 index 00000000000000..3f2d5a62b92f68 --- /dev/null +++ b/InternalDocs/images/stack-resize.dot @@ -0,0 +1,153 @@ +digraph stack_resize { + graph [ + fontname="Monaco", + fontsize=16, + bgcolor="white", + compound=true, + newrank=true, + splines=false + ]; + + node [ + fontname="Monaco", + fontsize=16, + shape=box, + style="rounded,filled", + color="#2563eb", + fillcolor="white", + penwidth=2, + fixedsize=true + ]; + + edge [ + color="#111111", + fontname="Monaco", + penwidth=2, + arrowsize=0.75 + ]; + + subgraph cluster_older_chunk { + label="older chunk"; + labelloc=b; + fontcolor="#0f172a"; + style="rounded,filled"; + color="#334155"; + fillcolor="#f1f5f9"; + penwidth=2; + margin=24; + + node [ + color="#64748b", + width=2.5, + ]; + + old_frame_hidden [style="invis", label=""]; + subgraph cluster_older_frames { + label=""; + penwidth=0; + old_frame_1 [label="older frame"]; + old_frame_2 [label="older frame"]; + old_frame_3 [label="older frame"]; + } + old_previous [ + color="invis", + fillcolor="invis", + width=2.15, + height=0.58, + label="previous = NULL" + ]; + + old_frame_hidden:s -> old_frame_1:n [style="invis"]; + old_frame_1:s -> old_frame_2:n; + old_frame_2:s -> old_frame_3:n; + old_frame_3:s -> old_previous:n [style="invis"]; + } + + subgraph cluster_newest_chunk { + label="newest chunk"; + labelloc=b; + fontcolor="#0f172a"; + style="rounded,filled"; + color="#1e3a8a"; + fillcolor="#eaf4ff"; + penwidth=2; + margin=24; + + top_frame [width=2.5, label="top frame"]; + newer_frame_1 [width=2.5, label="newer frame"]; + newer_frame_2 [width=2.5, label="newer frame"]; + + subgraph cluster_untouched_memory { + label=""; + color="#f59e0b"; + fillcolor="#ffec99"; + style="filled,dashed"; + penwidth=2; + margin=24; + + node [ + width=2.5, + height=0.5, + ]; + + untouched_memory_top [style="invis", label=""]; + untouched_memory [ + shape=plaintext, + style="", + fixedsize=false, + label="untouched memory\n(same size as\npreviously used)" + ]; + untouched_memory_bottom [style="invis", label=""]; + } + + new_previous [ + color="invis", + fillcolor="invis", + width=1.15, + height=0.58, + label="previous" + ]; + + top_frame:s -> newer_frame_1:n; + newer_frame_1:s -> newer_frame_2:n; + newer_frame_2:s -> untouched_memory_top:n [ + lhead=cluster_untouched_memory, + style="invis", + ]; + untouched_memory_bottom:s -> new_previous:n [ + ltail=cluster_untouched_memory, + style="invis", + ]; + } + + stack_top_label [ + shape=plaintext, + style="", + fixedsize=false, + label="stack_top", + fontcolor="#0f172a" + ]; + + { rank=same; old_frame_hidden; newer_frame_2; } + { rank=same; old_frame_1; untouched_memory_top; } + { rank=same; old_frame_2; untouched_memory; } + { rank=same; old_frame_3; untouched_memory_bottom; } + { rank=same; old_previous; new_previous; } + { rank=same; top_frame; stack_top_label; } + + top_frame -> stack_top_label [style=invis, weight=100, constraint=false]; + old_frame_1 -> newer_frame_2 [style=invis, weight=100, constraint=false]; + + newer_frame_2:w -> old_frame_1:e [ + constraint=false, + taillabel="frame chain can cross\nstack chunks", + labeldistance=15, + ]; + stack_top_label:w -> top_frame:e [ + constraint=false + ]; + new_previous:w -> old_previous:e [ + lhead=cluster_older_chunk, + constraint=false + ]; +} diff --git a/InternalDocs/images/stack-resize.png b/InternalDocs/images/stack-resize.png new file mode 100644 index 00000000000000..93820ca6b2a6b5 Binary files /dev/null and b/InternalDocs/images/stack-resize.png differ diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index a29e6cdbbf6c78..6fcf093011496b 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -3736,6 +3736,56 @@ def recurse(n): "Cache exhaustion should not affect stack completeness", ) + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_deep_stack_uses_copied_stack_chunks(self): + """Test deep stacks do not fall back to one remote read per frame.""" + depth = 1000 + script_body = f"""\ +import sys +sys.setrecursionlimit(2000) + +def recurse(n): + if n <= 0: + sock.sendall(b"ready") + sock.recv(16) + return + recurse(n - 1) + +recurse({depth}) +""" + + with self._target_process(script_body) as ( + p, + client_socket, + _, + ): + unwinder = RemoteUnwinder( + p.pid, all_threads=True, cache_frames=False, stats=True + ) + frames = self._sample_frames( + client_socket, + unwinder, + b"ready", + b"done", + {"recurse"}, + expected_frames=depth + 2, + ) + stats = unwinder.get_stats() + + self.assertIsNotNone(frames) + + recurse_count = [f.funcname for f in frames].count("recurse") + self.assertGreaterEqual(recurse_count, depth) + + # This stack spans multiple chunks. Copying only the current chunk + # forces older frames down the slow remote-read fallback path. + self.assertGreater(stats["stack_chunks_copied"], 1, stats) + self.assertLess(stats["memory_reads"], depth // 10, stats) + @skip_if_not_supported @unittest.skipIf( sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, diff --git a/Lib/test/test_isinstance.py b/Lib/test/test_isinstance.py index d97535ba46e677..54966c1178e234 100644 --- a/Lib/test/test_isinstance.py +++ b/Lib/test/test_isinstance.py @@ -307,6 +307,8 @@ def __bases__(self): self.assertEqual(True, issubclass(B(), int)) + @support.skip_emscripten_stack_overflow() + @support.skip_wasi_stack_overflow() def test_infinite_recursion_in_bases(self): class X: @property diff --git a/Lib/test/test_pyrepl/test_unix_console.py b/Lib/test/test_pyrepl/test_unix_console.py index 71b2e17e334151..8e60d7628aaebb 100644 --- a/Lib/test/test_pyrepl/test_unix_console.py +++ b/Lib/test/test_pyrepl/test_unix_console.py @@ -7,7 +7,7 @@ import unittest from functools import partial from test.support import force_color, os_helper, force_not_colorized_test_class -from test.support import threading_helper +from test.support import threading_helper, is_emscripten from unittest import TestCase from unittest.mock import MagicMock, call, patch, ANY, Mock @@ -349,6 +349,7 @@ def same_console(events): console.restore() con.restore() + @unittest.skipIf(is_emscripten, "Causes TypeError: Cannot read properties of undefined (reading '0')") def test_getheightwidth_with_invalid_environ(self, _os_write): # gh-128636 console = UnixConsole(term="xterm") @@ -384,6 +385,7 @@ def test_restore_in_thread(self, _os_write): @unittest.skipIf(sys.platform == "win32", "No Unix console on Windows") class TestUnixConsoleEIOHandling(TestCase): + @unittest.skipIf(is_emscripten, "Causes TypeError: Cannot read properties of undefined (reading '0')") @patch('_pyrepl.unix_console.tcsetattr') @patch('_pyrepl.unix_console.tcgetattr') def test_eio_error_handling_in_restore(self, mock_tcgetattr, mock_tcsetattr): diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst new file mode 100644 index 00000000000000..7be9d1cf8ae1eb --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst @@ -0,0 +1 @@ +Change the resizing implementation of the Python stack to use a resizable array instead of a chain of arrays, to avoid degenerate performance cases and allow further JIT optimizations. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 7369cd1514c296..82b1e139bcde8e 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -239,6 +239,7 @@ typedef struct { uint64_t frames_read_from_memory; // Total frames read from remote memory uint64_t memory_reads; // Total remote memory read operations uint64_t memory_bytes_read; // Total bytes read from remote memory + uint64_t stack_chunks_copied; // Total stack chunks copied uint64_t code_object_cache_hits; // Code object cache hits uint64_t code_object_cache_misses; // Code object cache misses uint64_t stale_cache_invalidations; // Times stale entries were cleared diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 1133db808efaec..067dede2c2a73d 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -408,6 +408,7 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, " - frames_read_from_memory: Total frames read from remote memory\n" " - memory_reads: Total remote memory read operations\n" " - memory_bytes_read: Total bytes read from remote memory\n" +" - stack_chunks_copied: Total stack chunks copied from remote memory\n" " - code_object_cache_hits: Code object cache hits\n" " - code_object_cache_misses: Code object cache misses\n" " - stale_cache_invalidations: Times stale cache entries were cleared\n" @@ -1564,4 +1565,4 @@ _remote_debugging_get_gc_stats(PyObject *module, PyObject *const *args, Py_ssize exit: return return_value; } -/*[clinic end generated code: output=36674f4cb8a653f3 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=b30022bf5c171170 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h index f070f03ac459dc..298a339ebfc170 100644 --- a/Modules/_remote_debugging/debug_offsets_validation.h +++ b/Modules/_remote_debugging/debug_offsets_validation.h @@ -252,7 +252,7 @@ validate_fixed_field( #define PY_REMOTE_DEBUG_THREAD_STATE_FIELDS(APPLY, buffer_size) \ APPLY(thread_state, native_thread_id, sizeof(unsigned long), _Alignof(long), buffer_size); \ APPLY(thread_state, interp, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ - APPLY(thread_state, datastack_chunk, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ + APPLY(thread_state, stack_chunk_list, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, status, FIELD_SIZE(PyThreadState, _status), _Alignof(unsigned int), buffer_size); \ APPLY(thread_state, holds_gil, sizeof(int), _Alignof(int), buffer_size); \ APPLY(thread_state, gil_requested, sizeof(int), _Alignof(int), buffer_size); \ diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index bbdfce3f7201d9..1adc90421662e4 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -27,7 +27,7 @@ process_single_stack_chunk( StackChunkInfo *chunk_info ) { // Start with default size assumption - size_t current_size = _PY_DATA_STACK_CHUNK_SIZE; + size_t current_size = _PY_STACK_CHUNK_MIN_SIZE; char *this_chunk = PyMem_RawMalloc(current_size); if (!this_chunk) { @@ -87,11 +87,17 @@ copy_stack_chunks(RemoteUnwinderObject *unwinder, size_t count = 0; size_t max_chunks = 16; - if (read_ptr(unwinder, tstate_addr + (uintptr_t)unwinder->debug_offsets.thread_state.datastack_chunk, &chunk_addr)) { + if (read_ptr(unwinder, tstate_addr + (uintptr_t)unwinder->debug_offsets.thread_state.stack_chunk_list, &chunk_addr)) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read initial stack chunk address"); return -1; } + if (!chunk_addr) { + out_chunks->chunks = NULL; + out_chunks->count = 0; + return 0; + } + chunks = PyMem_RawMalloc(max_chunks * sizeof(StackChunkInfo)); if (!chunks) { PyErr_NoMemory(); @@ -100,28 +106,49 @@ copy_stack_chunks(RemoteUnwinderObject *unwinder, } const size_t MAX_STACK_CHUNKS = 4096; - while (chunk_addr != 0 && count < MAX_STACK_CHUNKS) { - // Grow array if needed + while (chunk_addr != 0) { + if (count >= MAX_STACK_CHUNKS) { + PyErr_Format(PyExc_RuntimeError, + "Too many stack chunks (%zu) - possible corrupted remote memory", + count); + set_exception_cause(unwinder, PyExc_RuntimeError, + "Too many stack chunks"); + goto error; + } + if (count >= max_chunks) { max_chunks *= 2; - StackChunkInfo *new_chunks = PyMem_RawRealloc(chunks, max_chunks * sizeof(StackChunkInfo)); + StackChunkInfo *new_chunks = PyMem_RawRealloc( + chunks, max_chunks * sizeof(StackChunkInfo)); if (!new_chunks) { PyErr_NoMemory(); - set_exception_cause(unwinder, PyExc_MemoryError, "Failed to grow stack chunks array"); + set_exception_cause(unwinder, PyExc_MemoryError, + "Failed to grow stack chunks array"); goto error; } chunks = new_chunks; } - // Process this chunk - if (process_single_stack_chunk(unwinder, chunk_addr, &chunks[count]) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process stack chunk"); + uintptr_t prev_addr = chunk_addr; + if (process_single_stack_chunk(unwinder, chunk_addr, + &chunks[count]) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, + "Failed to process stack chunk"); goto error; } + STATS_INC(unwinder, stack_chunks_copied); - // Get next chunk address and increment count - chunk_addr = GET_MEMBER(uintptr_t, chunks[count].local_copy, offsetof(_PyStackChunk, previous)); + chunk_addr = GET_MEMBER(uintptr_t, chunks[count].local_copy, + offsetof(_PyStackChunk, previous)); count++; + + if (chunk_addr == prev_addr) { + PyErr_SetString(PyExc_RuntimeError, + "Stack chunk cycle detected (corrupted remote memory)"); + set_exception_cause(unwinder, PyExc_RuntimeError, + "Stack chunk cycle detected"); + goto error; + } } out_chunks->chunks = chunks; diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 172f8711a2a2a0..b6ca96324947c9 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -901,6 +901,7 @@ RemoteUnwinder was created with stats=True. - frames_read_from_memory: Total frames read from remote memory - memory_reads: Total remote memory read operations - memory_bytes_read: Total bytes read from remote memory + - stack_chunks_copied: Total stack chunks copied from remote memory - code_object_cache_hits: Code object cache hits - code_object_cache_misses: Code object cache misses - stale_cache_invalidations: Times stale cache entries were cleared @@ -913,7 +914,7 @@ RemoteUnwinder was created with stats=True. static PyObject * _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) -/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +/*[clinic end generated code: output=21e36477122be2a0 input=0e3897ce0f5b9f81]*/ { if (!self->collect_stats) { PyErr_SetString(PyExc_RuntimeError, @@ -945,6 +946,7 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) ADD_STAT(frames_read_from_memory); ADD_STAT(memory_reads); ADD_STAT(memory_bytes_read); + ADD_STAT(stack_chunks_copied); ADD_STAT(code_object_cache_hits); ADD_STAT(code_object_cache_misses); ADD_STAT(stale_cache_invalidations); diff --git a/Python/ceval.c b/Python/ceval.c index 060e948e6b01c9..371685108e5257 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1957,8 +1957,13 @@ clear_thread_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) assert(frame->owner == FRAME_OWNED_BY_THREAD); // Make sure that this is, indeed, the top frame. We can't check this in // _PyThreadState_PopFrame, since f_code is already cleared at that point: - assert((PyObject **)frame + _PyFrame_GetCode(frame)->co_framesize == - tstate->datastack_top); + assert( + _Py_ensure_frame_in_current_stack_chunk( // the frame might be in a previous stack chunk + tstate, + (char *)((PyObject **)frame + _PyFrame_GetCode(frame)->co_framesize) + ) + == tstate->stack_top + ); assert(frame->frame_obj == NULL || frame->frame_obj->f_frame == frame); _PyFrame_ClearExceptCode(frame); PyStackRef_CLEAR(frame->f_executable); diff --git a/Python/pystate.c b/Python/pystate.c index bf2616a49148a7..573b617e1180dd 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1433,17 +1433,16 @@ tstate_is_alive(PyThreadState *tstate) // lifecycle //---------- -static _PyStackChunk* -allocate_chunk(int size_in_bytes, _PyStackChunk* previous) +static _PyStackChunk * +allocate_stack_chunk(size_t size_in_bytes) { assert(size_in_bytes % sizeof(PyObject **) == 0); _PyStackChunk *res = _PyObject_VirtualAlloc(size_in_bytes); if (res == NULL) { return NULL; } - res->previous = previous; + res->previous = NULL; res->size = size_in_bytes; - res->top = 0; return res; } @@ -1572,10 +1571,9 @@ init_threadstate(_PyThreadStateImpl *_tstate, tstate->current_frame = &_tstate->base_frame; // base_frame pointer for profilers to validate stack unwinding tstate->base_frame = &_tstate->base_frame; - tstate->datastack_chunk = NULL; - tstate->datastack_top = NULL; - tstate->datastack_limit = NULL; - tstate->datastack_cached_chunk = NULL; + tstate->stack_chunk_list = NULL; + tstate->stack_top = NULL; + tstate->stack_limit = NULL; tstate->what_event = -1; tstate->current_executor = NULL; tstate->jit_exit = NULL; @@ -1717,20 +1715,17 @@ _PyThreadState_Init(PyThreadState *tstate) static void -clear_datastack(PyThreadState *tstate) +clear_stack_chunk_list(PyThreadState *tstate) { - _PyStackChunk *chunk = tstate->datastack_chunk; - tstate->datastack_chunk = NULL; - while (chunk != NULL) { - _PyStackChunk *prev = chunk->previous; - _PyObject_VirtualFree(chunk, chunk->size); - chunk = prev; - } - if (tstate->datastack_cached_chunk != NULL) { - _PyObject_VirtualFree(tstate->datastack_cached_chunk, - tstate->datastack_cached_chunk->size); - tstate->datastack_cached_chunk = NULL; + assert(tstate != NULL); + _PyStackChunk *chunk_list = tstate->stack_chunk_list; + while (chunk_list != NULL) { + _PyStackChunk *prev = chunk_list->previous; + size_t size = chunk_list->size; + _PyObject_VirtualFree(chunk_list, size); + chunk_list = prev; } + tstate->stack_chunk_list = NULL; } void @@ -1932,7 +1927,7 @@ tstate_delete_common(PyThreadState *tstate, int release_gil) } // XXX Move to PyThreadState_Clear()? - clear_datastack(tstate); + clear_stack_chunk_list(tstate); if (release_gil) { _PyEval_ReleaseLock(tstate->interp, tstate, 1); @@ -3082,42 +3077,44 @@ _PyInterpreterState_HasFeature(PyInterpreterState *interp, unsigned long feature return ((interp->feature_flags & feature) != 0); } - -#define MINIMUM_OVERHEAD 1000 +#define MINIMUM_OVERHEAD 0 static PyObject ** -push_chunk(PyThreadState *tstate, int size) -{ - int allocate_size = _PY_DATA_STACK_CHUNK_SIZE; - while (allocate_size < (int)sizeof(PyObject*)*(size + MINIMUM_OVERHEAD)) { +resize_stack(PyThreadState *tstate, int size) +{ + size_t current_size, allocate_size; + _PyStackChunk *old = tstate->stack_chunk_list; + if (old) { + current_size = tstate->stack_chunk_list->size; + assert(current_size > 0); + allocate_size = current_size * 2; + } else { + current_size = 0; + allocate_size = _PY_STACK_CHUNK_MIN_SIZE; + } + assert(allocate_size > current_size); + assert(allocate_size - current_size > _PY_STACK_CHUNK_OVERHEADS); + size_t required_space = sizeof(PyObject *) * (size + MINIMUM_OVERHEAD) + _PY_STACK_CHUNK_OVERHEADS; + while (allocate_size < required_space) { allocate_size *= 2; } - _PyStackChunk *new; - if (tstate->datastack_cached_chunk != NULL - && (size_t)allocate_size <= tstate->datastack_cached_chunk->size) - { - new = tstate->datastack_cached_chunk; - tstate->datastack_cached_chunk = NULL; - new->previous = tstate->datastack_chunk; - new->top = 0; - } - else { - new = allocate_chunk(allocate_size, tstate->datastack_chunk); - if (new == NULL) { - return NULL; - } + assert(allocate_size > 0); + _PyStackChunk *new = allocate_stack_chunk(allocate_size); + if (new == NULL) { + return NULL; } - if (tstate->datastack_chunk) { - tstate->datastack_chunk->top = tstate->datastack_top - - &tstate->datastack_chunk->data[0]; - } - tstate->datastack_chunk = new; - tstate->datastack_limit = (PyObject **)(((char *)new) + allocate_size); - // When new is the "root" chunk (i.e. new->previous == NULL), we can keep - // _PyThreadState_PopFrame from freeing it later by "skipping" over the - // first element: - PyObject **res = &new->data[new->previous == NULL]; - tstate->datastack_top = res + size; + if (old) { + new->previous = old; + long current_stack_size = tstate->stack_top - &old->data[0]; + assert(current_stack_size > 0); + tstate->stack_top = &new->data[current_stack_size]; + } else { + tstate->stack_top = &new->data[0]; + } + tstate->stack_chunk_list = new; + tstate->stack_limit = (PyObject **)(((char *)new) + allocate_size); + PyObject **res = tstate->stack_top; + tstate->stack_top = res + size; return res; } @@ -3126,38 +3123,22 @@ _PyThreadState_PushFrame(PyThreadState *tstate, size_t size) { assert(size < INT_MAX/sizeof(PyObject *)); if (_PyThreadState_HasStackSpace(tstate, (int)size)) { - _PyInterpreterFrame *res = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += size; + _PyInterpreterFrame *res = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += size; return res; } - return (_PyInterpreterFrame *)push_chunk(tstate, (int)size); + return (_PyInterpreterFrame *) resize_stack(tstate, (int) size); } void -_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) -{ - assert(tstate->datastack_chunk); - PyObject **base = (PyObject **)frame; - if (base == &tstate->datastack_chunk->data[0]) { - _PyStackChunk *chunk = tstate->datastack_chunk; - _PyStackChunk *previous = chunk->previous; - _PyStackChunk *cached = tstate->datastack_cached_chunk; - // push_chunk ensures that the root chunk is never popped: - assert(previous); - tstate->datastack_top = &previous->data[previous->top]; - tstate->datastack_chunk = previous; - tstate->datastack_limit = (PyObject **)(((char *)previous) + previous->size); - chunk->previous = NULL; - if (cached != NULL) { - _PyObject_VirtualFree(cached, cached->size); - } - tstate->datastack_cached_chunk = chunk; - } - else { - assert(tstate->datastack_top); - assert(tstate->datastack_top >= base); - tstate->datastack_top = base; - } +_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame *frame) +{ + assert(tstate->stack_chunk_list); + PyObject **base = (PyObject **)_Py_ensure_frame_in_current_stack_chunk(tstate, (char *)frame); + assert(ptr_in_chunk((char *)base, tstate->stack_chunk_list)); + assert(tstate->stack_top); + assert(tstate->stack_top >= base); + tstate->stack_top = base; }