Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions Include/cpython/pystate.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ typedef struct _err_stackitem {
} _PyErr_StackItem;

typedef struct _stack_chunk {
struct _stack_chunk *previous;
size_t size;
size_t top;
PyObject * data[1]; /* Variable sized */
struct _stack_chunk *previous;
PyObject *data[1]; /* Variable sized */
} _PyStackChunk;

/* Minimum size of data stack chunk */
#define _PY_DATA_STACK_CHUNK_SIZE (16*1024)
#define _PY_STACK_CHUNK_MIN_SIZE 4096
#define _PY_STACK_CHUNK_OVERHEADS (offsetof(_PyStackChunk, data))

struct _ts {
/* See Python/ceval.c for comments explaining most fields */

Expand Down Expand Up @@ -195,10 +195,9 @@ struct _ts {
/* Unique thread state id. */
uint64_t id;

_PyStackChunk *datastack_chunk;
PyObject **datastack_top;
PyObject **datastack_limit;
_PyStackChunk *datastack_cached_chunk;
_PyStackChunk *stack_chunk_list;
PyObject **stack_top;
PyObject **stack_limit;
/* XXX signal handlers should also be here */

/* The following fields are here to avoid allocation during init.
Expand Down
4 changes: 2 additions & 2 deletions Include/internal/pycore_debug_offsets.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ typedef struct _Py_DebugOffsets {
uint64_t last_profiled_frame;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
uint64_t stack_chunk_list;
uint64_t status;
uint64_t holds_gil;
uint64_t gil_requested;
Expand Down Expand Up @@ -296,7 +296,7 @@ typedef struct _Py_DebugOffsets {
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
.stack_chunk_list = offsetof(PyThreadState, stack_chunk_list), \
.status = offsetof(PyThreadState, _status), \
.holds_gil = offsetof(PyThreadState, holds_gil), \
.gil_requested = offsetof(PyThreadState, gil_requested), \
Expand Down
68 changes: 58 additions & 10 deletions Include/internal/pycore_interpframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,12 @@ static inline bool
_PyThreadState_HasStackSpace(PyThreadState *tstate, int size)
{
assert(
(tstate->datastack_top == NULL && tstate->datastack_limit == NULL)
(tstate->stack_top == NULL && tstate->stack_limit == NULL)
||
(tstate->datastack_top != NULL && tstate->datastack_limit != NULL)
(tstate->stack_top != NULL && tstate->stack_limit != NULL)
);
return tstate->datastack_top != NULL &&
size < tstate->datastack_limit - tstate->datastack_top;
return tstate->stack_top != NULL &&
size < tstate->stack_limit - tstate->stack_top;
}

// Exported for external JIT support
Expand All @@ -356,9 +356,9 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_
CALL_STAT_INC(frames_pushed);
PyFunctionObject *func_obj = (PyFunctionObject *)PyStackRef_AsPyObjectBorrow(func);
PyCodeObject *code = (PyCodeObject *)func_obj->func_code;
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top;
tstate->datastack_top += code->co_framesize;
assert(tstate->datastack_top < tstate->datastack_limit);
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->stack_top;
tstate->stack_top += code->co_framesize;
assert(tstate->stack_top < tstate->stack_limit);
_PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from,
previous);
return new_frame;
Expand All @@ -370,9 +370,9 @@ static inline _PyInterpreterFrame *
_PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int stackdepth, _PyInterpreterFrame * previous)
{
CALL_STAT_INC(frames_pushed);
_PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->datastack_top;
tstate->datastack_top += code->co_framesize;
assert(tstate->datastack_top < tstate->datastack_limit);
_PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->stack_top;
tstate->stack_top += code->co_framesize;
assert(tstate->stack_top < tstate->stack_limit);
frame->previous = previous;
frame->f_funcobj = PyStackRef_None;
frame->f_executable = PyStackRef_FromPyObjectNew(code);
Expand Down Expand Up @@ -408,6 +408,54 @@ PyAPI_FUNC(_PyInterpreterFrame *)
_PyEvalFramePushAndInit_Ex(PyThreadState *tstate, _PyStackRef func,
PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs, _PyInterpreterFrame *previous);

static inline bool
ptr_in_chunk(const char *ptr, const _PyStackChunk *chunk)
{
assert(chunk != NULL);
const char *start = (char *)&chunk->data[0];
const intptr_t offset = ptr - start;
const intptr_t usable_size = (intptr_t)(chunk->size - _PY_STACK_CHUNK_OVERHEADS);
return offset >= 0 && offset < usable_size && start + offset == ptr;
Comment on lines +416 to +418
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const intptr_t offset = ptr - start;
const intptr_t usable_size = (intptr_t)(chunk->size - _PY_STACK_CHUNK_OVERHEADS);
return offset >= 0 && offset < usable_size && start + offset == ptr;
const uintptr_t usable_size = (uintptr_t)(chunk->size - _PY_STACK_CHUNK_OVERHEADS);
return ptr >= start && (uintptr_t)(ptr - start) < usable_size;

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I learned while working on this PR, relational comparisons (`<`, `<=`, `>`, `>=`) between pointers that do not point into the same object are undefined behaviour according to the C standard. See Section 6.5.8, paragraph 5 (https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf).

So ptr >= start is undefined, but computing where the pointer would have been in the new allocation and checking for equality is ok.

}

/* Return the byte offset of `ptr` measured from the start of `chunk`.
 *
 * The offset is taken from the address of the chunk header itself, not from
 * `chunk->data`, so it includes the header bytes that precede the data area.
 *
 * Preconditions (checked only in assert-enabled builds):
 *   - `chunk` is not NULL;
 *   - `ptr` lies inside the chunk's data area, as decided by ptr_in_chunk().
 */
static inline uintptr_t
get_offset_in_chunk(const char *ptr, const _PyStackChunk *chunk)
{
    assert(chunk != NULL);
    // `chunk->data` is an array member, so its address can never be NULL;
    // asserting on it would be a tautology (and triggers compiler warnings).
    assert(ptr_in_chunk(ptr, chunk));

    // Keep the const qualifier on the chunk pointer and make the
    // ptrdiff_t -> uintptr_t conversion explicit.
    return (uintptr_t)(ptr - (const char *)chunk);
}

/* Find the chunk that contains `base` and return the offset of `base`
 * within that chunk.
 *
 * The search starts at `stack_chunk_list` and follows the `previous` links
 * until a chunk accepted by ptr_in_chunk() is found; the result is then
 * computed by get_offset_in_chunk() for that chunk.
 *
 * Both arguments must be non-NULL, and `base` is expected to be inside one
 * of the chunks on the list: running off the end of the list is treated as
 * unreachable.
 */
static inline uintptr_t
get_offset_in_chunk_list(char *base, _PyStackChunk *stack_chunk_list)
{
    assert(stack_chunk_list != NULL);
    assert(base != NULL);

    _PyStackChunk *candidate = stack_chunk_list;
    for (;;) {
        if (ptr_in_chunk(base, candidate)) {
            return get_offset_in_chunk(base, candidate);
        }
        candidate = candidate->previous;
        if (candidate == NULL) {
            break;
        }
    }

    assert(false); // did not find correct chunk
    Py_UNREACHABLE();
}

/* Map `frame` to an address inside the chunk at the head of
 * `tstate->stack_chunk_list`.
 *
 * If `frame` already lies in that head chunk it is returned unchanged.
 * Otherwise its offset is looked up in the chunks reachable through the head
 * chunk's `previous` link, and the address at the same offset from the start
 * of the head chunk is returned.
 *
 * `tstate` and `frame` must be non-NULL (asserted).
 */
static inline void *
_Py_ensure_frame_in_current_stack_chunk(PyThreadState *tstate, char *frame)
{
    assert(tstate != NULL);
    assert(frame != NULL);

    _PyStackChunk *newest = tstate->stack_chunk_list;
    if (!ptr_in_chunk(frame, newest)) {
        // The frame lives in an older chunk: reuse its offset in the head chunk.
        uintptr_t offset = get_offset_in_chunk_list(frame, newest->previous);
        frame = (char *)newest + offset;
    }
    return frame;
}

#ifdef __cplusplus
}
#endif
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ extern PyTypeObject _PyExc_MemoryError;
._whence = _PyThreadState_WHENCE_NOTSET, \
.py_recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \
.context_ver = 1, \
.stack_chunk_list = NULL, \
.stack_limit = NULL, \
.stack_top = NULL, \
}


Expand Down
42 changes: 41 additions & 1 deletion InternalDocs/frames.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,52 @@ The definition of the `_PyInterpreterFrame` struct is in
Python semantics allows frames to outlive the activation, so they need to
be allocated outside the C call stack. To reduce overhead and improve locality
of reference, most frames are allocated contiguously in a per-thread stack
(see `_PyThreadState_PushFrame` in [Python/pystate.c](../Python/pystate.c)).
chunk (see `_PyThreadState_PushFrame` in
[Python/pystate.c](../Python/pystate.c)).

Frames of generators and coroutines are embedded in the generator and coroutine
objects, so are not allocated in the per-thread stack. See `_PyGenObject` in
[Include/internal/pycore_interpframe_structs.h](../Include/internal/pycore_interpframe_structs.h).

## Stack allocation

The per-thread stack is a resizable array backed by `_PyStackChunk`
allocations. Each `PyThreadState` stores:

* `stack_chunk_list`: the newest stack chunk, with older chunks linked through
`_PyStackChunk.previous`;
* `stack_top`: the next free slot in the newest chunk; and
* `stack_limit`: the end of the newest chunk.

The first frame allocation creates a chunk of at least
`_PY_STACK_CHUNK_MIN_SIZE=4096` bytes. If a subsequent frame does not fit,
`resize_stack()` allocates a larger chunk, twice the size of the
previous one and large enough for the requested frame. The old chunk is
retained in `stack_chunk_list->previous`, instead of being copied or
immediately freed.

The newest chunk is aligned with the previous logical stack by setting
`tstate->stack_top` to the same offset within the newest chunk that it had in
the old chunk. New frames are then placed in the newest chunk. Existing frame
records remain where they were, so the `previous` links in the frame chain can
cross from the newest chunk into older retained chunks.

![Resizable stack after a resize](images/stack-resize.png)

When a frame is popped, `_PyThreadState_PopFrame()` ensures that
`tstate->stack_top` keeps pointing into the newest stack chunk even if the
popped frame resides in an older chunk. This ensures that when new frames are
pushed, they are created in the newest chunk.

Starting from the situation depicted above, the picture below shows the stack
after a number of pops have removed one frame from the older chunk and a new
frame has then been pushed, which places it in the newest chunk.

![Stack after a resize, various pops and one push](images/stack-resize-pop-push.png)

When the thread state is deleted, `clear_stack_chunk_list()` frees all retained
stack chunks.

## Layout

Each activation record is laid out as:
Expand Down
146 changes: 146 additions & 0 deletions InternalDocs/images/stack-resize-pop-push.dot
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Graphviz (DOT) diagram of two stack chunks side by side: an "older chunk"
// holding two visible "older frame" nodes, and a "newest chunk" holding a
// visible "top frame" node above a dashed, unlabeled region. A "stack_top"
// label points at the top frame. Invisible nodes and edges are used purely
// to control the layout.
digraph stack_resize {
// compound=true is required for the lhead/ltail cluster clipping used below.
graph [
fontname="Monaco",
fontsize=16,
bgcolor="white",
compound=true,
newrank=true,
splines=false
];

// Defaults applied to every node unless overridden locally.
node [
fontname="Monaco",
fontsize=16,
shape=box,
style="rounded,filled",
color="#2563eb",
fillcolor="white",
penwidth=2,
fixedsize=true
];

// Defaults applied to every edge.
edge [
color="#111111",
fontname="Monaco",
penwidth=2,
arrowsize=0.75
];

// Left-hand cluster: the older chunk.
subgraph cluster_older_chunk {
label="older chunk";
labelloc=b;
fontcolor="#0f172a";
style="rounded,filled";
color="#334155";
fillcolor="#f1f5f9";
penwidth=2;
margin=24;

node [
color="#64748b",
width=2.5,
];

// Invisible placeholders that reserve rows above the visible frames.
old_frame_hidden [style="invis", label=""];
old_frame_1 [style="invis", label=""];
// Borderless inner cluster grouping the two visible frames.
subgraph cluster_older_frames {
label="";
penwidth=0;
old_frame_2 [label="older frame"];
old_frame_3 [label="older frame"];
}
// Text-only node (invisible border and fill) for the chunk's `previous`.
old_previous [
color="invis",
fillcolor="invis",
width=2.15,
height=0.58,
label="previous = NULL"
];

// Vertical ordering; only old_frame_2 -> old_frame_3 is drawn.
old_frame_hidden:s -> old_frame_1:n [style="invis"];
old_frame_1:s -> old_frame_2:n [style="invis"];
old_frame_2:s -> old_frame_3:n;
old_frame_3:s -> old_previous:n [style="invis"];
}

// Right-hand cluster: the newest chunk.
subgraph cluster_newest_chunk {
label="newest chunk";
labelloc=b;
fontcolor="#0f172a";
style="rounded,filled";
color="#1e3a8a";
fillcolor="#eaf4ff";
penwidth=2;
margin=24;

// Only newer_frame_2 is visible; the other three nodes are invisible spacers.
newer_frame_hidden [width=2.5, style="invis"];
top_frame [width=2.5, label="top frame", style="invis"];
newer_frame_1 [width=2.5, label="newer frame", style="invis"];
newer_frame_2 [width=2.5, label="top frame"];

// Dashed, filled region below the top frame; its size comes from the two
// invisible nodes it contains.
subgraph cluster_untouched_memory {
label="";
color="#f59e0b";
fillcolor="#ffec99";
style="filled,dashed";
penwidth=2;
margin=24;

node [
width=2.5,
height=0.5,
];

untouched_memory_top [style="invis", label=""];
untouched_memory_bottom [style="invis", label=""];
}

// Text-only node for the newest chunk's `previous` link.
new_previous [
color="invis",
fillcolor="invis",
width=1.15,
height=0.58,
label="previous"
];

// Invisible edges that fix the top-to-bottom order inside the cluster.
newer_frame_hidden:s -> top_frame:n [style="invis"];
top_frame:s -> newer_frame_1:n [style="invis"];
newer_frame_1:s -> newer_frame_2:n [style="invis"];
newer_frame_2:s -> untouched_memory_top:n [
lhead=cluster_untouched_memory,
style="invis",
];
untouched_memory_bottom:s -> new_previous:n [
ltail=cluster_untouched_memory,
style="invis",
];
}

// Free-standing text label placed outside both clusters.
stack_top_label [
shape=plaintext,
style="",
fixedsize=false,
label="stack_top",
fontcolor="#0f172a"
];

// Put corresponding nodes of both clusters on the same rank so the two
// chunks line up row by row.
{ rank=same; old_frame_1; newer_frame_2; }
{ rank=same; old_frame_2; untouched_memory_top; }
{ rank=same; old_frame_3; untouched_memory_bottom; }
{ rank=same; old_previous; new_previous; }
{ rank=same; newer_frame_2; stack_top_label; }

// Invisible, heavily weighted edges that influence placement within a row.
newer_frame_2 -> stack_top_label [style=invis, weight=100, constraint=false];
old_frame_1 -> newer_frame_2 [style=invis, weight=100, constraint=false];

// Visible arrows: top frame -> first older frame, stack_top label -> top
// frame, and the newest chunk's `previous` -> the older chunk (clipped at
// the cluster boundary via lhead).
newer_frame_2:w -> old_frame_2:e [
constraint=false,
];
stack_top_label:w -> newer_frame_2:e [
constraint=false
];
new_previous:w -> old_previous:e [
lhead=cluster_older_chunk,
constraint=false
];
}
Binary file added InternalDocs/images/stack-resize-pop-push.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading