diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst
new file mode 100644
index 00000000000000..26737330716181
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-22-14-22-34.gh-issue-129201.wiZzEb.rst
@@ -0,0 +1,5 @@
+The free-threaded version of the cyclic garbage collector has been optimized to
+conditionally use CPU prefetch instructions during the collection. This can
+reduce collection times by making it more likely that data is in the CPU cache
+when it is needed. The prefetch instructions are enabled if the number of
+long-lived objects (objects surviving a full collection) exceeds a threshold.
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 5d264e407e1cc8..10c76a67979884 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -21,6 +21,9 @@
 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
 
+// if true, enable the use of "prefetch" CPU instructions
+#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
+
 // include additional roots in "mark alive" pass
 #define GC_MARK_ALIVE_EXTRA_ROOTS 1
 
@@ -472,13 +475,193 @@ gc_maybe_untrack(PyObject *op)
 }
 
 #ifdef GC_ENABLE_MARK_ALIVE
+
+// prefetch buffer and stack //////////////////////////////////
+
+// The buffer is a circular FIFO queue of PyObject pointers.  We take
+// care to not dereference these pointers until they are taken out of
+// the buffer.  A prefetch CPU instruction is issued when a pointer is
+// put into the buffer.  If all is working as expected, there will be
+// enough time between the enqueue and dequeue so that the needed memory
+// for the object, most importantly the ob_gc_bits and ob_type words, will
+// already be in the CPU cache.
+#define BUFFER_SIZE 256
+#define BUFFER_HI 16
+#define BUFFER_LO 8
+#define BUFFER_MASK (BUFFER_SIZE - 1)
+
+// the buffer size must be an exact power of two
+static_assert(BUFFER_SIZE > 0 && !(BUFFER_SIZE & BUFFER_MASK),
+              "Invalid BUFFER_SIZE, must be power of 2");
+// the code below assumes these relationships are true
+static_assert(BUFFER_HI < BUFFER_SIZE &&
+              BUFFER_LO < BUFFER_HI &&
+              BUFFER_LO > 0,
+              "Invalid prefetch buffer level settings.");
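The power-of-two requirement is what makes the queue bookkeeping cheap: the in and out counters run freely and are reduced with BUFFER_MASK only when indexing, so the difference in - out is always the number of queued entries, even after the unsigned counters wrap around. The following standalone sketch (illustrative only, not part of the patch; all names are invented) exercises exactly that arithmetic on a tiny ring:

#include <assert.h>
#include <limits.h>

#define RING_SIZE 8                    /* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

typedef struct {
    unsigned int in, out;              /* free-running counters */
    int slots[RING_SIZE];
} ring_t;

static unsigned int ring_len(const ring_t *r) { return r->in - r->out; }

static void ring_push(ring_t *r, int v)
{
    assert(ring_len(r) < RING_SIZE);   /* not full */
    r->slots[r->in & RING_MASK] = v;   /* mask only when indexing */
    r->in++;
}

static int ring_pop(ring_t *r)
{
    assert(ring_len(r) > 0);           /* not empty */
    int v = r->slots[r->out & RING_MASK];
    r->out++;
    return v;
}

int main(void)
{
    /* Start near UINT_MAX so both counters wrap during the test; the
       length arithmetic is unaffected because unsigned subtraction is
       performed modulo 2^N. */
    ring_t r = { .in = UINT_MAX - 3, .out = UINT_MAX - 3 };
    for (int i = 0; i < 6; i++) {
        ring_push(&r, i);
    }
    assert(ring_len(&r) == 6);
    for (int i = 0; i < 6; i++) {
        assert(ring_pop(&r) == i);     /* FIFO order is preserved */
    }
    assert(ring_len(&r) == 0);
    return 0;
}

With a non-power-of-two size the masking would no longer be equivalent to a modulo of the capacity, which is what the first static_assert above guards against.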
+// Prefetch instructions will fetch the line of data from memory that
+// contains the byte specified with the source operand to a location in
+// the cache hierarchy specified by a locality hint.  The instruction
+// is only a hint and the CPU is free to ignore it.  Instructions and
+// behaviour are CPU specific but the definitions of the locality hints
+// below are mostly consistent.
+//
+//  * T0 (temporal data) prefetch data into all levels of the cache hierarchy.
+//
+//  * T1 (temporal data with respect to first level cache) prefetch data into
+//    level 2 cache and higher.
+//
+//  * T2 (temporal data with respect to second level cache) prefetch data into
+//    level 3 cache and higher, or an implementation-specific choice.
+//
+//  * NTA (non-temporal data with respect to all cache levels) prefetch data
+//    into a non-temporal cache structure and into a location close to the
+//    processor, minimizing cache pollution.
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3)
+    #define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2)
+    #define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1)
+    #define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0)
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) && !defined(_M_ARM64EC)
+    #include <immintrin.h>
+    #define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+    #define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+    #define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2)
+    #define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA)
+#elif defined(__aarch64__)
+    #define PREFETCH_T0(ptr) \
+        do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
+    #define PREFETCH_T1(ptr) \
+        do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
+    #define PREFETCH_T2(ptr) \
+        do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0)
+    #define PREFETCH_NTA(ptr) \
+        do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0)
+#else
+    #define PREFETCH_T0(ptr) do { (void)(ptr); } while (0)  /* disabled */
+    #define PREFETCH_T1(ptr) do { (void)(ptr); } while (0)  /* disabled */
+    #define PREFETCH_T2(ptr) do { (void)(ptr); } while (0)  /* disabled */
+    #define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#endif
+
+#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
+    #define prefetch(ptr) PREFETCH_T1(ptr)
+#else
+    #define prefetch(ptr)
+#endif
+
+// a contiguous sequence of PyObject pointers, can contain NULLs
+typedef struct {
+    PyObject **start;
+    PyObject **end;
+} gc_span_t;
+
+typedef struct {
+    Py_ssize_t size;
+    Py_ssize_t capacity;
+    gc_span_t *stack;
+} gc_span_stack_t;
+
+typedef struct {
+    unsigned int in;
+    unsigned int out;
+    _PyObjectStack stack;
+    gc_span_stack_t spans;
+    PyObject *buffer[BUFFER_SIZE];
+    bool use_prefetch;
+} gc_mark_args_t;
+
+// Returns number of entries in buffer
+static inline unsigned int
+gc_mark_buffer_len(gc_mark_args_t *args)
+{
+    return args->in - args->out;
+}
+
+// Returns number of free entry slots in buffer
+static inline unsigned int
+gc_mark_buffer_avail(gc_mark_args_t *args)
+{
+    return BUFFER_SIZE - gc_mark_buffer_len(args);
+}
+
+static inline bool
+gc_mark_buffer_is_empty(gc_mark_args_t *args)
+{
+    return args->in == args->out;
+}
+
+static inline bool
+gc_mark_buffer_is_full(gc_mark_args_t *args)
+{
+    return gc_mark_buffer_len(args) == BUFFER_SIZE;
+}
+
+static inline PyObject *
+gc_mark_buffer_pop(gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_empty(args));
+    PyObject *op = args->buffer[args->out & BUFFER_MASK];
+    args->out++;
+    return op;
+}
+
+// Called when there is space in the buffer for the object.  Issue the
+// prefetch instruction and add it to the end of the buffer.
+static inline void
+gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_full(args));
+    prefetch(op);
+    args->buffer[args->in & BUFFER_MASK] = op;
+    args->in++;
+}
+
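The push/pop pair above is a small software pipeline: the prefetch is issued at enqueue time and the object is only dereferenced at dequeue time, so a cache line ideally arrives before it is needed. A condensed, standalone illustration of the same idea (not part of the patch; the node type and the fixed distance are invented for the example, and PREFETCH_READ mirrors the T1 hint used by prefetch() above):

#include <stddef.h>

#if defined(__GNUC__) || defined(__clang__)
#  define PREFETCH_READ(p) __builtin_prefetch((p), 0, 2)   /* same hint as PREFETCH_T1 */
#else
#  define PREFETCH_READ(p) ((void)(p))
#endif

typedef struct node { long value; } node_t;

/* Walk an array of pointers to heap-allocated nodes.  Prefetching node
 * i+DIST while processing node i gives the memory system time to bring
 * the line in before it is dereferenced -- the same enqueue/dequeue
 * distance the GC mark buffer maintains. */
long
sum_nodes(node_t *const *nodes, size_t n)
{
    const size_t DIST = 8;     /* lookahead, comparable to BUFFER_LO */
    long total = 0;
    for (size_t i = 0; i < n; i++) {
        if (i + DIST < n) {
            PREFETCH_READ(nodes[i + DIST]);
        }
        total += nodes[i]->value;   /* by now the line is (hopefully) cached */
    }
    return total;
}

The GC cannot use a fixed lookahead like this because the set of objects to visit is discovered as it goes; the FIFO buffer plays the role of the i + DIST lookahead.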
+// Called when we run out of space in the buffer or if the prefetching
+// is disabled.  The object will be pushed on the gc_mark_args.stack.
+static int
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
+{
+    if (_PyObjectStack_Push(ms, op) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
 static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
+{
+    if (start == end) {
+        return 0;
+    }
+    if (ss->size >= ss->capacity) {
+        if (ss->capacity == 0) {
+            ss->capacity = 256;
+        }
+        else {
+            ss->capacity *= 2;
+        }
+        gc_span_t *new_stack = (gc_span_t *)PyMem_Realloc(ss->stack,
+                                                          ss->capacity * sizeof(gc_span_t));
+        if (new_stack == NULL) {
+            // keep ss->stack intact so the existing array can still be
+            // freed by gc_abort_mark_alive()
+            return -1;
+        }
+        ss->stack = new_stack;
+    }
+    assert(end > start);
+    ss->stack[ss->size].start = start;
+    ss->stack[ss->size].end = end;
+    ss->size++;
+    return 0;
+}
+
+static int
+gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
 {
     if (op == NULL) {
         return 0;
     }
-    if (!_PyObject_GC_IS_TRACKED(op)) {
+    if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
         return 0;
     }
     if (gc_is_alive(op)) {
@@ -491,12 +674,68 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
     // Need to call tp_traverse on this object.  Add to stack and mark it
     // alive so we don't traverse it a second time.
     gc_set_alive(op);
-    if (_PyObjectStack_Push(stack, op) < 0) {
+    if (_PyObjectStack_Push(&args->stack, op) < 0) {
         return -1;
     }
     return 0;
 }
 
+static int
+gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
+{
+    assert(op != NULL);
+    if (!gc_mark_buffer_is_full(args)) {
+        gc_mark_buffer_push(op, args);
+        return 0;
+    }
+    else {
+        return gc_mark_stack_push(&args->stack, op);
+    }
+}
+
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_mark_enqueue_buffer(op, args);
+    }
+    else {
+        return gc_mark_enqueue_no_buffer(op, args);
+    }
+}
+
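Both enqueue variants deliberately have the shape of a visitproc, int (*)(PyObject *, void *), because tp_traverse hands every referent to such a callback; that is also why the propagation loop later passes (visitproc)&gc_mark_enqueue_buffer straight to tp_traverse. A minimal sketch of the other side of that contract, i.e. how a container's tp_traverse drives whatever visitor it is given (the pair type here is hypothetical, purely for illustration):

#include "Python.h"

// A hypothetical container object holding two references.
typedef struct {
    PyObject_HEAD
    PyObject *first;
    PyObject *second;
} pair_object;

// tp_traverse calls the supplied visitproc once for each referent.  During
// the "mark alive" pass that visitor is gc_mark_enqueue_buffer (or the
// no-buffer variant), so every referent ends up in the prefetch buffer or
// on the mark stack without the GC knowing anything about this type.
static int
pair_traverse(PyObject *self, visitproc visit, void *arg)
{
    pair_object *p = (pair_object *)self;
    Py_VISIT(p->first);    // calls visit(p->first, arg) if non-NULL and
    Py_VISIT(p->second);   // propagates any non-zero return value
    return 0;
}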
+// Called when we have a contiguous sequence of PyObject pointers, either
+// a tuple or list object.  This will add the items to the buffer if there
+// is space for them all, otherwise push a new "span" on the span stack.
+// Using spans has the advantage of not creating a deep _PyObjectStack stack
+// when dealing with long sequences.  Those sequences will be processed in
+// smaller chunks by the gc_prime_from_spans() function.
+static int
+gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
+{
+    Py_ssize_t used = gc_mark_buffer_len(args);
+    Py_ssize_t free = BUFFER_SIZE - used;
+    if (free >= size) {
+        for (Py_ssize_t i = 0; i < size; i++) {
+            PyObject *op = item[i];
+            if (op == NULL) {
+                continue;
+            }
+            gc_mark_buffer_push(op, args);
+        }
+    }
+    else {
+        assert(size > 0);
+        PyObject **end = &item[size];
+        if (gc_mark_span_push(&args->spans, item, end) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
 static bool
 gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
                     void *block, size_t block_size, void *args)
@@ -511,25 +750,56 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
     return true;
 }
 
+static int
+gc_mark_traverse_list(PyObject *self, void *args)
+{
+    PyListObject *list = (PyListObject *)self;
+    if (list->ob_item == NULL) {
+        return 0;
+    }
+    if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+static int
+gc_mark_traverse_tuple(PyObject *self, void *args)
+{
+    _PyTuple_MaybeUntrack(self);
+    if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
+        gc_clear_alive(self);
+        return 0;
+    }
+    PyTupleObject *tuple = _PyTuple_CAST(self);
+    if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
 static void
 gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
-                    _PyObjectStack *stack)
+                    gc_mark_args_t *args)
 {
-    // We failed to allocate memory for "stack" while doing the "mark
-    // alive" phase.  In that case, free the object stack and make sure
-    // that no objects have the alive bit set.
-    _PyObjectStack_Clear(stack);
+    // We failed to allocate memory while doing the "mark alive" phase.
+    // In that case, free the memory used for marking state and make
+    // sure that no objects have the alive bit set.
+    _PyObjectStack_Clear(&args->stack);
+    if (args->spans.stack != NULL) {
+        PyMem_Free(args->spans.stack);
+    }
     gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
 }
 
 #ifdef GC_MARK_ALIVE_STACKS
 static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
 {
     if (!PyStackRef_IsNull(stackref)) {
         PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-        if (mark_alive_stack_push(op, stack) < 0) {
+        if (gc_mark_enqueue(op, args) < 0) {
             return -1;
         }
     }
@@ -537,7 +807,7 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
 }
 
 static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
 {
     int err = 0;
     _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
@@ -554,13 +824,13 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
         }
 
         _PyStackRef *top = f->stackpointer;
-        if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+        if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
             err = -1;
             goto exit;
         }
         while (top != f->localsplus) {
             --top;
-            if (gc_visit_stackref_mark_alive(stack, *top) < 0) {
+            if (gc_visit_stackref_mark_alive(args, *top) < 0) {
                 err = -1;
                 goto exit;
             }
@@ -904,22 +1174,124 @@ static int
 move_legacy_finalizer_reachable(struct collection_state *state);
 
 #ifdef GC_ENABLE_MARK_ALIVE
+
+static void
+gc_prime_from_spans(gc_mark_args_t *args)
+{
+    Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+    // there should always be at least this amount of space
+    assert(space <= gc_mark_buffer_avail(args));
+    assert(space > 0);
+    gc_span_t entry = args->spans.stack[--args->spans.size];
+    // spans on the stack should always have one or more elements
+    assert(entry.start < entry.end);
+    do {
+        PyObject *op = *entry.start;
+        entry.start++;
+        if (op != NULL) {
+            gc_mark_buffer_push(op, args);
+            space--;
+            if (space == 0) {
+                // buffer is as full as we want and not done with span
+                gc_mark_span_push(&args->spans, entry.start, entry.end);
+                return;
+            }
+        }
+    } while (entry.start < entry.end);
+}
+
+static void
+gc_prime_buffer(gc_mark_args_t *args)
+{
+    if (args->spans.size > 0) {
+        gc_prime_from_spans(args);
+    }
+    else {
+        // When priming, don't fill the buffer too full since that would
+        // likely cause the stack to be used shortly after when it
+        // fills.  We want to use the buffer as much as possible and so
+        // we only fill to BUFFER_HI, not BUFFER_SIZE.
+        Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+        assert(space > 0);
+        do {
+            PyObject *op = _PyObjectStack_Pop(&args->stack);
+            if (op == NULL) {
+                return;
+            }
+            gc_mark_buffer_push(op, args);
+            space--;
+        } while (space > 0);
+    }
+}
+
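BUFFER_LO and BUFFER_HI give the refill logic a small hysteresis band: draining continues until the queue falls to BUFFER_LO, and priming then tops it back up only to BUFFER_HI, so freshly prefetched entries are not consumed immediately and long spans are drained in modest chunks. The toy program below (illustrative only; integers stand in for object pointers and a simple counter stands in for the span/stack sources) shows the same refill policy in isolation:

#include <assert.h>
#include <stdio.h>

#define Q_CAP 32
#define Q_LO   8   /* start refilling at or below this level */
#define Q_HI  16   /* refill up to this level, well short of capacity */

/* A toy source of work items (stands in for the span stack plus the
 * mark stack) and a toy FIFO queue (stands in for the prefetch buffer). */
static int source_next = 0, source_limit = 100;
static int queue[Q_CAP];
static int q_in = 0, q_out = 0;

static int queue_len(void) { return q_in - q_out; }

static void
refill_to_hi(void)
{
    /* Top the queue back up to Q_HI, not to capacity, so newly added
     * (freshly "prefetched") entries are not consumed right away. */
    while (queue_len() < Q_HI && source_next < source_limit) {
        queue[q_in++ % Q_CAP] = source_next++;
    }
}

int
main(void)
{
    int processed = 0;
    for (;;) {
        if (queue_len() <= Q_LO) {
            refill_to_hi();
            if (queue_len() == 0) {
                break;              /* source exhausted and queue empty */
            }
        }
        int item = queue[q_out++ % Q_CAP];
        (void)item;                 /* "process" the item */
        processed++;
    }
    assert(processed == source_limit);
    printf("processed %d items\n", processed);
    return 0;
}

Filling only to Q_HI rather than Q_CAP mirrors the comment in gc_prime_buffer(): leaving headroom means objects discovered during traversal can still go into the buffer instead of spilling onto the stack.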
 static int
-propagate_alive_bits(_PyObjectStack *stack)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
 {
     for (;;) {
-        PyObject *op = _PyObjectStack_Pop(stack);
-        if (op == NULL) {
-            break;
+        Py_ssize_t buf_used = gc_mark_buffer_len(args);
+        if (buf_used <= BUFFER_LO) {
+            // The mark buffer is getting empty.  If it's too empty
+            // then there will not be enough delay between issuing
+            // the prefetch and when the object is actually accessed.
+            // Prime the buffer with object pointers from the stack or
+            // from the spans, if there are any available.
+            gc_prime_buffer(args);
+            if (gc_mark_buffer_is_empty(args)) {
+                return 0;
+            }
+        }
+        PyObject *op = gc_mark_buffer_pop(args);
+
+        if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+            continue;
         }
-        assert(_PyObject_GC_IS_TRACKED(op));
-        assert(gc_is_alive(op));
+
+        if (gc_is_alive(op)) {
+            continue;  // already visited this object
+        }
+
+        // Need to call tp_traverse on this object.  Mark it alive so we
+        // don't traverse it a second time.
+        gc_set_alive(op);
+
         traverseproc traverse = Py_TYPE(op)->tp_traverse;
-        if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+        if (traverse == PyList_Type.tp_traverse) {
+            if (gc_mark_traverse_list(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse == PyTuple_Type.tp_traverse) {
+            if (gc_mark_traverse_tuple(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
             return -1;
         }
     }
-    return 0;
+}
+
+static int
+gc_propagate_alive(gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_propagate_alive_prefetch(args);
+    }
+    else {
+        for (;;) {
+            PyObject *op = _PyObjectStack_Pop(&args->stack);
+            if (op == NULL) {
+                break;
+            }
+            assert(_PyObject_GC_IS_TRACKED(op));
+            assert(gc_is_alive(op));
+            traverseproc traverse = Py_TYPE(op)->tp_traverse;
+            if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
+                return -1;
+            }
+        }
+        return 0;
+    }
 }
 
 // Using tp_traverse, mark everything reachable from known root objects
@@ -939,48 +1311,64 @@ propagate_alive_bits(_PyObjectStack *stack)
 //
 // Returns -1 on failure (out of memory).
 static int
-mark_alive_from_roots(PyInterpreterState *interp,
-                      struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+                         struct collection_state *state)
 {
 #ifdef GC_DEBUG
     // Check that all objects don't have alive bit set
    gc_visit_heaps(interp, &validate_alive_bits, &state->base);
 #endif
-    _PyObjectStack stack = { NULL };
-
-    #define STACK_PUSH(op) \
-        if (mark_alive_stack_push(op, &stack) < 0) { \
-            gc_abort_mark_alive(interp, state, &stack); \
-            return -1; \
+    gc_mark_args_t mark_args = { 0 };
+
+    // Using prefetch instructions is only a win if the set of objects being
+    // examined by the GC does not fit into CPU caches.  Otherwise, using the
+    // buffer and prefetch instructions is just overhead.  The long-lived
+    // object count seems to be a good estimate of whether things will fit in
+    // the cache.  On 64-bit platforms, the minimum object size is 32 bytes.
+    // A 4MB L2 cache would hold about 130k objects.
+    mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+
+    #define MARK_ENQUEUE(op) \
+        if (op != NULL) { \
+            if (gc_mark_enqueue(op, &mark_args) < 0) { \
+                gc_abort_mark_alive(interp, state, &mark_args); \
+                return -1; \
+            } \
         }
-    STACK_PUSH(interp->sysdict);
+    MARK_ENQUEUE(interp->sysdict);
 #ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-    STACK_PUSH(interp->builtins);
-    STACK_PUSH(interp->dict);
+    MARK_ENQUEUE(interp->builtins);
+    MARK_ENQUEUE(interp->dict);
     struct types_state *types = &interp->types;
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-        STACK_PUSH(types->builtins.initialized[i].tp_dict);
-        STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
     }
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-        STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-        STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
     }
 #endif
 #ifdef GC_MARK_ALIVE_STACKS
-    if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }
 #endif
-    #undef STACK_PUSH
+    #undef MARK_ENQUEUE
 
     // Use tp_traverse to find everything reachable from roots.
-    if (propagate_alive_bits(&stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_propagate_alive(&mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }
+
+    assert(mark_args.spans.size == 0);
+    if (mark_args.spans.stack != NULL) {
+        PyMem_Free(mark_args.spans.stack);
+    }
+    assert(mark_args.stack.head == NULL);
 
     return 0;
 }
 #endif // GC_ENABLE_MARK_ALIVE
@@ -1559,7 +1947,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     if (!state->gcstate->freeze_active) {
         // Mark objects reachable from known roots as "alive".  These will
         // be ignored for rest of the GC pass.
-        int err = mark_alive_from_roots(interp, state);
+        int err = gc_mark_alive_from_roots(interp, state);
         if (err < 0) {
             _PyEval_StartTheWorld(interp);
             PyErr_NoMemory();
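A quick sanity check of the numbers in the comment above the use_prefetch assignment: at the 32-byte minimum object size, a 4 MiB L2 cache holds 4 * 1024 * 1024 / 32 = 131,072 objects, which is the "about 130k" figure, and the 200,000 cutoff for interp->gc.long_lived_total sits comfortably above it. The snippet below (illustrative only; the cache size is an assumption of the heuristic, not a probed value) restates that arithmetic as compile-time checks:

#include <assert.h>

/* Rough cache-capacity estimate behind the use_prefetch threshold.
 * Both figures are order-of-magnitude assumptions, not measurements of
 * any particular CPU. */
#define ASSUMED_L2_BYTES      (4u * 1024u * 1024u)   /* 4 MiB L2 cache   */
#define MIN_GC_OBJECT_BYTES   32u                    /* 64-bit platforms */
#define OBJECTS_FITTING_IN_L2 (ASSUMED_L2_BYTES / MIN_GC_OBJECT_BYTES)

static_assert(OBJECTS_FITTING_IN_L2 == 131072u,
              "4 MiB / 32 B is about 130k objects");

/* Prefetching is enabled only above this long-lived object count, i.e.
 * only when the GC's working set is clearly larger than the estimate. */
#define PREFETCH_THRESHOLD 200000
static_assert(PREFETCH_THRESHOLD > OBJECTS_FITTING_IN_L2,
              "threshold sits above the cache-sized working set");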