Skip to content

Commit c61f197

Browse files
committed
Merge branch 'pep703-support' of https://github.com/colesbury/pybind11 into pep703-support
2 parents 44d1fc7 + 85f3ff4 commit c61f197

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

include/pybind11/detail/internals.h

+17-6
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,17 @@ struct override_hash {
173173
using instance_map = std::unordered_multimap<const void *, instance *>;
174174

175175
// ignore: structure was padded due to alignment specifier
176+
PYBIND11_WARNING_PUSH
176177
PYBIND11_WARNING_DISABLE_MSVC(4324)
177178

179+
// Instance map shards are used to reduce mutex contention in free-threaded Python.
178180
struct alignas(64) instance_map_shard { // 64-byte alignment; presumably one cache line per shard to avoid false sharing -- TODO confirm
179181
std::mutex mutex; // with_instance_map locks this after selecting the shard
180182
instance_map registered_instances; // this shard's pointer -> instance multimap (see instance_map)
181183
};
182184

185+
PYBIND11_WARNING_POP
186+
183187
/// Internal data structure used to track registered instances and types.
184188
/// Whenever binary incompatible changes are made to this structure,
185189
/// `PYBIND11_INTERNALS_VERSION` must be incremented.
@@ -495,7 +499,7 @@ inline internals **get_internals_pp_from_capsule(handle obj) {
495499
return static_cast<internals **>(raw_ptr);
496500
}
497501

498-
inline uint64_t next_pow2(uint64_t x) {
502+
inline uint64_t round_up_to_next_pow2(uint64_t x) {
499503
// Round up to the next power of two.
500504
// See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
501505
x--;
@@ -578,7 +582,9 @@ PYBIND11_NOINLINE internals &get_internals() {
578582
internals_ptr->default_metaclass = make_default_metaclass();
579583
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
580584
#ifdef Py_GIL_DISABLED
581-
size_t num_shards = (size_t) next_pow2(2 * std::thread::hardware_concurrency());
585+
// Scale proportionally to the number of cores. 2x is a heuristic to reduce contention.
586+
auto num_shards
587+
= static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency()));
582588
if (num_shards == 0) {
583589
num_shards = 1;
584590
}
@@ -658,7 +664,10 @@ inline auto with_internals(const F &cb) -> decltype(cb(get_internals())) {
658664
return cb(internals);
659665
}
660666

661-
inline uint64_t splitmix64(uint64_t z) {
667+
inline std::uint64_t mix64(std::uint64_t z) {
668+
// David Stafford's variant 13 of the MurmurHash3 finalizer popularized
669+
// by the SplitMix PRNG.
670+
// https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html
662671
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
663672
z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
664673
return z ^ (z >> 31);
@@ -675,9 +684,9 @@ inline auto with_instance_map(const void *ptr,
675684
// other threads/cores to map to other shards. Using the high bits is a good
676685
// heuristic because memory allocators often have a per-thread
677686
// arena/superblock/segment from which smaller allocations are served.
678-
auto addr = reinterpret_cast<uintptr_t>(ptr);
679-
uint64_t hash = splitmix64((uint64_t) (addr >> 20));
680-
size_t idx = (size_t) hash & internals.instance_shards_mask;
687+
auto addr = reinterpret_cast<std::uintptr_t>(ptr);
688+
auto hash = mix64(static_cast<std::uint64_t>(addr >> 20));
689+
auto idx = static_cast<size_t>(hash & internals.instance_shards_mask);
681690

682691
auto &shard = internals.instance_shards[idx];
683692
std::unique_lock<std::mutex> lock(shard.mutex);
@@ -688,6 +697,8 @@ inline auto with_instance_map(const void *ptr,
688697
#endif
689698
}
690699

700+
// Returns the number of registered instances for testing purposes. The result may not be
701+
// consistent if other threads are registering or unregistering instances concurrently.
691702
inline size_t num_registered_instances() {
692703
auto &internals = get_internals();
693704
#ifdef Py_GIL_DISABLED

0 commit comments

Comments
 (0)