@@ -173,13 +173,17 @@ struct override_hash {
 using instance_map = std::unordered_multimap<const void *, instance *>;
 
 // ignore: structure was padded due to alignment specifier
+PYBIND11_WARNING_PUSH
 PYBIND11_WARNING_DISABLE_MSVC(4324)
 
+// Instance map shards are used to reduce mutex contention in free-threaded Python.
 struct alignas(64) instance_map_shard {
     std::mutex mutex;
     instance_map registered_instances;
 };
 
+PYBIND11_WARNING_POP
+
 /// Internal data structure used to track registered instances and types.
 /// Whenever binary incompatible changes are made to this structure,
 /// `PYBIND11_INTERNALS_VERSION` must be incremented.
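The `alignas(64)` pads each shard to a typical cache-line size, so the mutexes of adjacent shards never share a line and locking one shard does not invalidate another's. A minimal standalone sketch of the same pattern (names like `demo_shard` are hypothetical, not pybind11 API; the shard choice is simplified here, and the commit's hash-based selection appears further down):

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <unordered_map>
#include <vector>

// Hypothetical sketch of the sharded-map pattern; not part of the commit.
struct alignas(64) demo_shard {
    std::mutex mutex; // one cache line per shard: no false sharing between locks
    std::unordered_multimap<const void *, int> map;
};

class demo_sharded_map {
public:
    // n must be a power of two so the mask below can replace a modulo.
    explicit demo_sharded_map(std::size_t n) : shards_(n), mask_(n - 1) {}

    void insert(const void *key, int value) {
        // Simplified shard choice: raw address bits masked into the table.
        demo_shard &s = shards_[reinterpret_cast<std::uintptr_t>(key) & mask_];
        std::lock_guard<std::mutex> lock(s.mutex); // contention is per shard
        s.map.emplace(key, value);
    }

private:
    std::vector<demo_shard> shards_;
    std::size_t mask_;
};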
@@ -495,7 +499,7 @@ inline internals **get_internals_pp_from_capsule(handle obj) {
     return static_cast<internals **>(raw_ptr);
 }
 
-inline uint64_t next_pow2(uint64_t x) {
+inline uint64_t round_up_to_next_pow2(uint64_t x) {
     // Round-up to the next power of two.
     // See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
     x--;
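The hunk cuts off after `x--;`; for reference, the complete bit-smearing sequence from the cited bithacks page, extended to 64 bits, would look like the sketch below (a reconstruction, not copied from the commit):

#include <cstdint>

// Sketch of the full round-up per the cited bithacks reference (64-bit form).
inline uint64_t round_up_to_next_pow2_sketch(uint64_t x) {
    x--;          // so exact powers of two map to themselves
    x |= x >> 1;  // smear the highest set bit into every lower position...
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x |= x >> 32; // ...across all 64 bits
    return ++x;   // all-ones below the MSB, plus one: the next power of two
}
// round_up_to_next_pow2_sketch(12) == 16; round_up_to_next_pow2_sketch(16) == 16.
// An input of 0 wraps back to 0, which is why the caller clamps a zero result to 1.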
@@ -578,7 +582,9 @@ PYBIND11_NOINLINE internals &get_internals() {
         internals_ptr->default_metaclass = make_default_metaclass();
         internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
 #ifdef Py_GIL_DISABLED
-        size_t num_shards = (size_t) next_pow2(2 * std::thread::hardware_concurrency());
+        // Scale proportional to the number of cores. 2x is a heuristic to reduce contention.
+        auto num_shards
+            = static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency()));
         if (num_shards == 0) {
             num_shards = 1;
         }
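One subtlety worth spelling out: `std::thread::hardware_concurrency()` may return 0 when the core count cannot be determined, and rounding 0 up wraps back to 0, so the clamp to 1 is required for correctness, not just taste. A hypothetical standalone equivalent, reusing the round-up sketch above:

#include <cstddef>
#include <thread>

// Hypothetical sketch of the shard-count choice; not the commit's code.
inline std::size_t pick_shard_count() {
    // 2x the core count trades a little memory for fewer lock collisions;
    // rounding to a power of two keeps shard selection a cheap mask.
    auto n = static_cast<std::size_t>(
        round_up_to_next_pow2_sketch(2 * std::thread::hardware_concurrency()));
    return n == 0 ? 1 : n; // hardware_concurrency() may legitimately return 0
}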
@@ -658,7 +664,10 @@ inline auto with_internals(const F &cb) -> decltype(cb(get_internals())) {
     return cb(internals);
 }
 
-inline uint64_t splitmix64(uint64_t z) {
+inline std::uint64_t mix64(std::uint64_t z) {
+    // David Stafford's variant 13 of the MurmurHash3 finalizer popularized
+    // by the SplitMix PRNG.
+    // https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html
     z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
     z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
     return z ^ (z >> 31);
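To see why a mixing step is worthwhile at all: nearby inputs (which shifted pointer bits tend to be) should map to unrelated shard indices. A small self-contained avalanche check for this mixer (demo code, not part of the commit):

#include <bitset>
#include <cstdint>
#include <iostream>

// Same constants as mix64 above; standalone copy for the demo.
inline std::uint64_t mix64_demo(std::uint64_t z) {
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
}

int main() {
    std::uint64_t a = mix64_demo(0x1000);
    std::uint64_t b = mix64_demo(0x1001); // input differs by a single bit
    // A good finalizer flips roughly half of the 64 output bits (about 32).
    std::cout << std::bitset<64>(a ^ b).count() << " output bits differ\n";
}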
@@ -675,9 +684,9 @@ inline auto with_instance_map(const void *ptr,
     // other threads/cores to map to other shards. Using the high bits is a good
     // heuristic because memory allocators often have a per-thread
     // arena/superblock/segment from which smaller allocations are served.
-    auto addr = reinterpret_cast<uintptr_t>(ptr);
-    uint64_t hash = splitmix64((uint64_t) (addr >> 20));
-    size_t idx = (size_t) hash & internals.instance_shards_mask;
+    auto addr = reinterpret_cast<std::uintptr_t>(ptr);
+    auto hash = mix64(static_cast<std::uint64_t>(addr >> 20));
+    auto idx = static_cast<size_t>(hash & internals.instance_shards_mask);
 
     auto &shard = internals.instance_shards[idx];
     std::unique_lock<std::mutex> lock(shard.mutex);
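Putting the pieces together: the lookup drops the low 20 bits so addresses in the same ~1 MiB region (plausibly one allocator arena) collapse to a single key, mixes that key, then masks it into the power-of-two shard table. A hypothetical standalone version, reusing `mix64_demo` from the previous sketch:

#include <cstddef>
#include <cstdint>

// Hypothetical sketch of the shard selection above; mask = shard_count - 1.
inline std::size_t shard_index_demo(const void *ptr, std::size_t mask) {
    auto addr = reinterpret_cast<std::uintptr_t>(ptr);
    // Pointers in the same 1 MiB region (likely the same thread's arena)
    // share (addr >> 20) and therefore the same shard and mutex.
    auto hash = mix64_demo(static_cast<std::uint64_t>(addr >> 20));
    return static_cast<std::size_t>(hash & mask);
}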
@@ -688,6 +697,8 @@ inline auto with_instance_map(const void *ptr,
 #endif
 }
 
+// Returns the number of registered instances for testing purposes. The result may not be
+// consistent if other threads are registering or unregistering instances concurrently.
 inline size_t num_registered_instances() {
     auto &internals = get_internals();
 #ifdef Py_GIL_DISABLED
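The free-threaded branch presumably has to walk every shard under its lock to produce a total, which is what makes the result a snapshot rather than an exact count. A sketch under that assumption, using the `demo_shard` type from the first sketch (not the commit's actual body, which the hunk truncates):

#include <cstddef>
#include <mutex>
#include <vector>

// Assumed shape of a sharded count: each shard's size is exact at the moment
// it is read, but shards visited earlier may change before the loop finishes,
// which is exactly the caveat in the comment above.
inline std::size_t count_registered_demo(std::vector<demo_shard> &shards) {
    std::size_t total = 0;
    for (demo_shard &s : shards) {
        std::lock_guard<std::mutex> lock(s.mutex);
        total += s.map.size();
    }
    return total;
}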