Skip to content

Commit

Permalink
Merge pull request #378 from Shopify/key-cache-2
Browse files Browse the repository at this point in the history
Unpacker: `key_cache` option.
  • Loading branch information
byroot authored Nov 14, 2024
2 parents 83a2600 + f8bc6da commit eb4859d
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 8 deletions.
2 changes: 2 additions & 0 deletions doclib/msgpack/unpacker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Unpacker
# Supported options:
#
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
# * *:key_cache* Enable caching of map keys, this can improve performance significantly if the same map keys are frequently encountered, but also degrade performance if that's not the case.
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
#
# See also Buffer#initialize for other options.
Expand Down
127 changes: 127 additions & 0 deletions ext/msgpack/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,4 +473,131 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
}

// Hash keys are likely to be repeated, and are frozen.
// As such we can re-use them if we keep a cache of the ones we've seen so far,
// and save much more expensive lookups into the global fstring table.
// This cache implementation is deliberately simple, as we're optimizing for compactness,
// to be able to fit easily embeded inside msgpack_unpacker_t.
// As such, binary search into a sorted array gives a good tradeoff between compactness and
// performance.
#define MSGPACK_KEY_CACHE_CAPACITY 63

typedef struct msgpack_key_cache_t msgpack_key_cache_t;
struct msgpack_key_cache_t {
int length;
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
};

static inline VALUE build_interned_string(const char *str, const long length)
{
# ifdef HAVE_RB_ENC_INTERNED_STR
return rb_enc_interned_str(str, length, rb_utf8_encoding());
# else
VALUE rstring = rb_utf8_str_new(str, length);
return rb_funcall(rb_str_freeze(rstring), s_uminus, 0);
# endif
}

static inline VALUE build_symbol(const char *str, const long length)
{
return rb_str_intern(build_interned_string(str, length));
}

static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
MEMMOVE(&cache->entries[index + 1], &cache->entries[index], VALUE, cache->length - index);
cache->length++;
cache->entries[index] = rstring;
}

static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
long rstring_length = RSTRING_LEN(rstring);
if (length == rstring_length) {
return memcmp(str, RSTRING_PTR(rstring), length);
} else {
return (int)(length - rstring_length);
}
}

static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, entry);

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rstring = build_interned_string(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rstring);
}
return rstring;
}

static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rsymbol = build_symbol(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rsymbol);
}
return rsymbol;
}

static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rsymbol_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rstring_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

#endif
1 change: 1 addition & 0 deletions ext/msgpack/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
have_func("rb_enc_interned_str", "ruby.h") # Ruby 3.0+
have_func("rb_hash_new_capa", "ruby.h") # Ruby 3.2+
have_func("rb_proc_call_with_block", "ruby.h") # CRuby (TruffleRuby doesn't have it)
have_func("rb_gc_mark_locations", "ruby.h") # Missing on TruffleRuby

append_cflags([
"-fvisibility=hidden",
Expand Down
45 changes: 41 additions & 4 deletions ext/msgpack/unpacker.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@
#define rb_proc_call_with_block(recv, argc, argv, block) rb_funcallv(recv, rb_intern("call"), argc, argv)
#endif

#ifndef HAVE_RB_GC_MARK_LOCATIONS
// For TruffleRuby
void rb_gc_mark_locations(const VALUE *start, const VALUE *end)
{
VALUE *value = start;

while (value < end) {
rb_gc_mark(*value);
value++;
}
}
#endif

struct protected_proc_call_args {
VALUE proc;
int argc;
Expand Down Expand Up @@ -130,11 +143,18 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
}
}

void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
const VALUE *entries = &cache->entries[0];
rb_gc_mark_locations(entries, entries + cache->length);
}

void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
{
rb_gc_mark(uk->last_object);
rb_gc_mark(uk->reading_raw);
msgpack_unpacker_mark_stack(&uk->stack);
msgpack_unpacker_mark_key_cache(&uk->key_cache);
/* See MessagePack_Buffer_wrap */
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
rb_gc_mark(uk->buffer_ref);
Expand Down Expand Up @@ -374,15 +394,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
size_t length = uk->reading_raw_remaining;
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
int ret;
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
ret = object_complete_symbol(uk, symbol);
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
VALUE key;
if (uk->symbolize_keys) {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
}
ret = object_complete_symbol(uk, key);
} else {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
}

ret = object_complete(uk, key);
}
} else {
bool will_freeze = uk->freeze;
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
will_freeze = will_freeze || is_reading_map_key(uk);
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
ret = object_complete(uk, string);
} else {
Expand Down
16 changes: 12 additions & 4 deletions ext/msgpack/unpacker.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
struct msgpack_unpacker_t {
msgpack_buffer_t buffer;
msgpack_unpacker_stack_t stack;
msgpack_key_cache_t key_cache;

VALUE self;
VALUE last_object;
Expand All @@ -66,10 +67,12 @@ struct msgpack_unpacker_t {

/* options */
int symbol_ext_type;
bool symbolize_keys;
bool freeze;
bool allow_unknown_ext;
bool optimized_symbol_ext_type;

bool use_key_cache: 1;
bool symbolize_keys: 1;
bool freeze: 1;
bool allow_unknown_ext: 1;
bool optimized_symbol_ext_type: 1;
};

#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
Expand Down Expand Up @@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
uk->symbolize_keys = enable;
}

static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
uk->use_key_cache = enable;
}

static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
{
uk->freeze = enable;
Expand Down
5 changes: 5 additions & 0 deletions ext/msgpack/unpacker_class.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.

static VALUE sym_symbolize_keys;
static VALUE sym_key_cache;
static VALUE sym_freeze;
static VALUE sym_allow_unknown_ext;

Expand Down Expand Up @@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
if(options != Qnil) {
VALUE v;

v = rb_hash_aref(options, sym_key_cache);
msgpack_unpacker_set_key_cache(uk, RTEST(v));

v = rb_hash_aref(options, sym_symbolize_keys);
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));

Expand Down Expand Up @@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);

sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
sym_key_cache = ID2SYM(rb_intern("key_cache"));
sym_freeze = ID2SYM(rb_intern("freeze"));
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));

Expand Down

0 comments on commit eb4859d

Please sign in to comment.