From 4dbd495e9f3d87a83c3201ef9d851e85f7133db7 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Tue, 28 May 2024 09:27:51 -0400 Subject: [PATCH] Find the symbols via DT_GNU_HASH instead of DT_HASH. (#36) * Switch to CMake 3.5. This avoids a warning when building with modern CMake. Signed-off-by: Chris Lalancette * Find the symbols via DT_GNU_HASH instead of DT_HASH. Since glibc 2.36 (released in August 2022), libc.so.6 is built with the default value of --hash-style on all platforms. The immediate effect of this is that the linker no longer generates a DT_HASH section, which is what Mimick uses to detect vital functions like vfprintf and abort. It turns out that Ubuntu and Debian specifically override this behavior on amd64 and i386, since there are some proprietary applications on those platforms that depend on DT_HASH. However, this override is *not* applied on aarch64, so there is no DT_HASH. This explains the discrepancy we see when running CI on amd64 (where Mimick tests succeed) and aarch64 (where Mimick tests fail). It also turns out that DT_HASH is "deprecated", and has been for about 15 years. Thus, all of our platforms (going back to RHEL-8) support its replacement, DT_GNU_HASH. Accordingly, this commit implements getting symbols from DT_GNU_HASH instead of from DT_HASH. Note that it also changes the lookup so that we *prefer* to get the data from DT_GNU_HASH, as someday DT_HASH may go away entirely. I should note that I borrowed heavily from https://flapenguin.me/elf-dt-gnu-hash and https://sourceware.org/git/?p=glibc.git;a=blob;f=elf/dl-lookup.c;h=3d2369dbf2b7ca219eaf80a820e2a8e1329fbf50;hb=HEAD#l350 to implement this, though I made a bunch of changes to fix warnings and better integrate it into the Mimick source code. 
Signed-off-by: Chris Lalancette --- CMakeLists.txt | 2 +- src/plt-elf.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7a7b54..c911dd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # Redistribution and use of this file is allowed according to the terms of the MIT license. # For details see the LICENSE file distributed with Mimick. -cmake_minimum_required (VERSION 2.8.12) +cmake_minimum_required (VERSION 3.5) project (Mimick C CXX) # Default to C++11 diff --git a/src/plt-elf.c b/src/plt-elf.c index d2501aa..953f820 100644 --- a/src/plt-elf.c +++ b/src/plt-elf.c @@ -317,6 +317,16 @@ void plt_reset_offsets(plt_offset *offset, size_t nb_off) } } +static uint32_t elf_gnu_hash(const char * name) +{ + uint32_t h = 5381; + + for (unsigned char c = *name; c != '\0'; c = *++name) + h = (h << 5) + h + *name; + + return h; +} + static unsigned long elf_hash (const char *s) { unsigned long h = 0, high; @@ -329,6 +339,54 @@ static unsigned long elf_hash (const char *s) return h; } +static ElfW(Sym) *elf_gnu_hash_find(const ElfW(Word) *gnu_hash, ElfW(Sym) *symtab, + const char *strtab, const char *name) +{ + const uint32_t namehash = elf_gnu_hash(name); + + const uint32_t nbuckets = gnu_hash[0]; + const uint32_t symoffset = gnu_hash[1]; + const uint32_t bloom_size = gnu_hash[2]; + const uint32_t bloom_shift = gnu_hash[3]; + const ElfWord *bloom = (void*)&gnu_hash[4]; + const uint32_t *buckets = (void*)&bloom[bloom_size]; + const uint32_t *chain = &buckets[nbuckets]; + + ElfWord word = bloom[(namehash / MMK_BITS) % bloom_size]; + ElfWord mask = 0 + | (ElfWord)1 << (namehash % MMK_BITS) + | (ElfWord)1 << ((namehash >> bloom_shift) % MMK_BITS); + + /* If at least one bit is not set, a symbol is surely missing. 
*/ + if ((word & mask) != mask) + return NULL; + + uint32_t symix = buckets[namehash % nbuckets]; + if (symix < symoffset) + return NULL; + + size_t name_len = mmk_strlen(name); + + /* Loop through the chain. */ + while (1) { + const char *symname = strtab + symtab[symix].st_name; + const uint32_t hash = chain[symix - symoffset]; + size_t symname_len = strlen(symname); + size_t cmp_len = (name_len < symname_len) ? name_len : symname_len; + + if ((namehash|1) == (hash|1) && mmk_memcmp(name, symname, cmp_len) == 0) + return &symtab[symix]; + + /* Chain ends with an element with the lowest bit set to 1. */ + if (hash & 1) + break; + + symix++; + } + + return NULL; +} + static ElfW(Sym) *elf_hash_find(ElfW(Word) *hash, ElfW(Sym) *symtab, const char *strtab, const char *name) { @@ -351,13 +409,32 @@ static ElfW(Sym) *elf_hash_find(ElfW(Word) *hash, ElfW(Sym) *symtab, static ElfW(Sym) *sym_lookup_dyn(plt_lib lib, const char *name) { - ElfW(Word) *hash = (ElfW(Word)*) lib_dt_lookup(lib, DT_HASH); - ElfW(Sym) *symtab = (ElfW(Sym)*) lib_dt_lookup(lib, DT_SYMTAB); - const char *strtab = (const char*) lib_dt_lookup(lib, DT_STRTAB); + ElfW(Sym) *symtab = (ElfW(Sym)*) lib_dt_lookup(lib, DT_SYMTAB); + if (!symtab) + return NULL; - if (!hash || !symtab || !strtab) + const char *strtab = (const char*) lib_dt_lookup(lib, DT_STRTAB); + if (!strtab) return NULL; - return elf_hash_find (hash, symtab, strtab, name); + + ElfW(Sym) *symbol = NULL; + + // DT_GNU_HASH is the "modern" way to lookup symbols. If we have that, + // use it. + ElfW(Word) *gnu_hash = (ElfW(Word)*) lib_dt_lookup(lib, DT_GNU_HASH); + if (gnu_hash) { + symbol = elf_gnu_hash_find(gnu_hash, symtab, strtab, name); + if (symbol) + return symbol; + } + + // DT_HASH is the older, deprecated way to find symbols. Only + // attempt to use it if we can't find the symbols via DT_GNU_HASH. 
+ ElfW(Word) *hash = (ElfW(Word)*) lib_dt_lookup(lib, DT_HASH); + if (hash) + symbol = elf_hash_find (hash, symtab, strtab, name); + + return symbol; } plt_fn *plt_get_real_fn(plt_ctx ctx, const char *name)