diff --git a/libr/bin/bfile.c b/libr/bin/bfile.c index 37be0b32ebab5c..93fe931f720721 100644 --- a/libr/bin/bfile.c +++ b/libr/bin/bfile.c @@ -102,7 +102,13 @@ static void print_string(RBinFile *bf, RBinString *string, int raw, PJ *pj) { } // TODO: this code must be implemented in RSearch as options for the strings mode -static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) { +static int string_scan_range(R_NULLABLE RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) { + R_RETURN_VAL_IF_FAIL (bf, -1); +#if R2_USE_NEW_ABI + int utf_list_size = 0; + int *utf_list = NULL; + int *utf_freq = NULL; +#endif RBin *bin = bf->rbin; const bool strings_nofp = bin->strings_nofp; ut8 tmp[64]; // temporal buffer to encode characters in utf8 form @@ -120,9 +126,6 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from maxstr = R_STRING_SCAN_BUFFER_SIZE; } - // if list is null it means its gonna dump - R_RETURN_VAL_IF_FAIL (bf, -1); - if (type == -1) { type = R_STRING_TYPE_DETECT; } @@ -213,7 +216,6 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from if (!addr_aligned) { is_wide32le = false; } - ///is_wide32be &= (n1 < 0xff && n11 < 0xff); // false; // n11 < 0xff; if (is_wide32le && addr_aligned) { str_type = R_STRING_TYPE_WIDE32; // asume big endian,is there little endian w32? } else { @@ -222,11 +224,9 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from str_type = is_wide? R_STRING_TYPE_WIDE: R_STRING_TYPE_ASCII; } } else { - if (rc > 1) { - str_type = R_STRING_TYPE_UTF8; // could be charset if set :? - } else { - str_type = R_STRING_TYPE_ASCII; - } + str_type = (rc > 1) + ? R_STRING_TYPE_UTF8 + : R_STRING_TYPE_ASCII; } } else if (type == R_STRING_TYPE_UTF8) { str_type = R_STRING_TYPE_ASCII; // initial assumption @@ -315,15 +315,20 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from // back up past the \0 to the last char just in case it starts a wide string needle -= 2; } + // TODO: allow the user to filter strings by type at scan time, this is, dont expect utf32 or utf16 strings if (runes >= min) { const char *tmpstr = r_strbuf_get (sb); size_t tmplen = r_strbuf_length (sb); // reduce false positives +#if R2_USE_NEW_ABI + int j, num_blocks; +#else int j, num_blocks, *block_list; +#endif int *freq_list = NULL, expected_ascii, actual_ascii, num_chars; if (str_type == R_STRING_TYPE_ASCII) { for (j = 0; j < tmplen; j++) { - char ch = tmpstr[j]; + const char ch = tmpstr[j]; if (ch != '\n' && ch != '\r' && ch != '\t') { if (!IS_PRINTABLE (ch)) { continue; @@ -335,13 +340,64 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from case R_STRING_TYPE_UTF8: case R_STRING_TYPE_WIDE: case R_STRING_TYPE_WIDE32: +#if R2_USE_NEW_ABI + if (tmplen > utf_list_size) { + int newsize = tmplen + 128; + int *a = realloc (utf_list, sizeof (int) * newsize); + int *b = realloc (freq_list, sizeof (int) * newsize); + if (a && b) { + utf_list_size = newsize; + utf_list = a; + utf_freq = b; + } else { + R_LOG_ERROR ("Cannot allocate %d", tmplen); + return 0; + } + } + // freq_list = (str_type == R_STRING_TYPE_WIDE || str_type == R_STRING_TYPE_WIDE32)? utf_freq: NULL; + freq_list = (str_type == R_STRING_TYPE_WIDE)? utf_freq: NULL; + num_blocks = r_utf_block_list2 ((const ut8*)tmpstr, tmplen - 1, utf_list, freq_list); + if (freq_list) { + num_chars = 0; + actual_ascii = 0; + for (j = 0; j < num_blocks; j++) { + num_chars += freq_list[j]; + if (!utf_list[j]) { // ASCII + actual_ascii = freq_list[j]; + } + } + expected_ascii = num_blocks ? num_chars / num_blocks : 0; + if (actual_ascii > expected_ascii) { + ascii_only = true; + if (str_type == R_STRING_TYPE_UTF8) { + str_type = R_STRING_TYPE_ASCII; + R_LOG_DEBUG ("ascii string miss identified as utf8"); + } + needle = str_start; + continue; + } + } + if (num_blocks > R_STRING_MAX_UNI_BLOCKS) { + needle++; + continue; + } +#else num_blocks = 0; block_list = r_utf_block_list ((const ut8*)tmpstr, tmplen - 1, str_type == R_STRING_TYPE_WIDE? &freq_list: NULL); if (block_list) { +#if 0 + for (j = 0; block_list[j] != -1 && block_list[j] < 200; j++) { + num_blocks++; + } +#else for (j = 0; block_list[j] != -1; j++) { num_blocks++; } +#endif + if (num_blocks > 0) { + num_blocks--; + } } if (freq_list) { num_chars = 0; @@ -356,9 +412,16 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from expected_ascii = num_blocks ? num_chars / num_blocks : 0; if (actual_ascii > expected_ascii) { ascii_only = true; - needle = str_start; + if (str_type == R_STRING_TYPE_UTF8) { + str_type = R_STRING_TYPE_ASCII; + R_LOG_DEBUG ("ascii string miss identified as utf8"); + } free (block_list); - continue; + if (str_start > needle) { + needle = str_start; + continue; + } + break; } } free (block_list); @@ -366,6 +429,7 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from needle++; continue; } +#endif } RBinString *bs = R_NEW0 (RBinString); if (!bs) { @@ -447,6 +511,10 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from pj_free (pj); } r_strbuf_free (sb); +#if R2_USE_NEW_ABI + free (utf_list); + free (utf_freq); +#endif return bf->string_count; } diff --git a/libr/include/r_util/r_utf8.h b/libr/include/r_util/r_utf8.h index c6ecda7120b910..6b7f0c19a75e27 100644 --- a/libr/include/r_util/r_utf8.h +++ b/libr/include/r_util/r_utf8.h @@ -22,7 +22,10 @@ R_API char *r_utf16_to_utf8_l(const wchar_t *wc, int len); R_API const char *r_utf_block_name(int idx); R_API wchar_t *r_utf8_to_utf16_l(const char *cstring, int len); R_API int r_utf_block_idx(RRune ch); -R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list); +R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list); // XXX deprecate +#if R2_USE_NEW_ABI +R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list); +#endif R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen); #define r_utf16_to_utf8(wc) r_utf16_to_utf8_l ((wchar_t *)wc, -1) #define r_utf8_to_utf16(cstring) r_utf8_to_utf16_l ((char *)cstring, -1) diff --git a/libr/util/utf8.c b/libr/util/utf8.c index 6ea24a5b6252b9..6a9e0965ac221a 100644 --- a/libr/util/utf8.c +++ b/libr/util/utf8.c @@ -1,4 +1,6 @@ -/* radare2 - LGPL - Copyright 2014-2018 - thelemon, kazarmy, pancake */ +/* radare2 - LGPL - Copyright 2014-2024 - thelemon, kazarmy, pancake */ + +// R2R db/cmd/cmd_iz #include #include @@ -495,17 +497,20 @@ R_API int r_utf8_decode(const ut8 *ptr, int ptrlen, RRune *ch) { *ch = (ut32)ptr[0]; } return 1; - } else if (ptrlen>1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) { + } + if (ptrlen > 1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) { if (ch) { *ch = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f); } return 2; - } else if (ptrlen>2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) { + } + if (ptrlen > 2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) { if (ch) { *ch = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f); } return 3; - } else if (ptrlen>3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) { + } + if (ptrlen > 3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) { if (ch) { *ch = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f); } @@ -519,16 +524,19 @@ R_API int r_utf8_encode(ut8 *ptr, const RRune ch) { if (ch < 0x80) { ptr[0] = (ut8)ch; return 1; - } else if (ch < 0x800) { + } + if (ch < 0x800) { ptr[0] = 0xc0 | (ch >> 6); ptr[1] = 0x80 | (ch & 0x3f); return 2; - } else if (ch < 0x10000) { + } + if (ch < 0x10000) { ptr[0] = 0xe0 | (ch >> 12); ptr[1] = 0x80 | ((ch >> 6) & 0x3f); ptr[2] = 0x80 | (ch & 0x3f); return 3; - } else if (ch < 0x200000) { + } + if (ch < 0x200000) { ptr[0] = 0xf0 | (ch >> 18); ptr[1] = 0x80 | ((ch >> 12) & 0x3f); ptr[2] = 0x80 | ((ch >> 6) & 0x3f); @@ -726,13 +734,11 @@ R_API char *r_acp_to_utf8_l(const char *str, int len) { R_API int r_utf_block_idx(RRune ch) { const int last = R_UTF_BLOCKS_COUNT; - int low, hi, mid; - - low = 0; - hi = last - 1; + int low = 0; + int hi = last - 1; do { - mid = (low + hi) >> 1; + int mid = (low + hi) >> 1; if (ch >= r_utf_blocks[mid].from && ch <= r_utf_blocks[mid].to) { return mid; } @@ -747,11 +753,57 @@ R_API int r_utf_block_idx(RRune ch) { return R_UTF_BLOCKS_COUNT - 1; /* index for "No_Block" */ } +#if R2_USE_NEW_ABI +R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list) { + // list must be sizeof (int) * len + 1 at least + if (!str || len < 1) { + return 0; + } + R_RETURN_VAL_IF_FAIL (len >= 0, 0); + int block_freq[R_UTF_BLOCKS_COUNT] = {0}; + int num_blocks = 0; + int *list_ptr = list; + const ut8 *str_ptr = str; + const ut8 *str_end = str + len; + RRune ch; + bool eos = false; + while (str_ptr < str_end) { + int block_idx; + int runesize = r_utf8_decode (str_ptr, str_end - str_ptr, &ch); + if (runesize > 0) { + block_idx = r_utf_block_idx (ch); + if (!block_freq[block_idx]) { + *list_ptr++ = block_idx; + if (block_idx == -1) { + eos = true; + } + if (!eos) { + num_blocks++; + } + } + block_freq[block_idx]++; + str_ptr += runesize; + } else { + str_ptr++; + } + } + *list_ptr = -1; + int i; + if (freq_list) { + int *p = freq_list; + for (i = 0; i < num_blocks; i++) { + *p++ = block_freq[list[i]]; + } + *p = -1; + } + return num_blocks; +} +#endif + /* str must be UTF8-encoded */ +// R2_600 DEPRECATE THIS R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) { - if (!str) { - return NULL; - } +#if 1 if (len < 0) { len = strlen ((const char *)str); } @@ -797,12 +849,36 @@ R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) { } *freq_list_ptr = -1; } - for (list_ptr = list; *list_ptr != -1; list_ptr++) { - block_freq[*list_ptr] = 0; +#else + int *freq_list_ptr = NULL; + int *list = R_NEWS (int, len + 1); + if (freq_list) { + *freq_list = R_NEWS (int, len + 1); + if (!*freq_list) { + free (list); + return NULL; + } + freq_list_ptr = *freq_list; + } + int count = r_utf_block_list2 (str, len, list, freq_list_ptr); + if (count > 0) { + if (freq_list) { + freq_list[count] = -1; + } } +#if 1 + if (count < 1) { + free (list); + free (freq_list_ptr); + return NULL; + } +#endif + +#endif return list; } + R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen) { if (ptrlen > 3) { if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) { diff --git a/test/db/cmd/cmd_iz b/test/db/cmd/cmd_iz index b2a33b328b0321..9dbc88bf8f8faa 100644 --- a/test/db/cmd/cmd_iz +++ b/test/db/cmd/cmd_iz @@ -1,3 +1,16 @@ +NAME=iz le32 +FILE=- +CMDS=<