Skip to content

Commit

Permalink
Dont use the heap when scanning for utf8 strings ##bin
Browse files Browse the repository at this point in the history
  • Loading branch information
radare committed Jul 10, 2024
1 parent 4d2a841 commit c326599
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 43 deletions.
94 changes: 81 additions & 13 deletions libr/bin/bfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,13 @@ static void print_string(RBinFile *bf, RBinString *string, int raw, PJ *pj) {
}

// TODO: this code must be implemented in RSearch as options for the strings mode
static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) {
static int string_scan_range(R_NULLABLE RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) {
R_RETURN_VAL_IF_FAIL (bf, -1);
#if R2_USE_NEW_ABI
int utf_list_size = 0;
int *utf_list = NULL;
int *utf_freq = NULL;
#endif
RBin *bin = bf->rbin;
const bool strings_nofp = bin->strings_nofp;
ut8 tmp[64]; // temporal buffer to encode characters in utf8 form
Expand All @@ -120,9 +126,6 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
maxstr = R_STRING_SCAN_BUFFER_SIZE;
}

// if list is null it means its gonna dump
R_RETURN_VAL_IF_FAIL (bf, -1);

if (type == -1) {
type = R_STRING_TYPE_DETECT;
}
Expand Down Expand Up @@ -213,7 +216,6 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
if (!addr_aligned) {
is_wide32le = false;
}
///is_wide32be &= (n1 < 0xff && n11 < 0xff); // false; // n11 < 0xff;
if (is_wide32le && addr_aligned) {
str_type = R_STRING_TYPE_WIDE32; // asume big endian,is there little endian w32?
} else {
Expand All @@ -222,11 +224,9 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
str_type = is_wide? R_STRING_TYPE_WIDE: R_STRING_TYPE_ASCII;
}
} else {
if (rc > 1) {
str_type = R_STRING_TYPE_UTF8; // could be charset if set :?
} else {
str_type = R_STRING_TYPE_ASCII;
}
str_type = (rc > 1)
? R_STRING_TYPE_UTF8
: R_STRING_TYPE_ASCII;
}
} else if (type == R_STRING_TYPE_UTF8) {
str_type = R_STRING_TYPE_ASCII; // initial assumption
Expand Down Expand Up @@ -315,15 +315,20 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
// back up past the \0 to the last char just in case it starts a wide string
needle -= 2;
}
// TODO: allow the user to filter strings by type at scan time, this is, dont expect utf32 or utf16 strings
if (runes >= min) {
const char *tmpstr = r_strbuf_get (sb);
size_t tmplen = r_strbuf_length (sb);
// reduce false positives
#if R2_USE_NEW_ABI
int j, num_blocks;
#else
int j, num_blocks, *block_list;
#endif
int *freq_list = NULL, expected_ascii, actual_ascii, num_chars;
if (str_type == R_STRING_TYPE_ASCII) {
for (j = 0; j < tmplen; j++) {
char ch = tmpstr[j];
const char ch = tmpstr[j];
if (ch != '\n' && ch != '\r' && ch != '\t') {
if (!IS_PRINTABLE (ch)) {
continue;
Expand All @@ -335,13 +340,64 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
case R_STRING_TYPE_UTF8:
case R_STRING_TYPE_WIDE:
case R_STRING_TYPE_WIDE32:
#if R2_USE_NEW_ABI
if (tmplen > utf_list_size) {
int newsize = tmplen + 128;
int *a = realloc (utf_list, sizeof (int) * newsize);
int *b = realloc (freq_list, sizeof (int) * newsize);
if (a && b) {
utf_list_size = newsize;
utf_list = a;
utf_freq = b;
} else {
R_LOG_ERROR ("Cannot allocate %d", tmplen);
return 0;
}
}
// freq_list = (str_type == R_STRING_TYPE_WIDE || str_type == R_STRING_TYPE_WIDE32)? utf_freq: NULL;
freq_list = (str_type == R_STRING_TYPE_WIDE)? utf_freq: NULL;
num_blocks = r_utf_block_list2 ((const ut8*)tmpstr, tmplen - 1, utf_list, freq_list);
if (freq_list) {
num_chars = 0;
actual_ascii = 0;
for (j = 0; j < num_blocks; j++) {
num_chars += freq_list[j];
if (!utf_list[j]) { // ASCII
actual_ascii = freq_list[j];
}
}
expected_ascii = num_blocks ? num_chars / num_blocks : 0;
if (actual_ascii > expected_ascii) {
ascii_only = true;
if (str_type == R_STRING_TYPE_UTF8) {
str_type = R_STRING_TYPE_ASCII;
R_LOG_DEBUG ("ascii string miss identified as utf8");
}
needle = str_start;
continue;
}
}
if (num_blocks > R_STRING_MAX_UNI_BLOCKS) {
needle++;
continue;
}
#else
num_blocks = 0;
block_list = r_utf_block_list ((const ut8*)tmpstr, tmplen - 1,
str_type == R_STRING_TYPE_WIDE? &freq_list: NULL);
if (block_list) {
#if 0
for (j = 0; block_list[j] != -1 && block_list[j] < 200; j++) {
num_blocks++;
}
#else
for (j = 0; block_list[j] != -1; j++) {
num_blocks++;
}
#endif
if (num_blocks > 0) {
num_blocks--;
}
}
if (freq_list) {
num_chars = 0;
Expand All @@ -356,16 +412,24 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
expected_ascii = num_blocks ? num_chars / num_blocks : 0;
if (actual_ascii > expected_ascii) {
ascii_only = true;
needle = str_start;
if (str_type == R_STRING_TYPE_UTF8) {
str_type = R_STRING_TYPE_ASCII;
R_LOG_DEBUG ("ascii string miss identified as utf8");
}
free (block_list);
continue;
if (str_start > needle) {
needle = str_start;
continue;
}
break;
}
}
free (block_list);
if (num_blocks > R_STRING_MAX_UNI_BLOCKS) {
needle++;
continue;
}
#endif
}
RBinString *bs = R_NEW0 (RBinString);
if (!bs) {
Expand Down Expand Up @@ -447,6 +511,10 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
pj_free (pj);
}
r_strbuf_free (sb);
#if R2_USE_NEW_ABI
free (utf_list);
free (utf_freq);
#endif
return bf->string_count;
}

Expand Down
5 changes: 4 additions & 1 deletion libr/include/r_util/r_utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ R_API char *r_utf16_to_utf8_l(const wchar_t *wc, int len);
R_API const char *r_utf_block_name(int idx);
R_API wchar_t *r_utf8_to_utf16_l(const char *cstring, int len);
R_API int r_utf_block_idx(RRune ch);
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list);
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list); // XXX deprecate
#if R2_USE_NEW_ABI
R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list);
#endif
R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen);
#define r_utf16_to_utf8(wc) r_utf16_to_utf8_l ((wchar_t *)wc, -1)
#define r_utf8_to_utf16(cstring) r_utf8_to_utf16_l ((char *)cstring, -1)
Expand Down
110 changes: 93 additions & 17 deletions libr/util/utf8.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/* radare2 - LGPL - Copyright 2014-2018 - thelemon, kazarmy, pancake */
/* radare2 - LGPL - Copyright 2014-2024 - thelemon, kazarmy, pancake */

// R2R db/cmd/cmd_iz

#include <r_types.h>
#include <r_util.h>
Expand Down Expand Up @@ -495,17 +497,20 @@ R_API int r_utf8_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
*ch = (ut32)ptr[0];
}
return 1;
} else if (ptrlen>1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) {
}
if (ptrlen > 1 && (ptr[0]&0xe0) == 0xc0 && (ptr[1]&0xc0) == 0x80) {
if (ch) {
*ch = (ptr[0] & 0x1f) << 6 | (ptr[1] & 0x3f);
}
return 2;
} else if (ptrlen>2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) {
}
if (ptrlen > 2 && (ptr[0]&0xf0) == 0xe0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80) {
if (ch) {
*ch = (ptr[0] & 0xf) << 12 | (ptr[1] & 0x3f) << 6 | (ptr[2] & 0x3f);
}
return 3;
} else if (ptrlen>3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) {
}
if (ptrlen > 3 && (ptr[0]&0xf8) == 0xf0 && (ptr[1]&0xc0) == 0x80 && (ptr[2]&0xc0) == 0x80 && (ptr[3]&0xc0) == 0x80) {
if (ch) {
*ch = (ptr[0] & 7) << 18 | (ptr[1] & 0x3f) << 12 | (ptr[2] & 0x3f) << 6 | (ptr[3] & 0x3f);
}
Expand All @@ -519,16 +524,19 @@ R_API int r_utf8_encode(ut8 *ptr, const RRune ch) {
if (ch < 0x80) {
ptr[0] = (ut8)ch;
return 1;
} else if (ch < 0x800) {
}
if (ch < 0x800) {
ptr[0] = 0xc0 | (ch >> 6);
ptr[1] = 0x80 | (ch & 0x3f);
return 2;
} else if (ch < 0x10000) {
}
if (ch < 0x10000) {
ptr[0] = 0xe0 | (ch >> 12);
ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
ptr[2] = 0x80 | (ch & 0x3f);
return 3;
} else if (ch < 0x200000) {
}
if (ch < 0x200000) {
ptr[0] = 0xf0 | (ch >> 18);
ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
Expand Down Expand Up @@ -726,13 +734,11 @@ R_API char *r_acp_to_utf8_l(const char *str, int len) {

R_API int r_utf_block_idx(RRune ch) {
const int last = R_UTF_BLOCKS_COUNT;
int low, hi, mid;

low = 0;
hi = last - 1;
int low = 0;
int hi = last - 1;

do {
mid = (low + hi) >> 1;
int mid = (low + hi) >> 1;
if (ch >= r_utf_blocks[mid].from && ch <= r_utf_blocks[mid].to) {
return mid;
}
Expand All @@ -747,11 +753,57 @@ R_API int r_utf_block_idx(RRune ch) {
return R_UTF_BLOCKS_COUNT - 1; /* index for "No_Block" */
}

#if R2_USE_NEW_ABI
/**
 * Collect the distinct Unicode blocks used by a UTF-8 buffer, writing the
 * results into caller-provided arrays instead of allocating.
 *
 * str       - UTF-8 encoded bytes to scan (not required to be NUL terminated)
 * len       - number of bytes of str to scan
 * list      - output: block indexes in order of first appearance, followed
 *             by a -1 terminator. Must have room for (len + 1) ints, since
 *             every decoded rune consumes at least one byte.
 * freq_list - optional output (may be NULL): freq_list[i] is the number of
 *             runes found in block list[i], followed by a -1 terminator.
 *             Must have room for (len + 1) ints when provided.
 *
 * Returns the number of distinct blocks stored in list (terminator not
 * counted). Returns 0 without touching the output arrays when str or list
 * is NULL or len is lower than 1.
 */
R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list) {
	if (!str || !list || len < 1) {
		return 0;
	}
	// per-block rune counters; a zero entry also means "block not listed yet"
	int block_freq[R_UTF_BLOCKS_COUNT] = {0};
	int num_blocks = 0;
	const ut8 *str_ptr = str;
	const ut8 *str_end = str + len;
	while (str_ptr < str_end) {
		RRune ch;
		const int runesize = r_utf8_decode (str_ptr, str_end - str_ptr, &ch);
		if (runesize < 1) {
			// not a decodable sequence: skip one byte and try to resync
			str_ptr++;
			continue;
		}
		str_ptr += runesize;
		const int block_idx = r_utf_block_idx (ch);
		// validate before indexing block_freq, never after
		if (block_idx < 0 || block_idx >= R_UTF_BLOCKS_COUNT) {
			continue;
		}
		if (!block_freq[block_idx]) {
			list[num_blocks++] = block_idx;
		}
		block_freq[block_idx]++;
	}
	list[num_blocks] = -1;
	if (freq_list) {
		int i;
		for (i = 0; i < num_blocks; i++) {
			freq_list[i] = block_freq[list[i]];
		}
		freq_list[num_blocks] = -1;
	}
	return num_blocks;
}
#endif

/* str must be UTF8-encoded */
// R2_600 DEPRECATE THIS
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) {
if (!str) {
return NULL;
}
#if 1
if (len < 0) {
len = strlen ((const char *)str);
}
Expand Down Expand Up @@ -797,12 +849,36 @@ R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) {
}
*freq_list_ptr = -1;
}
for (list_ptr = list; *list_ptr != -1; list_ptr++) {
block_freq[*list_ptr] = 0;
#else
int *freq_list_ptr = NULL;
int *list = R_NEWS (int, len + 1);
if (freq_list) {
*freq_list = R_NEWS (int, len + 1);
if (!*freq_list) {
free (list);
return NULL;
}
freq_list_ptr = *freq_list;
}
int count = r_utf_block_list2 (str, len, list, freq_list_ptr);
if (count > 0) {
if (freq_list) {
freq_list[count] = -1;
}
}
#if 1
if (count < 1) {
free (list);
free (freq_list_ptr);
return NULL;
}
#endif

#endif
return list;
}


R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen) {
if (ptrlen > 3) {
if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
Expand Down
Loading

0 comments on commit c326599

Please sign in to comment.