Skip to content

Commit

Permalink
Fix the last code point of each encoding width (e.g. U+007F) being encoded with an extra byte
Browse files Browse the repository at this point in the history

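The maximum code point of each encoding width was compared exclusively (`<`) instead of inclusively (`<=`), so the last code point of a range (e.g. U+007F) fell through to the next, wider encoding.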
  • Loading branch information
penguin-teal committed Dec 21, 2023
1 parent d852701 commit 34b1f2a
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,15 @@ size_t utf8Size(uint8_t binary)
uint32_t utf8Encode(uint64_t codePoint, size_t *size)
{
// Is the code point ASCII?
if(codePoint < 0x007Fu)
if(codePoint <= 0x007Fu)
{
// ASCII characters are 1 byte
if(size) *size = 1;
// Then just return the code point (the first bit will already be 0).
return codePoint;
}
// Between U+0080 and U+07FF?
else if(codePoint < 0x07FFu)
else if(codePoint <= 0x07FFu)
{
// This range is encoded in 2 bytes
if(size) *size = 2;
Expand All @@ -77,7 +77,7 @@ uint32_t utf8Encode(uint64_t codePoint, size_t *size)
return (uint32_t)byte1 << 8 | (uint32_t)byte2;
}
// Between U+0800 and U+FFFF?
else if(codePoint < 0xFFFFu)
else if(codePoint <= 0xFFFFu)
{
// This range is encoded in 3 bytes
if(size) *size = 3;
Expand All @@ -91,7 +91,7 @@ uint32_t utf8Encode(uint64_t codePoint, size_t *size)
return (uint32_t)byte1 << 16 | (uint32_t)byte2 << 8 | byte3;
}
// Between U+10000 and U+10FFFF?
else if(codePoint < 0x10FFFFu)
else if(codePoint <= 0x10FFFFu)
{
// This range is encoded in 4 bytes
if(size) *size = 4;
Expand Down

0 comments on commit 34b1f2a

Please sign in to comment.