Blender V4.5
string_utf8.cc File Reference
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <wcwidth.h>
#include "BLI_utildefines.h"
#include "BLI_string.h"
#include "BLI_string_utf8.h"
#include "BLI_strict_flags.h"

Go to the source code of this file.

Macros

#define UTF8_VARS_FROM_CHAR32(Char, First, Len)

Functions

ptrdiff_t BLI_str_utf8_invalid_byte (const char *str, size_t str_len)
int BLI_str_utf8_invalid_strip (char *str, size_t str_len)
int BLI_str_utf8_invalid_substitute (char *str, size_t str_len, const char substitute)
const char * BLI_str_utf8_invalid_substitute_as_needed (const char *str, const size_t str_len, const char substitute, char *buf, const size_t buf_maxncpy)
BLI_INLINE char * str_utf8_copy_max_bytes_impl (char *dst, const char *src, size_t dst_maxncpy)
char * BLI_strncpy_utf8 (char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_strncpy_utf8_rlen (char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_strncpy_utf8_rlen_unterminated (char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_strncpy_wchar_as_utf8 (char *__restrict dst, const wchar_t *__restrict src, const size_t dst_maxncpy)
size_t BLI_wstrlen_utf8 (const wchar_t *src)
size_t BLI_strlen_utf8_ex (const char *strc, size_t *r_len_bytes)
size_t BLI_strlen_utf8 (const char *strc)
size_t BLI_strnlen_utf8_ex (const char *strc, const size_t strc_maxlen, size_t *r_len_bytes)
size_t BLI_strnlen_utf8 (const char *strc, const size_t strc_maxlen)
size_t BLI_strncpy_wchar_from_utf8 (wchar_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
int BLI_wcwidth_or_error (char32_t ucs)
int BLI_wcwidth_safe (char32_t ucs)
int BLI_wcswidth_or_error (const char32_t *pwcs, size_t n)
int BLI_str_utf8_char_width_or_error (const char *p)
int BLI_str_utf8_char_width_safe (const char *p)
int BLI_str_utf8_size_or_error (const char *p)
int BLI_str_utf8_size_safe (const char *p)
uint BLI_str_utf8_as_unicode_or_error (const char *p)
uint BLI_str_utf8_as_unicode_safe (const char *p)
uint BLI_str_utf8_as_unicode_step_or_error (const char *__restrict p, const size_t p_len, size_t *__restrict index)
uint BLI_str_utf8_as_unicode_step_safe (const char *__restrict p, const size_t p_len, size_t *__restrict index)
size_t BLI_str_utf8_from_unicode_len (const uint c)
size_t BLI_str_utf8_from_unicode (uint c, char *dst, const size_t dst_maxncpy)
size_t BLI_str_utf8_as_utf32 (char32_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
size_t BLI_str_utf32_as_utf8 (char *__restrict dst, const char32_t *__restrict src, const size_t dst_maxncpy)
size_t BLI_str_utf32_as_utf8_len_ex (const char32_t *src, const size_t src_maxlen)
size_t BLI_str_utf32_as_utf8_len (const char32_t *src)
const char * BLI_str_find_prev_char_utf8 (const char *p, const char *str_start)
const char * BLI_str_find_next_char_utf8 (const char *p, const char *str_end)
size_t BLI_str_partition_utf8 (const char *str, const uint delim[], const char **r_sep, const char **r_suf)
size_t BLI_str_rpartition_utf8 (const char *str, const uint delim[], const char **r_sep, const char **r_suf)
size_t BLI_str_partition_ex_utf8 (const char *str, const char *end, const uint delim[], const char **r_sep, const char **r_suf, const bool from_right)
bool BLI_str_utf8_truncate_at_size (char *str, const size_t str_size)
UTF8 Character Decoding (Skip & Mask Lookup)

Derived from GLIB gutf8.c.

Ranges (zero based, inclusive):

  • 000..127: 1 byte.
  • 128..191: invalid.
  • 192..223: 2 bytes.
  • 224..239: 3 bytes.
  • 240..247: 4 bytes.
  • 248..251: 5 bytes.
  • 252..253: 6 bytes.
  • 254..255: invalid.

Invalid values fall back to 1 byte or -1 (for an error value).

Note
From testing string copying via BLI_strncpy_utf8 with large (multi-megabyte) strings, using a function instead of a lookup-table is between 2 & 3 times faster.
BLI_INLINE int utf8_char_compute_skip (const char c)
BLI_INLINE int utf8_char_compute_skip_or_error (const char c)
BLI_INLINE int utf8_char_compute_skip_or_error_with_mask (const char c, char *r_mask)
BLI_INLINE uint utf8_char_decode (const char *p, const char mask, const int len, const uint err)
UTF32 Case Conversion
Warning
The lower/uppercase form of some characters uses multiple characters. These cases are not accounted for by this conversion function. A common example is the German eszett (ß, "scharfes S"). Supporting such cases would require operating on a character array, with support for resizing (for reference, Python's upper/lower functions support this).
char32_t BLI_str_utf32_char_to_upper (const char32_t wc)
char32_t BLI_str_utf32_char_to_lower (const char32_t wc)
UTF32 Text Boundary Analysis

Helper functions for locating linguistic boundaries, such as word, sentence, and paragraph boundaries.

bool BLI_str_utf32_char_is_breaking_space (char32_t codepoint)
bool BLI_str_utf32_char_is_optional_break_after (char32_t codepoint, char32_t codepoint_prev)
bool BLI_str_utf32_char_is_optional_break_before (char32_t codepoint, char32_t codepoint_prev)
Offset Conversion in Strings
Note
Regarding the assertion BLI_assert(offset <= offset_target): if it fails, offset_target is likely in the middle of a UTF8 byte-sequence. Most likely the offset passed in is incorrect, although it may be impractical to avoid this happening in the case of invalid UTF8 byte sequences. If the assert is impractical to avoid, it could be demoted to a warning.
int BLI_str_utf8_offset_to_index (const char *str, const size_t str_len, const int offset_target)
int BLI_str_utf8_offset_from_index (const char *str, const size_t str_len, const int index_target)
int BLI_str_utf8_offset_to_column (const char *str, const size_t str_len, const int offset_target)
int BLI_str_utf8_offset_from_column (const char *str, const size_t str_len, const int column_target)
int BLI_str_utf8_offset_to_column_with_tabs (const char *str, const size_t str_len, const int offset_target, const int tab_width)
int BLI_str_utf8_offset_from_column_with_tabs (const char *str, const size_t str_len, const int column_target, const int tab_width)

Macro Definition Documentation

◆ UTF8_VARS_FROM_CHAR32

#define UTF8_VARS_FROM_CHAR32 ( Char,
First,
Len )
Value:
if (Char < 0x80) { \
First = 0; \
Len = 1; \
} \
else if (Char < 0x800) { \
First = 0xc0; \
Len = 2; \
} \
else if (Char < 0x10000) { \
First = 0xe0; \
Len = 3; \
} \
else if (Char < 0x200000) { \
First = 0xf0; \
Len = 4; \
} \
else if (Char < 0x4000000) { \
First = 0xf8; \
Len = 5; \
} \
else { \
First = 0xfc; \
Len = 6; \
} \
(void)0

Definition at line 1028 of file string_utf8.cc.

Referenced by BLI_str_utf8_from_unicode(), and BLI_str_utf8_from_unicode_len().

Function Documentation

◆ BLI_str_find_next_char_utf8()

const char * BLI_str_find_next_char_utf8 ( const char * p,
const char * str_end )

Definition at line 1182 of file string_utf8.cc.

References BLI_assert.

Referenced by BLI_str_utf8_as_utf32().

◆ BLI_str_find_prev_char_utf8()

const char * BLI_str_find_prev_char_utf8 ( const char * p,
const char * str_start )

Definition at line 1167 of file string_utf8.cc.

References BLI_assert.

Referenced by BLI_str_partition_ex_utf8().

◆ BLI_str_partition_ex_utf8()

size_t BLI_str_partition_ex_utf8 ( const char * str,
const char * end,
const uint delim[],
const char ** r_sep,
const char ** r_suf,
const bool from_right )

◆ BLI_str_partition_utf8()

size_t BLI_str_partition_utf8 ( const char * str,
const uint delim[],
const char ** r_sep,
const char ** r_suf )

Definition at line 1195 of file string_utf8.cc.

References BLI_str_partition_ex_utf8(), and str.

◆ BLI_str_rpartition_utf8()

size_t BLI_str_rpartition_utf8 ( const char * str,
const uint delim[],
const char ** r_sep,
const char ** r_suf )

Definition at line 1203 of file string_utf8.cc.

References BLI_str_partition_ex_utf8(), and str.

◆ BLI_str_utf32_as_utf8()

size_t BLI_str_utf32_as_utf8 ( char *__restrict dst,
const char32_t *__restrict src,
const size_t dst_maxncpy )

◆ BLI_str_utf32_as_utf8_len()

size_t BLI_str_utf32_as_utf8_len ( const char32_t * src)
Returns
The UTF32 len in UTF8.

Definition at line 1156 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

Referenced by BKE_vfont_clipboard_set(), and ED_curve_editfont_load().

◆ BLI_str_utf32_as_utf8_len_ex()

size_t BLI_str_utf32_as_utf8_len_ex ( const char32_t * src,
size_t src_maxlen )
Returns
The UTF32 len in UTF8 with a clamped length.

Definition at line 1144 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

Referenced by font_select_to_buffer().

◆ BLI_str_utf32_char_is_breaking_space()

bool BLI_str_utf32_char_is_breaking_space ( char32_t codepoint)

◆ BLI_str_utf32_char_is_optional_break_after()

bool BLI_str_utf32_char_is_optional_break_after ( char32_t codepoint,
char32_t codepoint_prev )

◆ BLI_str_utf32_char_is_optional_break_before()

bool BLI_str_utf32_char_is_optional_break_before ( char32_t codepoint,
char32_t codepoint_prev )

◆ BLI_str_utf32_char_to_lower()

char32_t BLI_str_utf32_char_to_lower ( char32_t wc)

Return the lowercase of a 32-bit character or the character when no case change is needed.

Note
A 1:1 mapping doesn't account for multiple characters as part of conversion in some cases.

Definition at line 713 of file string_utf8.cc.

References ARRAY_SIZE, max, min, and U.

Referenced by BLI_strncpy_wchar_from_utf8(), and set_case().

◆ BLI_str_utf32_char_to_upper()

char32_t BLI_str_utf32_char_to_upper ( char32_t wc)

Return the uppercase of a 32-bit character or the character when no case change is needed.

Note
A 1:1 mapping doesn't account for multiple characters as part of conversion in some cases.

Definition at line 602 of file string_utf8.cc.

References ARRAY_SIZE, max, min, and U.

Referenced by BLI_strncpy_wchar_from_utf8(), and set_case().

◆ BLI_str_utf8_as_unicode_or_error()

uint BLI_str_utf8_as_unicode_or_error ( const char * p)
Parameters
p: a pointer to a Unicode character encoded as UTF8.

Converts a sequence of bytes encoded as UTF8 to a Unicode character. If p does not point to a valid UTF8 encoded character, results are undefined. If you are not sure that the bytes are complete valid Unicode characters, you should use g_utf8_get_char_validated() instead.

Return value: the resulting character

Definition at line 966 of file string_utf8.cc.

References BLI_UTF8_ERR, len, mask(), UNLIKELY, utf8_char_compute_skip_or_error_with_mask(), and utf8_char_decode().

Referenced by BLI_str_partition_ex_utf8(), BLI_str_utf8_as_unicode_safe(), BLI_str_utf8_char_width_or_error(), BLI_str_utf8_char_width_safe(), insert_text_invoke(), key_event_glyph_or_text(), text_autocomplete_build(), and text_insert_invoke().

◆ BLI_str_utf8_as_unicode_safe()

◆ BLI_str_utf8_as_unicode_step_or_error()

uint BLI_str_utf8_as_unicode_step_or_error ( const char *__restrict p,
const size_t p_len,
size_t *__restrict index )

◆ BLI_str_utf8_as_unicode_step_safe()

◆ BLI_str_utf8_as_utf32()

size_t BLI_str_utf8_as_utf32 ( char32_t *__restrict dst_w,
const char *__restrict src_c,
const size_t dst_w_maxncpy )

◆ BLI_str_utf8_char_width_or_error()

int BLI_str_utf8_char_width_or_error ( const char * p)

◆ BLI_str_utf8_char_width_safe()

◆ BLI_str_utf8_from_unicode()

size_t BLI_str_utf8_from_unicode ( unsigned int c,
char * dst,
size_t dst_maxncpy )

BLI_str_utf8_from_unicode:

Parameters
c: a Unicode character code.
dst: output buffer, must have at least dst_maxncpy bytes of space. If the length required by c exceeds dst_maxncpy, the available bytes will be zeroed and dst_maxncpy returned.

Converts a single character to UTF8.

Returns
number of bytes written.

Definition at line 1067 of file string_utf8.cc.

References BLI_string_debug_size, i, len, UNLIKELY, and UTF8_VARS_FROM_CHAR32.

Referenced by BLI_str_utf32_as_utf8(), BLI_strncpy_wchar_as_utf8(), find_family_object(), blender::io::usd::make_safe_name(), txt_add_char_intern(), txt_extended_ascii_as_utf8(), txt_replace_char(), and wm_event_add_ghostevent().

◆ BLI_str_utf8_from_unicode_len()

size_t BLI_str_utf8_from_unicode_len ( const uint c)

◆ BLI_str_utf8_invalid_byte()

ptrdiff_t BLI_str_utf8_invalid_byte ( const char * str,
size_t str_len )

◆ BLI_str_utf8_invalid_strip()

int BLI_str_utf8_invalid_strip ( char * str,
size_t str_len )

Remove any invalid UTF8 byte (taking into account multi-bytes sequences).

Parameters
str: a null terminated string.
str_len: the result of strlen(str).
Returns
number of stripped bytes.

Definition at line 285 of file string_utf8.cc.

References BLI_assert, BLI_str_utf8_invalid_byte(), and str.

Referenced by BKE_id_new_name_validate(), BKE_vfontdata_from_freetypefont(), blo_do_versions_450(), blender::seq::edit_strip_name_set(), id_name_final_build(), outputNumInput(), TEST(), ui_textedit_end(), and wm_clipboard_text_get_ex().

◆ BLI_str_utf8_invalid_substitute()

int BLI_str_utf8_invalid_substitute ( char * str,
size_t str_len,
const char substitute )

Substitute any invalid UTF8 byte with substitute (taking into account multi-bytes sequences). The length of the string remains unchanged.

Parameters
str: a null terminated string.
str_len: the result of strlen(str).
Returns
number of bytes replaced.

Definition at line 310 of file string_utf8.cc.

References BLI_assert, BLI_str_utf8_invalid_byte(), and str.

Referenced by BLI_str_utf8_invalid_substitute_as_needed(), and TEST().

◆ BLI_str_utf8_invalid_substitute_as_needed()

const char * BLI_str_utf8_invalid_substitute_as_needed ( const char * str,
const size_t str_len,
const char substitute,
char * buf,
const size_t buf_maxncpy )

◆ BLI_str_utf8_offset_from_column()

int BLI_str_utf8_offset_from_column ( const char * str,
const size_t str_len,
const int column_target )

Definition at line 1324 of file string_utf8.cc.

References BLI_str_utf8_as_unicode_step_safe(), BLI_wcwidth_safe(), and str.

Referenced by BLI_str_partition_ex_utf8().

◆ BLI_str_utf8_offset_from_column_with_tabs()

int BLI_str_utf8_offset_from_column_with_tabs ( const char * str,
const size_t str_len,
const int column_target,
const int tab_width )

◆ BLI_str_utf8_offset_from_index()

int BLI_str_utf8_offset_from_index ( const char * str,
size_t str_len,
int index_target )

Return the byte offset in str from index_target.

Parameters
index_target: The unicode index, where multi-byte characters are counted once. There is no need to clamp this value; the index is logically clamped to BLI_strlen_utf8(str) or below.

Definition at line 1296 of file string_utf8.cc.

References BLI_assert, BLI_str_utf8_as_unicode_step_safe(), str, and UNUSED_VARS.

Referenced by BLI_str_partition_ex_utf8(), ED_space_text_region_location_from_cursor(), blender::nodes::node_geo_string_to_curves_cc::get_text_layout(), blender::nodes::node_fn_slice_string_cc::node_build_multi_function(), TEST(), and txt_sel_set().

◆ BLI_str_utf8_offset_to_column()

int BLI_str_utf8_offset_to_column ( const char * str,
const size_t str_len,
const int offset_target )

◆ BLI_str_utf8_offset_to_column_with_tabs()

int BLI_str_utf8_offset_to_column_with_tabs ( const char * str,
const size_t str_len,
const int offset_target,
const int tab_width )

◆ BLI_str_utf8_offset_to_index()

int BLI_str_utf8_offset_to_index ( const char * str,
const size_t str_len,
const int offset_target )

◆ BLI_str_utf8_size_or_error()

int BLI_str_utf8_size_or_error ( const char * p)
Returns
The size (in bytes) of a single UTF8 char.
Warning
Can return -1 on bad chars.

Definition at line 956 of file string_utf8.cc.

References utf8_char_compute_skip_or_error().

Referenced by handleNumInput(), ui_do_but_textedit(), ui_handle_menu_letter_press_search(), wm_event_add_ghostevent(), WM_event_print(), and WM_event_utf8_to_ascii().

◆ BLI_str_utf8_size_safe()

◆ BLI_str_utf8_truncate_at_size()

bool BLI_str_utf8_truncate_at_size ( char * str,
const size_t str_size )

Ensure that str has a null byte in the range [0..str_size) (str_size itself excluded), while not generating any invalid UTF-8 code. The resulting strlen(str) is guaranteed to be less than str_size.

Returns
true when str was truncated.

Definition at line 1256 of file string_utf8.cc.

References BLI_assert, BLI_strnlen_utf8_ex(), and str.

Referenced by BLI_str_partition_ex_utf8(), bone_autoside_name(), long_id_names_process_action_slots_identifiers(), and read_id_struct().

◆ BLI_strlen_utf8()

◆ BLI_strlen_utf8_ex()

size_t BLI_strlen_utf8_ex ( const char * strc,
size_t * r_len_bytes )

Definition at line 467 of file string_utf8.cc.

References BLI_str_utf8_size_safe(), i, len, step, and UNLIKELY.

Referenced by BLI_strlen_utf8().

◆ BLI_strncpy_utf8()

char * BLI_strncpy_utf8 ( char *__restrict dst,
const char *__restrict src,
size_t dst_maxncpy )

◆ BLI_strncpy_utf8_rlen()

size_t BLI_strncpy_utf8_rlen ( char *__restrict dst,
const char *__restrict src,
size_t dst_maxncpy )

Definition at line 410 of file string_utf8.cc.

References BLI_assert, BLI_string_debug_size, and str_utf8_copy_max_bytes_impl().

◆ BLI_strncpy_utf8_rlen_unterminated()

size_t BLI_strncpy_utf8_rlen_unterminated ( char *__restrict dst,
const char *__restrict src,
size_t dst_maxncpy )

A version of BLI_strncpy_utf8_rlen that doesn't null terminate the string.

Note
Useful for C++ APIs that don't null terminate strings.

Definition at line 422 of file string_utf8.cc.

References BLI_string_debug_size, and str_utf8_copy_max_bytes_impl().

Referenced by blender::StringRefBase::copy_utf8_truncated().

◆ BLI_strncpy_wchar_as_utf8()

size_t BLI_strncpy_wchar_as_utf8 ( char *__restrict dst,
const wchar_t *__restrict src,
const size_t dst_maxncpy )

◆ BLI_strncpy_wchar_from_utf8()

size_t BLI_strncpy_wchar_from_utf8 ( wchar_t *__restrict dst_w,
const char *__restrict src_c,
const size_t dst_w_maxncpy )

◆ BLI_strnlen_utf8()

size_t BLI_strnlen_utf8 ( const char * strc,
size_t strc_maxlen )
Parameters
strc: the string to measure the length of.
strc_maxlen: the string length (in bytes).
Returns
the unicode length (not in bytes!)

Definition at line 526 of file string_utf8.cc.

References BLI_strnlen_utf8_ex().

Referenced by BLI_strnlen_utf8_ex(), blender::string_search::count_utf8_code_points(), space_text_draw_wrapped(), TEST(), and ui_text_position_to_hidden().

◆ BLI_strnlen_utf8_ex()

size_t BLI_strnlen_utf8_ex ( const char * strc,
const size_t strc_maxlen,
size_t * r_len_bytes )

Definition at line 498 of file string_utf8.cc.

References BLI_str_utf8_size_safe(), i, len, step, and UNLIKELY.

Referenced by BLI_str_utf8_truncate_at_size(), and BLI_strnlen_utf8().

◆ BLI_wcswidth_or_error()

int BLI_wcswidth_or_error ( const char32_t * pwcs,
size_t n )

Definition at line 567 of file string_utf8.cc.

Referenced by BLI_strncpy_wchar_from_utf8().

◆ BLI_wcwidth_or_error()

int BLI_wcwidth_or_error ( char32_t ucs)

Count columns that character/string occupies (based on wcwidth.c).

Definition at line 549 of file string_utf8.cc.

Referenced by blf_glyph_render(), BLI_str_cursor_step_next_utf32(), BLI_str_cursor_step_prev_utf32(), BLI_str_utf8_char_width_or_error(), BLI_strncpy_wchar_from_utf8(), and BLI_wcwidth_safe().

◆ BLI_wcwidth_safe()

◆ BLI_wstrlen_utf8()

size_t BLI_wstrlen_utf8 ( const wchar_t * src)
Returns
the wchar_t length in UTF8.

Definition at line 456 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

◆ str_utf8_copy_max_bytes_impl()

BLI_INLINE char * str_utf8_copy_max_bytes_impl ( char * dst,
const char * src,
size_t dst_maxncpy )

Internal utility for implementing BLI_strncpy_utf8 / BLI_strncpy_utf8_rlen.

Compatible with BLI_strncpy, but ensure no partial UTF8 chars.

Parameters
dst_maxncpy: The maximum number of bytes to copy. This does not include the null terminator.
Note
Currently we don't attempt to deal with invalid UTF8 chars; see BLI_str_utf8_invalid_strip if that is needed.
The caller is responsible for null terminating the string.

Definition at line 376 of file string_utf8.cc.

References ATTR_FALLTHROUGH, BLI_INLINE, UNLIKELY, and utf8_char_compute_skip().

Referenced by BLI_strncpy_utf8(), BLI_strncpy_utf8_rlen(), and BLI_strncpy_utf8_rlen_unterminated().

◆ utf8_char_compute_skip()

BLI_INLINE int utf8_char_compute_skip ( const char c)

◆ utf8_char_compute_skip_or_error()

BLI_INLINE int utf8_char_compute_skip_or_error ( const char c)

Definition at line 78 of file string_utf8.cc.

References BLI_INLINE.

Referenced by BLI_str_utf8_size_or_error().

◆ utf8_char_compute_skip_or_error_with_mask()

BLI_INLINE int utf8_char_compute_skip_or_error_with_mask ( const char c,
char * r_mask )

◆ utf8_char_decode()

BLI_INLINE uint utf8_char_decode ( const char * p,
const char mask,
const int len,
const uint err )

Decode a UTF8 code-point, use in combination with utf8_char_compute_skip_or_error_with_mask.

Definition at line 134 of file string_utf8.cc.

References BLI_INLINE, count, len, mask(), and result.

Referenced by BLI_str_utf8_as_unicode_or_error(), and BLI_str_utf8_as_unicode_step_or_error().