Blender  V2.93
string_search.cc
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or
3  * modify it under the terms of the GNU General Public License
4  * as published by the Free Software Foundation; either version 2
5  * of the License, or (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software Foundation,
14  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15  */
16 
17 #include "BLI_array.hh"
18 #include "BLI_linear_allocator.hh"
19 #include "BLI_multi_value_map.hh"
20 #include "BLI_span.hh"
21 #include "BLI_string.h"
22 #include "BLI_string_ref.hh"
23 #include "BLI_string_search.h"
24 #include "BLI_string_utf8.h"
25 #include "BLI_timeit.hh"
26 
28 
30 {
31  return static_cast<int64_t>(BLI_strnlen_utf8(str.data(), static_cast<size_t>(str.size())));
32 }
33 
43 {
44  constexpr int deletion_cost = 1;
45  constexpr int insertion_cost = 1;
46  constexpr int substitution_cost = 1;
47  constexpr int transposition_cost = 1;
48 
49  const int size_a = count_utf8_code_points(a);
50  const int size_b = count_utf8_code_points(b);
51 
52  /* Instead of keeping the entire table in memory, only keep three rows. The algorithm only
53  * accesses these rows and nothing older.
54  * All three rows are usually allocated on the stack. At most a single heap allocation is done,
55  * if the reserved stack space is too small. */
56  const int row_length = size_b + 1;
57  Array<int, 64> rows(row_length * 3);
58 
59  /* Store rows as spans so that it is cheap to swap them. */
60  MutableSpan v0{rows.data() + row_length * 0, row_length};
61  MutableSpan v1{rows.data() + row_length * 1, row_length};
62  MutableSpan v2{rows.data() + row_length * 2, row_length};
63 
64  /* Only v1 needs to be initialized. */
65  for (const int i : IndexRange(row_length)) {
66  v1[i] = i * insertion_cost;
67  }
68 
69  uint32_t prev_unicode_a;
70  size_t offset_a = 0;
71  for (const int i : IndexRange(size_a)) {
72  v2[0] = (i + 1) * deletion_cost;
73 
74  const uint32_t unicode_a = BLI_str_utf8_as_unicode_and_size(a.data() + offset_a, &offset_a);
75 
76  uint32_t prev_unicode_b;
77  size_t offset_b = 0;
78  for (const int j : IndexRange(size_b)) {
79  const uint32_t unicode_b = BLI_str_utf8_as_unicode_and_size(b.data() + offset_b, &offset_b);
80 
81  /* Check how costly the different operations would be and pick the cheapest - the one with
82  * minimal cost. */
83  int new_cost = std::min({v1[j + 1] + deletion_cost,
84  v2[j] + insertion_cost,
85  v1[j] + (unicode_a != unicode_b) * substitution_cost});
86  if (i > 0 && j > 0) {
87  if (unicode_a == prev_unicode_b && prev_unicode_a == unicode_b) {
88  new_cost = std::min(new_cost, v0[j - 1] + transposition_cost);
89  }
90  }
91 
92  v2[j + 1] = new_cost;
93  prev_unicode_b = unicode_b;
94  }
95 
96  /* Swap the three rows, so that the next row can be computed. */
97  std::tie(v0, v1, v2) = std::tuple<MutableSpan<int>, MutableSpan<int>, MutableSpan<int>>(
98  v1, v2, v0);
99  prev_unicode_a = unicode_a;
100  }
101 
102  return v1.last();
103 }
104 
110 {
111  /* If it is a perfect partial match, return immediately. */
112  if (full.find(query) != StringRef::not_found) {
113  return 0;
114  }
115 
116  const int query_size = count_utf8_code_points(query);
117  const int full_size = count_utf8_code_points(full);
118 
119  /* If there is only a single character which is not in the full string, this is not a match. */
120  if (query_size == 1) {
121  return -1;
122  }
123  BLI_assert(query.size() >= 2);
124 
125  /* Allow more errors when the size grows larger. */
126  const int max_errors = query_size <= 1 ? 0 : query_size / 8 + 1;
127 
128  /* If the query is too large, this cannot be a match. */
129  if (query_size - full_size > max_errors) {
130  return -1;
131  }
132 
133  const uint32_t query_first_unicode = BLI_str_utf8_as_unicode(query.data());
134  const uint32_t query_second_unicode = BLI_str_utf8_as_unicode(query.data() +
135  BLI_str_utf8_size(query.data()));
136 
137  const char *full_begin = full.begin();
138  const char *full_end = full.end();
139 
140  const char *window_begin = full_begin;
141  const char *window_end = window_begin;
142  const int window_size = std::min(query_size + max_errors, full_size);
143  const int extra_chars = window_size - query_size;
144  const int max_acceptable_distance = max_errors + extra_chars;
145 
146  for (int i = 0; i < window_size; i++) {
147  window_end += BLI_str_utf8_size(window_end);
148  }
149 
150  while (true) {
151  StringRef window{window_begin, window_end};
152  const uint32_t window_begin_unicode = BLI_str_utf8_as_unicode(window_begin);
153  int distance = 0;
154  /* Expect that the first or second character of the query is correct. This helps to avoid
155  * computing the more expensive distance function. */
156  if (ELEM(window_begin_unicode, query_first_unicode, query_second_unicode)) {
158  if (distance <= max_acceptable_distance) {
159  return distance;
160  }
161  }
162  if (window_end == full_end) {
163  return -1;
164  }
165 
166  /* When the distance is way too large, we can skip a couple of code points, because the
167  * distance can't possibly become as short as required. */
168  const int window_offset = std::max(1, distance / 2);
169  for (int i = 0; i < window_offset && window_end < full_end; i++) {
170  window_begin += BLI_str_utf8_size(window_begin);
171  window_end += BLI_str_utf8_size(window_end);
172  }
173  }
174 }
175 
187  Span<StringRef> words,
188  Span<bool> word_is_usable,
189  MutableSpan<bool> r_word_is_matched,
190  int start = 0)
191 {
192  if (start >= words.size()) {
193  return false;
194  }
195 
196  r_word_is_matched.fill(false);
197 
198  size_t query_index = 0;
199  int word_index = start;
200  size_t char_index = 0;
201 
202  int first_found_word_index = -1;
203 
204  while (query_index < query.size()) {
205  const uint query_unicode = BLI_str_utf8_as_unicode_and_size(query.data() + query_index,
206  &query_index);
207  while (true) {
208  /* We are at the end of words, no complete match has been found yet. */
209  if (word_index >= words.size()) {
210  if (first_found_word_index >= 0) {
211  /* Try starting to match at another word. In some cases one can still find matches this
212  * way. */
213  return match_word_initials(
214  query, words, word_is_usable, r_word_is_matched, first_found_word_index + 1);
215  }
216  return false;
217  }
218 
219  /* Skip words that the caller does not want us to use. */
220  if (!word_is_usable[word_index]) {
221  word_index++;
222  BLI_assert(char_index == 0);
223  continue;
224  }
225 
226  StringRef word = words[word_index];
227  /* Try to match the current character with the current word. */
228  if (static_cast<int>(char_index) < word.size()) {
229  const uint32_t char_unicode = BLI_str_utf8_as_unicode_and_size(word.data() + char_index,
230  &char_index);
231  if (query_unicode == char_unicode) {
232  r_word_is_matched[word_index] = true;
233  if (first_found_word_index == -1) {
234  first_found_word_index = word_index;
235  }
236  break;
237  }
238  }
239 
240  /* Could not find a match in the current word, go to the beginning of the next word. */
241  word_index += 1;
242  char_index = 0;
243  }
244  }
245  return true;
246 }
247 
249  Span<StringRef> words,
250  Span<bool> word_is_usable)
251 {
252  int best_word_size = INT32_MAX;
253  int best_word_index = -1;
254  for (const int i : words.index_range()) {
255  if (!word_is_usable[i]) {
256  continue;
257  }
258  StringRef word = words[i];
259  if (word.startswith(query)) {
260  if (word.size() < best_word_size) {
261  best_word_index = i;
262  best_word_size = word.size();
263  }
264  }
265  }
266  return best_word_index;
267 }
268 
270  Span<StringRef> words,
271  Span<bool> word_is_usable,
272  int *r_error_count)
273 {
274  for (const int i : words.index_range()) {
275  if (!word_is_usable[i]) {
276  continue;
277  }
278  StringRef word = words[i];
279  const int error_count = get_fuzzy_match_errors(query, word);
280  if (error_count >= 0) {
281  *r_error_count = error_count;
282  return i;
283  }
284  }
285  return -1;
286 }
287 
292 static int score_query_against_words(Span<StringRef> query_words, Span<StringRef> result_words)
293 {
294  /* Remember which words have been matched, so that they are not matched again. */
295  Array<bool, 64> word_is_usable(result_words.size(), true);
296 
297  /* Start with some high score, because otherwise the final score might become negative. */
298  int total_match_score = 1000;
299 
300  for (StringRef query_word : query_words) {
301  {
302  /* Check if any result word begins with the query word. */
303  const int word_index = get_shortest_word_index_that_startswith(
304  query_word, result_words, word_is_usable);
305  if (word_index >= 0) {
306  total_match_score += 10;
307  word_is_usable[word_index] = false;
308  continue;
309  }
310  }
311  {
312  /* Try to match against word initials. */
313  Array<bool, 64> matched_words(result_words.size());
314  const bool success = match_word_initials(
315  query_word, result_words, word_is_usable, matched_words);
316  if (success) {
317  total_match_score += 3;
318  for (const int i : result_words.index_range()) {
319  if (matched_words[i]) {
320  word_is_usable[i] = false;
321  }
322  }
323  continue;
324  }
325  }
326  {
327  /* Fuzzy match against words. */
328  int error_count = 0;
329  const int word_index = get_word_index_that_fuzzy_matches(
330  query_word, result_words, word_is_usable, &error_count);
331  if (word_index >= 0) {
332  total_match_score += 3 - error_count;
333  word_is_usable[word_index] = false;
334  continue;
335  }
336  }
337 
338  /* Couldn't match query word with anything. */
339  return -1;
340  }
341 
342  return total_match_score;
343 }
344 
350  LinearAllocator<> &allocator,
351  Vector<StringRef, 64> &r_words)
352 {
353  const uint32_t unicode_space = BLI_str_utf8_as_unicode(" ");
354  const uint32_t unicode_right_triangle = BLI_str_utf8_as_unicode("▶");
355 
356  auto is_separator = [&](uint32_t unicode) {
357  return ELEM(unicode, unicode_space, unicode_right_triangle);
358  };
359 
360  /* Make a copy of the string so that we can edit it. */
361  StringRef str_copy = allocator.copy_string(str);
362  char *mutable_copy = const_cast<char *>(str_copy.data());
363  const size_t str_size_in_bytes = static_cast<size_t>(str.size());
364  BLI_str_tolower_ascii(mutable_copy, str_size_in_bytes);
365 
366  /* Iterate over all unicode code points to split individual words. */
367  bool is_in_word = false;
368  size_t word_start = 0;
369  size_t offset = 0;
370  while (offset < str_size_in_bytes) {
371  size_t size = 0;
372  uint32_t unicode = BLI_str_utf8_as_unicode_and_size(str.data() + offset, &size);
373  if (is_separator(unicode)) {
374  if (is_in_word) {
375  r_words.append(
376  str_copy.substr(static_cast<int>(word_start), static_cast<int>(offset - word_start)));
377  is_in_word = false;
378  }
379  }
380  else {
381  if (!is_in_word) {
382  word_start = offset;
383  is_in_word = true;
384  }
385  }
386  offset += size;
387  }
388  /* If the last word is not followed by a separator, it has to be handled separately. */
389  if (is_in_word) {
390  r_words.append(str_copy.drop_prefix(static_cast<int>(word_start)));
391  }
392 }
393 
394 } // namespace blender::string_search
395 
396 struct SearchItem {
398  int length;
399  void *user_data;
400 };
401 
402 struct StringSearch {
405 };
406 
408 {
409  return new StringSearch();
410 }
411 
416 void BLI_string_search_add(StringSearch *search, const char *str, void *user_data)
417 {
418  using namespace blender;
419  Vector<StringRef, 64> words;
420  StringRef str_ref{str};
421  string_search::extract_normalized_words(str_ref, search->allocator, words);
422  search->items.append(
423  {search->allocator.construct_array_copy(words.as_span()), (int)str_ref.size(), user_data});
424 }
425 
431 int BLI_string_search_query(StringSearch *search, const char *query, void ***r_data)
432 {
433  using namespace blender;
434 
435  const StringRef query_str = query;
436 
437  LinearAllocator<> allocator;
438  Vector<StringRef, 64> query_words;
439  string_search::extract_normalized_words(query_str, allocator, query_words);
440 
441  /* Compute score of every result. */
442  MultiValueMap<int, int> result_indices_by_score;
443  for (const int result_index : search->items.index_range()) {
445  query_words, search->items[result_index].normalized_words);
446  if (score >= 0) {
447  result_indices_by_score.add(score, result_index);
448  }
449  }
450 
451  Vector<int> found_scores;
452  for (const int score : result_indices_by_score.keys()) {
453  found_scores.append(score);
454  }
455  std::sort(found_scores.begin(), found_scores.end(), std::greater<>());
456 
457  /* Add results to output vector in correct order. First come the results with the best match
458  * score. Results with the same score are in the order they have been added to the search. */
459  Vector<int> sorted_result_indices;
460  for (const int score : found_scores) {
461  MutableSpan<int> indices = result_indices_by_score.lookup(score);
462  if (score == found_scores[0] && !query_str.is_empty()) {
463  /* Sort items with best score by length. Shorter items are more likely the ones you are
464  * looking for. This also ensures that exact matches will be at the top, even if the query is
465  * a substring of another item. */
466  std::sort(indices.begin(), indices.end(), [&](int a, int b) {
467  return search->items[a].length < search->items[b].length;
468  });
469  }
470  sorted_result_indices.extend(indices);
471  }
472 
473  void **sorted_data = static_cast<void **>(
474  MEM_malloc_arrayN(static_cast<size_t>(sorted_result_indices.size()), sizeof(void *), AT));
475  for (const int i : sorted_result_indices.index_range()) {
476  const int result_index = sorted_result_indices[i];
477  SearchItem &item = search->items[result_index];
478  sorted_data[i] = item.user_data;
479  }
480 
481  *r_data = sorted_data;
482 
483  return sorted_result_indices.size();
484 }
485 
487 {
488  delete string_search;
489 }
#define BLI_assert(a)
Definition: BLI_assert.h:58
void BLI_str_tolower_ascii(char *str, const size_t len) ATTR_NONNULL()
Definition: string.c:890
struct StringSearch StringSearch
int BLI_str_utf8_size(const char *p) ATTR_NONNULL()
Definition: string_utf8.c:495
unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index) ATTR_NONNULL()
Definition: string_utf8.c:550
unsigned int BLI_str_utf8_as_unicode(const char *p) ATTR_NONNULL()
Definition: string_utf8.c:533
size_t BLI_strnlen_utf8(const char *strc, const size_t maxlen) ATTR_NONNULL()
Definition: string_utf8.c:387
unsigned int uint
Definition: BLI_sys_types.h:83
#define ELEM(...)
#define AT
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum query
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble v1
ATTR_WARN_UNUSED_RESULT const BMVert * v2
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition: btDbvt.cpp:52
void sort(btMatrix3x3 &U, btVector3 &sigma, btMatrix3x3 &V, int t)
Helper function of 3X3 SVD for sorting singular values.
const T * data() const
Definition: BLI_array.hh:297
MutableSpan< T > construct_array_copy(Span< T > src)
StringRefNull copy_string(StringRef str)
MapType::KeyIterator keys() const
Span< Value > lookup(const Key &key) const
void add(const Key &key, const Value &value)
constexpr void fill(const T &value)
Definition: BLI_span.hh:540
constexpr int64_t size() const
Definition: BLI_span.hh:254
constexpr IndexRange index_range() const
Definition: BLI_span.hh:414
constexpr const char * data() const
static constexpr int64_t not_found
constexpr int64_t find(char c, int64_t pos=0) const
constexpr bool is_empty() const
constexpr const char * end() const
constexpr const char * begin() const
constexpr bool startswith(StringRef prefix) const
constexpr StringRef substr(int64_t start, const int64_t size) const
constexpr int64_t size() const
constexpr StringRef drop_prefix(const int64_t n) const
int64_t size() const
Definition: BLI_vector.hh:662
void append(const T &value)
Definition: BLI_vector.hh:438
Span< T > as_span() const
Definition: BLI_vector.hh:340
IndexRange index_range() const
Definition: BLI_vector.hh:887
void extend(Span< T > array)
Definition: BLI_vector.hh:515
void * user_data
#define str(s)
static ushort indices[]
void *(* MEM_malloc_arrayN)(size_t len, size_t size, const char *str)
Definition: mallocn.c:48
static unsigned a[3]
Definition: RandGen.cpp:92
void extract_normalized_words(StringRef str, LinearAllocator<> &allocator, Vector< StringRef, 64 > &r_words)
static int get_shortest_word_index_that_startswith(StringRef query, Span< StringRef > words, Span< bool > word_is_usable)
static bool match_word_initials(StringRef query, Span< StringRef > words, Span< bool > word_is_usable, MutableSpan< bool > r_word_is_matched, int start=0)
static int64_t count_utf8_code_points(StringRef str)
static int get_word_index_that_fuzzy_matches(StringRef query, Span< StringRef > words, Span< bool > word_is_usable, int *r_error_count)
int get_fuzzy_match_errors(StringRef query, StringRef full)
static int score_query_against_words(Span< StringRef > query_words, Span< StringRef > result_words)
int damerau_levenshtein_distance(StringRef a, StringRef b)
#define min(a, b)
Definition: sort.c:51
#define INT32_MAX
Definition: stdint.h:140
unsigned int uint32_t
Definition: stdint.h:83
__int64 int64_t
Definition: stdint.h:92
StringSearch * BLI_string_search_new()
void BLI_string_search_add(StringSearch *search, const char *str, void *user_data)
int BLI_string_search_query(StringSearch *search, const char *query, void ***r_data)
void BLI_string_search_free(StringSearch *string_search)
void * user_data
blender::Span< blender::StringRef > normalized_words
blender::Vector< SearchItem > items
blender::LinearAllocator allocator
float max
ccl_device_inline float distance(const float2 &a, const float2 &b)