Coverage Report

Created: 2026-03-31 07:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mysql-server/mysys/charset.cc
Line
Count
Source
1
/* Copyright (c) 2000, 2025, Oracle and/or its affiliates.
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License, version 2.0,
5
   as published by the Free Software Foundation.
6
7
   This program is designed to work with certain software (including
8
   but not limited to OpenSSL) that is licensed under separate terms,
9
   as designated in a particular file or component or in included license
10
   documentation.  The authors of MySQL hereby grant you an additional
11
   permission to link the program and your derivative works with the
12
   separately licensed software that they have either included with
13
   the program or referenced in the documentation.
14
15
   Without limiting anything contained in the foregoing, this file,
16
   which is part of C Driver for MySQL (Connector/C), is also subject to the
17
   Universal FOSS Exception, version 1.0, a copy of which can be found at
18
   http://oss.oracle.com/licenses/universal-foss-exception.
19
20
   This program is distributed in the hope that it will be useful,
21
   but WITHOUT ANY WARRANTY; without even the implied warranty of
22
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
   GNU General Public License, version 2.0, for more details.
24
25
   You should have received a copy of the GNU General Public License
26
   along with this program; if not, write to the Free Software
27
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
28
29
/**
30
  @file mysys/charset.cc
31
*/
32
33
#include <fcntl.h>
34
#include <stdlib.h>
35
#include <sys/stat.h>
36
#include <sys/types.h>
37
38
#include <cassert>
39
#include <cstdarg>
40
#include <cstdint>
41
#include <cstring>
42
#include <functional>
43
#include <memory>
44
#include <mutex>
45
#include <new>
46
#include <string>
47
#include <string_view>
48
49
#include "my_config.h"
50
51
#include "m_string.h"
52
#include "map_helpers.h"
53
#include "my_dbug.h"
54
#include "my_dir.h"
55
#include "my_inttypes.h"
56
#include "my_io.h"
57
#include "my_sys.h"
58
#include "mysql/my_loglevel.h"
59
#include "mysql/psi/mysql_file.h"
60
#include "mysql/service_mysql_alloc.h"
61
#include "mysql/strings/collations.h"
62
#include "mysql/strings/int2str.h"
63
#include "mysql/strings/m_ctype.h"
64
#include "mysys/mysys_priv.h"
65
#include "mysys_err.h"
66
#include "nulls.h"
67
#include "strings/collations_internal.h"
68
#include "strmake.h"
69
#include "strxmov.h"
70
71
#ifdef MYSQL_SERVER
72
#include "mysql/components/services/log_builtins.h"
73
#include "mysqld_error.h"
74
#include "sql/current_thd.h"
75
#include "sql/mysqld.h"
76
#include "sql/sql_class.h"
77
#endif
78
79
using myf = int;
80
81
/*
82
  The code below implements this functionality:
83
84
    - Initializing charset related structures
85
    - Loading dynamic charsets
86
    - Searching for a proper CHARSET_INFO
87
      using charset name, collation name or collation ID
88
    - Setting server default character set
89
*/
90
91
namespace {
92
93
class Mysys_charset_loader : public MY_CHARSET_LOADER {
94
 public:
95
  Mysys_charset_loader(const Mysys_charset_loader &) = delete;
96
  Mysys_charset_loader(const Mysys_charset_loader &&) = delete;
97
  Mysys_charset_loader &operator=(const Mysys_charset_loader &) = delete;
98
  Mysys_charset_loader &operator=(const Mysys_charset_loader &&) = delete;
99
100
2
  Mysys_charset_loader() = default;
101
  ~Mysys_charset_loader() override = default;
102
103
0
  void reporter(loglevel level, uint errcode, ...) override {
104
0
    va_list args;
105
0
    va_start(args, errcode);
106
0
    my_charset_error_reporter(level, errcode, args);
107
0
    va_end(args);
108
0
  }
109
110
930
  void *once_alloc(size_t sz) override {
111
930
    return my_once_alloc(sz, MYF(MY_WME));
112
930
  }
113
114
0
  void *mem_malloc(size_t sz) override {
115
0
    return my_malloc(key_memory_charset_loader, sz, MYF(MY_WME));
116
0
  }
117
118
0
  void mem_free(void *ptr) override { my_free(ptr); }
119
120
  void *read_file(const char *path, size_t *size) override;
121
};
122
123
}  // namespace
124
125
29.1k
/// Shorthand accessor for the global mysql::collation_internals::entry object.
static mysql::collation_internals::Collations *entry() {
  mysql::collation_internals::Collations *const registry =
      mysql::collation_internals::entry;
  return registry;
}
128
129
/**
130
  Report character set initialization errors and warnings.
131
  Be silent by default: no warnings on the client side.
132
*/
133
static void default_reporter(loglevel /*unused*/, uint errcode [[maybe_unused]],
134
0
                             va_list /* unused */) {}
135
my_error_vreporter my_charset_error_reporter = default_reporter;
136
137
constexpr size_t MY_MAX_ALLOWED_BUF = static_cast<size_t>(1024) * 1024;
138
139
const char *charsets_dir = nullptr;
140
141
2
void *Mysys_charset_loader::read_file(const char *path, size_t *size) {
142
2
  MY_STAT stat_info{};
143
2
  if (!my_stat(path, &stat_info, 0)) {
144
2
    return nullptr;
145
2
  }
146
147
0
  size_t const len = stat_info.st_size;
148
0
  if (len > MY_MAX_ALLOWED_BUF) {
149
0
    return nullptr;
150
0
  }
151
152
  // NOLINTNEXTLINE we *do* use a smart pointer here.
153
0
  unique_ptr_free<uint8_t> buf(static_cast<uint8_t *>(malloc(len)));
154
0
  if (buf == nullptr) {
155
0
    return nullptr;
156
0
  }
157
158
0
  int const fd = mysql_file_open(key_file_charset, path, O_RDONLY, 0);
159
0
  if (fd < 0) {
160
0
    return nullptr;
161
0
  }
162
163
0
  size_t const tmp_len = mysql_file_read(fd, buf.get(), len, 0);
164
0
  mysql_file_close(fd, 0);
165
0
  if (tmp_len != len) {
166
0
    return nullptr;
167
0
  }
168
169
0
  *size = len;
170
0
  return buf.release();
171
0
}
172
173
2
char *get_charsets_dir(char *buf) {
174
2
  const char *sharedir = SHAREDIR;
175
2
  DBUG_TRACE;
176
177
2
  if (charsets_dir != nullptr)
178
0
    strmake(buf, charsets_dir, FN_REFLEN - 1);
179
2
  else {
180
2
    if (test_if_hard_path(sharedir) ||
181
0
        is_prefix(sharedir, DEFAULT_CHARSET_HOME))
182
2
      strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
183
0
    else
184
0
      strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
185
0
              NullS);
186
2
  }
187
2
  char *res = convert_dirname(buf, buf, NullS);
188
2
  DBUG_PRINT("info", ("charsets dir: '%s'", buf));
189
2
  return res;
190
2
}
191
192
// Collations indexed by their number; filled by init_available_charsets().
const CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE] = {nullptr};
193
// Returned directly by get_charset() when the requested id matches its number.
CHARSET_INFO *default_charset_info = &my_charset_latin1;
194
195
// Passed to std::call_once so init_available_charsets() runs only once.
static std::once_flag charsets_initialized;
196
197
// Created in init_available_charsets(), destroyed in charset_uninit().
static Mysys_charset_loader *loader = nullptr;
198
199
2
static void init_available_charsets() {
200
2
  assert(loader == nullptr);
201
2
  loader = new Mysys_charset_loader;
202
203
2
  char charset_dir[FN_REFLEN];
204
2
  get_charsets_dir(charset_dir);
205
206
2
  mysql::collation::initialize(charset_dir, loader);
207
2
  entry()->iterate(
208
572
      [](const CHARSET_INFO *cs) { all_charsets[cs->number] = cs; });
209
2
}
210
211
2
uint get_collation_number(const char *collation_name) {
212
2
  std::call_once(charsets_initialized, init_available_charsets);
213
2
  mysql::collation::Name const name{collation_name};
214
2
  return entry()->get_collation_id(name);
215
2
}
216
217
0
unsigned get_charset_number(const char *cs_name, uint cs_flags) {
218
0
  std::call_once(charsets_initialized, init_available_charsets);
219
0
  mysql::collation::Name const name{cs_name};
220
0
  if ((cs_flags & MY_CS_PRIMARY)) {
221
0
    return entry()->get_primary_collation_id(name);
222
0
  }
223
0
  if ((cs_flags & MY_CS_BINSORT)) {
224
0
    return entry()->get_default_binary_collation_id(name);
225
0
  }
226
0
  assert(false);
227
0
  return 0;
228
0
}
229
230
0
const char *get_collation_name(uint charset_number) {
231
0
  std::call_once(charsets_initialized, init_available_charsets);
232
233
0
  CHARSET_INFO *cs = entry()->find_by_id(charset_number);
234
0
  if (cs != nullptr) {
235
0
    assert(cs->number == charset_number);
236
0
    assert(cs->m_coll_name != nullptr);
237
0
    return cs->m_coll_name;
238
0
  }
239
240
0
  return "?"; /* this mimics find_type() */
241
0
}
242
243
9.79k
/**
  Find a collation by id.

  @param cs_number  Collation id.
  @param flags      With MY_WME set, an unknown id is reported through
                    my_error(EE_UNKNOWN_CHARSET).

  @return The collation; default_charset_info when the id matches it;
          nullptr when the id is 0, is >= MY_ALL_CHARSETS_SIZE, or is unknown.
*/
CHARSET_INFO *get_charset(uint cs_number, myf flags) {
  std::call_once(charsets_initialized, init_available_charsets);

  if (default_charset_info->number == cs_number) return default_charset_info;

  const bool id_in_range = cs_number != 0 && cs_number < MY_ALL_CHARSETS_SIZE;
  if (!id_in_range) return nullptr;

  CHARSET_INFO *const found = entry()->find_by_id(cs_number);
  if (found != nullptr || !(flags & MY_WME)) return found;

  // Unknown id and the caller asked for an error: report it as "#<id>"
  // together with the path of the charset index file.
  char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
  my_stpcpy(get_charsets_dir(index_file), MY_CHARSET_INDEX);

  constexpr int cs_string_size = 23;
  char cs_string[cs_string_size];
  cs_string[0] = '#';
  longlong10_to_str(cs_number, cs_string + 1, 10);

  my_error(EE_UNKNOWN_CHARSET, MYF(0), cs_string, index_file);
  return found;
}
264
265
namespace {
266
267
0
bool starts_with_utf8(const char *name) {
268
  // See normalization of names in mysql::collation::Name::Name()
269
0
  const uint8_t *map = my_charset_latin1.to_lower;
270
0
  constexpr const char prefix[] = "UTF8_";
271
0
  if (strlen(name) < strlen(prefix)) return false;
272
0
  for (unsigned int ix = 0; ix < strlen(prefix); ++ix) {
273
0
    if (map[static_cast<uint8_t>(name[ix])] !=
274
0
        map[static_cast<uint8_t>(prefix[ix])]) {
275
0
      return false;
276
0
    }
277
0
  }
278
0
  return true;
279
0
}
280
281
/**
282
  What does "utf8" mean, "utf8mb3" or "utf8mb4"?
283
284
  We ask 'current_thd' whether MODE_INTERPRET_UTF8_AS_UTF8MB4 has been
285
  set in SQL_MODE.
286
287
  We return 3 or 4 to indicate what alias is active.  A special value
288
  42 is used during bootstrap: reject command line arguments, and
289
  abort. There are no lookups during shutdown.
290
291
  For client-side requests (e.g. mysql --default-character-set=utf8)
292
  we always return 3.
293
294
  @param [in] cname Charset/Collation name, used during bootstrap
295
              to give the desired error message.
296
297
  @return 3,4,42 to indicate what alias to use, or to abort.
298
 */
299
1.54k
int utf8_alias_lookup(const char *cname [[maybe_unused]]) {
#ifdef MYSQL_SERVER
  const enum_server_operational_state state = get_server_state();
  switch (state) {
    case SERVER_BOOTING:
      // During boot the alias is rejected: log which option was bad and abort.
      if (native_strcasecmp(cname, "utf8") == 0) {
        LogErr(ERROR_LEVEL, ER_INVALID_SERVER_OPTION_CHARSET);
        return 42;  // abort
      }
      if (native_strncasecmp(cname, "utf8_", 5) == 0) {
        LogErr(ERROR_LEVEL, ER_INVALID_SERVER_OPTION_COLLATION, cname);
        return 42;  // abort
      }
      assert(false);
      return 3;

    case SERVER_OPERATING:
      // Normal operation: the current session decides.
      assert(current_thd != nullptr);
      return (current_thd->interpret_utf8_as_utf8mb4() ? 4 : 3);

    case SERVER_SHUTTING_DOWN:
    default:
      // No lookups are expected during shutdown or in any other state.
      assert(false);
      return 42;
  }
#else
  // We do not want to report an error
  //    Character set 'utf8' is not a compiled character set ...
  // whenever a client is invoked with --default-character-set=utf8
  return 3;
#endif  // MYSQL_SERVER
}
331
332
}  // namespace
333
334
/**
335
  Find collation by name: extended version of get_charset_by_name()
336
  to return error messages to the caller.
337
  @param [in]  collation_name Collation name
338
  @param [in]  flags   Flags
339
  @param [out] errmsg  Error message buffer (if any)
340
341
  @return          NULL on error, pointer to collation on success
342
*/
343
344
CHARSET_INFO *my_collation_get_by_name(const char *collation_name, myf flags,
                                       MY_CHARSET_ERRMSG *errmsg) {
  std::call_once(charsets_initialized, init_available_charsets);

  // Holds the expanded name when a "utf8_" alias has to be rewritten.
  std::string expanded(collation_name);
  if (starts_with_utf8(collation_name)) {
    const int alias = utf8_alias_lookup(collation_name);
    // abort, assume that utf8_alias_lookup has logged something
    if (alias == 42) return nullptr;
    assert(alias == 3 || alias == 4);

    if (alias == 4) {
      // These two collations exist for utf8mb3 only, so the alias cannot be
      // expanded to utf8mb4 for them:
      //   utf8mb3_general_mysql500_ci
      //   utf8mb3_tolower_ci
      // See normalization of names in mysql::collation::Name::Name()
      const bool mb3_only =
          my_strcasecmp(&my_charset_latin1, collation_name,
                        "utf8_general_mysql500_ci") == 0 ||
          my_strcasecmp(&my_charset_latin1, collation_name,
                        "utf8_tolower_ci") == 0;
      if (mb3_only) {
        errmsg->errcode = EE_COLLATION_ALIAS_ERROR;
        expanded.insert(4, "mb3");
        snprintf(errmsg->errarg, sizeof(errmsg->errarg),
                 "Collation '%s' must be specified explicitly as '%s'",
                 collation_name, expanded.c_str());
        return nullptr;
      }
    }

    // Turn "utf8_xxxx" into "utf8mb3_xxxx" or "utf8mb4_xxxx".
    expanded.insert(4, alias == 4 ? "mb4" : "mb3");
    collation_name = expanded.c_str();
  }

  const mysql::collation::Name name{collation_name};
  CHARSET_INFO *const found = entry()->find_by_name(name, flags, errmsg);
  if (found != nullptr || !(flags & MY_WME)) return found;

  // Not found and the caller asked for an error to be raised.
  char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
  my_stpcpy(get_charsets_dir(index_file), MY_CHARSET_INDEX);
  my_error(EE_UNKNOWN_COLLATION, MYF(0),
           std::string(name.to_string_view()).c_str(), index_file);
  return found;
}
394
395
0
/// Find a collation by name; any error details from
/// my_collation_get_by_name() are discarded.
CHARSET_INFO *get_charset_by_name(const char *collation_name, myf flags) {
  MY_CHARSET_ERRMSG ignored_errmsg;
  CHARSET_INFO *const found =
      my_collation_get_by_name(collation_name, flags, &ignored_errmsg);
  return found;
}
399
400
/**
401
  Find character set by name: extended version of get_charset_by_csname()
402
  to return error messages to the caller.
403
  @param [in]  cs_name  Character set name
404
  @param [in]  cs_flags Character set flags (e.g. default or binary collation)
405
  @param [in]  flags    Flags
406
  @param [out] errmsg   Error message buffer (if any)
407
408
  @return           NULL on error, pointer to collation on success
409
*/
410
CHARSET_INFO *my_charset_get_by_name(const char *cs_name, uint cs_flags,
                                     myf flags, MY_CHARSET_ERRMSG *errmsg) {
  DBUG_TRACE;
  DBUG_PRINT("enter", ("name: '%s'", cs_name));

  std::call_once(charsets_initialized, init_available_charsets);

  const mysql::collation::Name name{cs_name};

  // MY_CS_PRIMARY wins over MY_CS_BINSORT when both are set.
  const bool want_primary = (cs_flags & MY_CS_PRIMARY) != 0;
  const bool want_binary = !want_primary && (cs_flags & MY_CS_BINSORT) != 0;

  // Single lookup routine shared by the direct lookup and the alias retry.
  const auto lookup =
      [&](const mysql::collation::Name &charset) -> CHARSET_INFO * {
    return want_primary
               ? entry()->find_primary(charset, flags, errmsg)
               : entry()->find_default_binary(charset, flags, errmsg);
  };

  CHARSET_INFO *cs = nullptr;
  if (want_primary || want_binary) {
    cs = lookup(name);

    if (cs == nullptr && name.to_string_view() == "utf8") {
      const int alias = utf8_alias_lookup("utf8");
      // abort, assume that utf8_alias_lookup has logged something
      if (alias == 42) return nullptr;
      assert(alias == 3 || alias == 4);

      // The parser does get_charset_by_csname().
      // Also needed for e.g. SET character_set_client= 'utf8'.
      // Also needed by the lexer for: "select _utf8 0xD0B0D0B1D0B2;"
      cs = lookup(mysql::collation::Name(alias == 4 ? "utf8mb4" : "utf8mb3"));
    }
  }

  if (cs == nullptr && (flags & MY_WME)) {
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
    my_stpcpy(get_charsets_dir(index_file), MY_CHARSET_INDEX);
    my_error(EE_UNKNOWN_CHARSET, MYF(0), cs_name, index_file);
  }

  return cs;
}
455
456
/// Find a character set's collation by character set name; any error details
/// from my_charset_get_by_name() are discarded.
CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint cs_flags,
                                    myf flags) {
  MY_CHARSET_ERRMSG ignored_errmsg;
  CHARSET_INFO *const found =
      my_charset_get_by_name(cs_name, cs_flags, flags, &ignored_errmsg);
  return found;
}
461
462
/**
463
  Resolve character set by the character set name (utf8, latin1, ...).
464
465
  The function tries to resolve character set by the specified name. If
466
  there is character set with the given name, it is assigned to the "cs"
467
  parameter and false is returned. If there is no such character set,
468
  "default_cs" is assigned to the "cs" and true is returned.
469
470
  @param[in] cs_name    Character set name.
471
  @param[in] default_cs Default character set.
472
  @param[out] cs        Variable to store character set.
473
474
  @return false if character set was resolved successfully; true if there
475
  is no character set with given name.
476
*/
477
478
bool resolve_charset(const char *cs_name, const CHARSET_INFO *default_cs,
479
0
                     const CHARSET_INFO **cs) {
480
0
  *cs = get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0));
481
482
0
  if (*cs == nullptr) {
483
0
    *cs = default_cs;
484
0
    return true;
485
0
  }
486
487
0
  return false;
488
0
}
489
490
/**
491
  Resolve collation by the collation name (utf8_general_ci, ...).
492
493
  The function tries to resolve collation by the specified name. If there
494
  is collation with the given name, it is assigned to the "cl" parameter
495
  and false is returned. If there is no such collation, "default_cl" is
496
  assigned to the "cl" and true is returned.
497
498
  @param[in] cl_name    Collation name.
499
  @param[in] default_cl Default collation.
500
  @param[out] cl        Variable to store collation.
501
502
  @return false if collation was resolved successfully; true if there is no
503
  collation with given name.
504
*/
505
506
bool resolve_collation(const char *cl_name, const CHARSET_INFO *default_cl,
507
0
                       const CHARSET_INFO **cl) {
508
0
  *cl = get_charset_by_name(cl_name, MYF(0));
509
510
0
  if (*cl == nullptr) {
511
0
    *cl = default_cl;
512
0
    return true;
513
0
  }
514
515
0
  return false;
516
0
}
517
518
#ifdef _WIN32
519
extern CHARSET_INFO my_charset_cp932_japanese_ci;

// Result of the first fs_character_set() call; nullptr until then.
static CHARSET_INFO *fs_cset_cache = nullptr;

/// Return the character set chosen for file system names: cp932 when the
/// system default ANSI code page is 932, otherwise binary. The answer is
/// computed on the first call and cached.
CHARSET_INFO *fs_character_set() {
  if (fs_cset_cache != nullptr) return fs_cset_cache;

  // "cp" followed by the code page number written by GetLocaleInfo().
  char codepage[10] = "cp";
  GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
                codepage + 2, sizeof(codepage) - 3);
  /*
    We cannot call get_charset_by_name here,
    we will end up in a deadlock (in std::call_once) because of recursion:
      init-available-charsets ->
      get_charsets_dir ->
      convert_dirname ->
      fs_character_set
  */
  if (strcmp(codepage, "cp932") == 0) {
    fs_cset_cache = &my_charset_cp932_japanese_ci;
  } else {
    fs_cset_cache = &my_charset_bin;
  }
  return fs_cset_cache;
}
541
#endif
542
543
0
// Tear down the charset subsystem: shut down the collation library, destroy
// the loader created by init_available_charsets(), and reset the once_flag.
void charset_uninit() {
544
0
  mysql::collation::shutdown();
545
546
0
  delete loader;
547
0
  // init_available_charsets() asserts that loader is null before creating it.
  loader = nullptr;
548
549
0
  // Construct a fresh once_flag in place so the next std::call_once on it
  // runs init_available_charsets() again.
  new (&charsets_initialized) std::once_flag;
550
0
}