Coverage Report

Created: 2025-06-13 07:09

/src/server/mysys/charset.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   Copyright (c) 2000, 2011, Oracle and/or its affiliates
3
   Copyright (c) 2009, 2020, MariaDB Corporation.
4
5
   This program is free software; you can redistribute it and/or modify
6
   it under the terms of the GNU General Public License as published by
7
   the Free Software Foundation; version 2 of the License.
8
9
   This program is distributed in the hope that it will be useful,
10
   but WITHOUT ANY WARRANTY; without even the implied warranty of
11
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
   GNU General Public License for more details.
13
14
   You should have received a copy of the GNU General Public License
15
   along with this program; if not, write to the Free Software
16
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
17
18
#include "mysys_priv.h"
19
#include "mysys_err.h"
20
#include <m_ctype.h>
21
#include <m_string.h>
22
#include <my_dir.h>
23
#include <hash.h>
24
#include <my_xml.h>
25
#ifdef HAVE_LANGINFO_H
26
#include <langinfo.h>
27
#endif
28
#ifdef HAVE_LOCALE_H
29
#include <locale.h>
30
#endif
31
32
/*
  Name lookup tables over CHARSET_INFO objects.
  Both are set up in init_available_charsets() and released in
  free_charsets().
*/
static HASH charset_name_hash;    /* keyed by cs_name   (see get_charset_key)   */
static HASH collation_name_hash;  /* keyed by coll_name (see get_collation_key) */
34
35
/*
36
  The code below implements this functionality:
37
  
38
    - Initializing charset related structures
39
    - Loading dynamic charsets
40
    - Searching for a proper CHARSET_INFO 
41
      using charset name, collation name or collation ID
42
    - Setting server default character set
43
*/
44
45
static uint
46
get_collation_number_internal(const char *name)
47
0
{
48
0
  CHARSET_INFO *cs= (CHARSET_INFO*) my_hash_search(&collation_name_hash,
49
0
                                                   (uchar*) name, strlen(name));
50
0
  return cs ? cs->number : 0;
51
0
}
52
53
54
/**
  Check how the character set classifies a single byte taken on its own.

  @param cs  Character set whose charlen handler is consulted.
  @param ch  The byte to examine.
  @return    TRUE if my_ci_charlen() reports a "too small" result for the
             one-byte input, FALSE otherwise.
*/
static my_bool is_multi_byte_ident(CHARSET_INFO *cs, uchar ch)
{
  uchar *first= &ch;
  uchar *past_end= first + 1;
  int rc= my_ci_charlen(cs, first, past_end);

  if (MY_CS_IS_TOOSMALL(rc))
    return TRUE;
  return FALSE;
}
59
60
static my_bool init_state_maps(struct charset_info_st *cs)
61
0
{
62
0
  uint i;
63
0
  uchar *state_map;
64
0
  uchar *ident_map;
65
66
0
  if (!(cs->state_map= state_map= (uchar*) my_once_alloc(256*2, MYF(MY_WME))))
67
0
    return 1;
68
    
69
0
  cs->ident_map= ident_map= state_map + 256;
70
71
  /* Fill state_map with states to get a faster parser */
72
0
  for (i=0; i < 256 ; i++)
73
0
  {
74
0
    if (my_isalpha(cs,i))
75
0
      state_map[i]=(uchar) MY_LEX_IDENT;
76
0
    else if (my_isdigit(cs,i))
77
0
      state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
78
0
    else if (is_multi_byte_ident(cs, i))
79
0
      state_map[i]=(uchar) MY_LEX_IDENT;
80
0
    else if (my_isspace(cs,i))
81
0
      state_map[i]=(uchar) MY_LEX_SKIP;
82
0
    else
83
0
      state_map[i]=(uchar) MY_LEX_CHAR;
84
0
  }
85
0
  state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
86
0
  state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
87
0
  state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
88
0
  state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
89
0
  state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
90
0
  state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
91
0
  state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
92
0
  state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
93
0
  state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
94
0
  state_map[0]=(uchar) MY_LEX_EOL;
95
0
  state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
96
0
  state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
97
0
  state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
98
0
  state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
99
0
  state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
100
0
  state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
101
0
  state_map[(uchar)'-']= (uchar) MY_LEX_MINUS_OR_COMMENT;
102
0
  state_map[(uchar)',']= (uchar) MY_LEX_COMMA;
103
0
  state_map[(uchar)'?']= (uchar) MY_LEX_PLACEHOLDER;
104
105
  /*
106
    Create a second map to make it faster to find identifiers
107
  */
108
0
  for (i=0; i < 256 ; i++)
109
0
  {
110
0
    ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
111
0
         state_map[i] == MY_LEX_NUMBER_IDENT);
112
0
  }
113
114
  /* Special handling of hex and binary strings */
115
0
  state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
116
0
  state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
117
0
  state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
118
0
  return 0;
119
0
}
120
121
122
/**
  Choose the 8bit collation handler matching a set of state flags.

  @param flags  Collation state flags; only MY_CS_BINSORT and MY_CS_NOPAD
                are examined.
  @return       Pointer to one of the four 8bit handlers
                (binary / simple case insensitive, each with or without
                NOPAD).
*/
static MY_COLLATION_HANDLER *get_simple_collation_handler_by_flags(uint flags)
{
  if (flags & MY_CS_BINSORT)
  {
    if (flags & MY_CS_NOPAD)
      return &my_collation_8bit_nopad_bin_handler;
    return &my_collation_8bit_bin_handler;
  }

  if (flags & MY_CS_NOPAD)
    return &my_collation_8bit_simple_nopad_ci_handler;
  return &my_collation_8bit_simple_ci_handler;
}
132
133
134
static void simple_cs_init_functions(struct charset_info_st *cs)
135
0
{
136
0
  cs->coll= get_simple_collation_handler_by_flags(cs->state);
137
0
  cs->cset= &my_charset_8bit_handler;
138
0
}
139
140
141
142
/**
  Copy the data that is present in "from" into "to".

  Every copied string or table is duplicated with my_once_memdup() or
  my_once_strdup(). Members that are not set in "from" are left alone in
  "to". When the ctype table is copied, the lexer state maps of "to" are
  rebuilt as well.

  @param to    Destination character set.
  @param from  Source of the data.
  @return      0 on success, 1 if a duplication or init_state_maps() failed.
*/
static int cs_copy_data(struct charset_info_st *to, CHARSET_INFO *from)
{
  if (from->number)
    to->number= from->number;

  /* Don't replace csname if already set */
  if (from->cs_name.str && !to->cs_name.str)
  {
    to->cs_name.str= my_once_memdup(from->cs_name.str,
                                    from->cs_name.length + 1,
                                    MYF(MY_WME));
    if (!to->cs_name.str)
      return 1;
    to->cs_name.length= from->cs_name.length;
  }

  if (from->coll_name.str)
  {
    to->coll_name.str= my_once_memdup(from->coll_name.str,
                                      from->coll_name.length + 1,
                                      MYF(MY_WME));
    if (!to->coll_name.str)
      return 1;
    to->coll_name.length= from->coll_name.length;
  }

  if (from->comment)
  {
    to->comment= my_once_strdup(from->comment, MYF(MY_WME));
    if (!to->comment)
      return 1;
  }

  if (from->m_ctype)
  {
    to->m_ctype= (uchar*) my_once_memdup((char*) from->m_ctype,
                                         MY_CS_CTYPE_TABLE_SIZE,
                                         MYF(MY_WME));
    if (!to->m_ctype)
      return 1;
    /* The state maps are derived from the ctype table just copied */
    if (init_state_maps(to))
      return 1;
  }

  if (from->to_lower)
  {
    to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
                                          MY_CS_TO_LOWER_TABLE_SIZE,
                                          MYF(MY_WME));
    if (!to->to_lower)
      return 1;
  }

  if (from->to_upper)
  {
    to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
                                          MY_CS_TO_UPPER_TABLE_SIZE,
                                          MYF(MY_WME));
    if (!to->to_upper)
      return 1;
  }

  if (from->sort_order)
  {
    to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
                                            MY_CS_SORT_ORDER_TABLE_SIZE,
                                            MYF(MY_WME));
    if (!to->sort_order)
      return 1;
  }

  if (from->tab_to_uni)
  {
    uint table_bytes= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
    to->tab_to_uni= (uint16*) my_once_memdup((char*) from->tab_to_uni,
                                             table_bytes, MYF(MY_WME));
    if (!to->tab_to_uni)
      return 1;
  }

  if (from->tailoring)
  {
    to->tailoring= my_once_strdup(from->tailoring, MYF(MY_WME));
    if (!to->tailoring)
      return 1;
  }

  return 0;
}
213
214
215
/**
  Check that all four 8bit character set tables are present.

  @param cs  Character set to examine.
  @return    Non-zero if m_ctype, to_upper, to_lower and tab_to_uni are
             all set, 0 otherwise.
*/
static my_bool simple_8bit_charset_data_is_full(CHARSET_INFO *cs)
{
  if (!cs->m_ctype || !cs->to_upper)
    return FALSE;
  if (!cs->to_lower || !cs->tab_to_uni)
    return FALSE;
  return TRUE;
}
219
220
221
/**
222
  Inherit missing 8bit charset data from another collation.
223
  Arrays pointed by refcs must be in the permanent memory already,
224
  e.g. static memory, or allocated by my_once_xxx().
225
*/
226
static void
227
inherit_charset_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
228
0
{
229
0
  if (!cs->to_upper)
230
0
    cs->to_upper= refcs->to_upper;
231
0
  if (!cs->to_lower)
232
0
    cs->to_lower= refcs->to_lower;
233
0
  if (!cs->m_ctype)
234
0
    cs->m_ctype= refcs->m_ctype;
235
0
  if (!cs->tab_to_uni)
236
0
    cs->tab_to_uni= refcs->tab_to_uni;
237
0
}
238
239
240
/**
  Check whether an 8bit collation has what it needs for sorting.

  @param cs  Collation to examine.
  @return    Non-zero if a sort_order table is set or the collation is
             flagged MY_CS_BINSORT, 0 otherwise.
*/
static my_bool simple_8bit_collation_data_is_full(CHARSET_INFO *cs)
{
  if (cs->sort_order)
    return TRUE;
  return (cs->state & MY_CS_BINSORT) ? TRUE : FALSE;
}
244
245
246
/**
247
  Inherit 8bit simple collation data from another collation.
248
  refcs->sort_order must be in the permanent memory already,
249
  e.g. static memory, or allocated by my_once_xxx().
250
*/
251
static void
252
inherit_collation_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
253
0
{
254
0
  if (!simple_8bit_collation_data_is_full(cs))
255
0
    cs->sort_order= refcs->sort_order;
256
0
}
257
258
259
/**
  Check whether a simple 8bit character set description is complete.

  @param cs  Character set to examine.
  @return    Non-zero if the ID, both names and all character set tables
             are present, and in addition either the collation data is
             full or a tailoring is set; 0 otherwise.
*/
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
{
  if (!cs->number || !cs->cs_name.str || !cs->coll_name.str)
    return FALSE;
  if (!simple_8bit_charset_data_is_full(cs))
    return FALSE;
  if (simple_8bit_collation_data_is_full(cs))
    return TRUE;
  return cs->tailoring ? TRUE : FALSE;
}
265
266
267
#if defined(HAVE_UCA_COLLATIONS) && (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8mb3))
/**
  Initialize a loaded collation.

  @param [OUT] to     - The new charset_info_st structure to initialize.
  @param [IN]  from   - A template collation, to fill the missing data from.
  @param [IN]  loaded - The collation data loaded from the LDML file.
                        Some data may be missing in "loaded".
*/
static void
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
                   CHARSET_INFO *loaded)
{
  uint multiply;

  /*
    Single-level UCA collations have strxfrm_multiply=8.
    For a multi-level UCA collation we use strxfrm_multiply=4.
    That makes the sort key length calculation request a smaller buffer
    per level from the caller, for performance purposes, and lets longer
    VARCHARs fit into @@max_sort_length.
    The price is that filesort produces a non-precise order for some rare
    Unicode characters that produce more than 4 weights (long expansions).

    UCA requires 2 bytes per weight multiplied by the number of levels.
    In case of a 2-level collation, each character requires 4*2=8 bytes.
    Therefore, the longest VARCHAR that fits into the default
    @@max_sort_length is 1024/8=VARCHAR(128). With strxfrm_multiply==8,
    only VARCHAR(64) would fit.

    Note, the built-in collation utf8_thai_520_w2 also uses
    strxfrm_multiply=4, for the same purpose.

    TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
  */
  if (loaded->levels_for_order > 1)
    multiply= 4;
  else
    multiply= from->strxfrm_multiply;

  to->cset= from->cset;
  to->coll= from->coll;
  to->strxfrm_multiply= multiply;
  to->min_sort_char= from->min_sort_char;
  to->max_sort_char= from->max_sort_char;
  to->mbminlen= from->mbminlen;
  to->mbmaxlen= from->mbmaxlen;
  to->state|= MY_CS_AVAILABLE | MY_CS_LOADED |
              MY_CS_STRNXFRM  | MY_CS_UNICODE;
}
#endif
308
309
310
static int add_collation(struct charset_info_st *cs)
311
0
{
312
0
  if (cs->coll_name.str &&
313
0
      (cs->number ||
314
0
       (cs->number=get_collation_number_internal(cs->coll_name.str))) &&
315
0
      cs->number < array_elements(all_charsets))
316
0
  {
317
0
    struct charset_info_st *newcs;
318
0
    if (!(newcs= (struct charset_info_st*) all_charsets[cs->number]))
319
0
    {
320
0
      if (!(all_charsets[cs->number]= newcs=
321
0
         (struct charset_info_st*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
322
0
        return MY_XML_ERROR;
323
0
      bzero(newcs,sizeof(CHARSET_INFO));
324
0
    }
325
0
    else
326
0
    {
327
      /* Don't allow change of csname */
328
0
      if (newcs->cs_name.str && strcmp(newcs->cs_name.str, cs->cs_name.str))
329
0
      {
330
0
        my_error(EE_DUPLICATE_CHARSET, MYF(ME_WARNING),
331
0
                 cs->number, cs->cs_name.str, newcs->cs_name.str);
332
        /*
333
          Continue parsing rest of Index.xml. We got an warning in the log
334
          so the user can fix the wrong character set definition.
335
        */
336
0
        return MY_XML_OK;
337
0
      }
338
0
    }
339
340
0
    if (cs->primary_number == cs->number)
341
0
      cs->state |= MY_CS_PRIMARY;
342
      
343
0
    if (cs->binary_number == cs->number)
344
0
      cs->state |= MY_CS_BINSORT;
345
    
346
0
    newcs->state|= cs->state;
347
    
348
0
    if (!(newcs->state & MY_CS_COMPILED))
349
0
    {
350
0
      if (cs_copy_data(newcs,cs))
351
0
        return MY_XML_ERROR;
352
353
0
      newcs->levels_for_order= 1;
354
      
355
0
      if (!strcmp(cs->cs_name.str,"ucs2") )
356
0
      {
357
0
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
358
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
359
0
                                  &my_charset_ucs2_unicode_nopad_ci :
360
0
                                  &my_charset_ucs2_unicode_ci,
361
0
                                  cs);
362
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
363
0
#endif        
364
0
      }
365
0
      else if (!strcmp(cs->cs_name.str, "utf8") ||
366
0
               !strcmp(cs->cs_name.str, "utf8mb3"))
367
0
      {
368
0
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
369
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
370
0
                                  &my_charset_utf8mb3_unicode_nopad_ci :
371
0
                                  &my_charset_utf8mb3_unicode_ci,
372
0
                                  cs);
373
0
        newcs->m_ctype= my_charset_utf8mb3_unicode_ci.m_ctype;
374
0
        if (init_state_maps(newcs))
375
0
          return MY_XML_ERROR;
376
0
#endif
377
0
      }
378
0
      else if (!strcmp(cs->cs_name.str, "utf8mb4"))
379
0
      {
380
0
#if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
381
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
382
0
                                  &my_charset_utf8mb4_unicode_nopad_ci :
383
0
                                  &my_charset_utf8mb4_unicode_ci,
384
0
                                  cs);
385
0
        newcs->m_ctype= my_charset_utf8mb4_unicode_ci.m_ctype;
386
0
        if (init_state_maps(newcs))
387
0
          return MY_XML_ERROR;
388
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
389
0
#endif
390
0
      }
391
0
      else if (!strcmp(cs->cs_name.str, "utf16"))
392
0
      {
393
0
#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
394
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
395
0
                                  &my_charset_utf16_unicode_nopad_ci :
396
0
                                  &my_charset_utf16_unicode_ci,
397
0
                                  cs);
398
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
399
0
#endif
400
0
      }
401
0
      else if (!strcmp(cs->cs_name.str, "utf32"))
402
0
      {
403
0
#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
404
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
405
0
                                  &my_charset_utf32_unicode_nopad_ci :
406
0
                                  &my_charset_utf32_unicode_ci,
407
0
                                  cs);
408
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
409
0
#endif
410
0
      }
411
0
      else
412
0
      {
413
0
        simple_cs_init_functions(newcs);
414
0
        newcs->mbminlen= 1;
415
0
        newcs->mbmaxlen= 1;
416
0
        newcs->strxfrm_multiply= 1;
417
0
        if (simple_cs_is_full(newcs))
418
0
        {
419
0
          newcs->state |= MY_CS_LOADED;
420
0
        }
421
0
      }
422
0
      add_compiled_extra_collation(newcs);
423
0
    }
424
0
    else
425
0
    {
426
      /*
427
        We need the below to make get_charset_name()
428
        and get_charset_number() working even if a
429
        character set has not been really incompiled.
430
        The above functions are used for example
431
        in error message compiler extra/comp_err.c.
432
        If a character set was compiled, this information
433
        will get lost and overwritten in add_compiled_collation().
434
      */
435
0
      newcs->number= cs->number;
436
0
      if (cs->comment)
437
0
  if (!(newcs->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
438
0
    return MY_XML_ERROR;
439
0
      if (cs->cs_name.str && ! newcs->cs_name.str)
440
0
      {
441
0
        if (!(newcs->cs_name.str= my_once_memdup(cs->cs_name.str,
442
0
                                                 cs->cs_name.length+1,
443
0
                                                 MYF(MY_WME))))
444
0
    return MY_XML_ERROR;
445
0
        newcs->cs_name.length= cs->cs_name.length;
446
0
      }
447
0
      if (cs->coll_name.str)
448
0
      {
449
0
  if (!(newcs->coll_name.str= my_once_memdup(cs->coll_name.str,
450
0
                                                   cs->coll_name.length+1,
451
0
                                                  MYF(MY_WME))))
452
0
    return MY_XML_ERROR;
453
0
        newcs->coll_name.length= cs->coll_name.length;
454
0
      }
455
0
    }
456
0
    cs->number= 0;
457
0
    cs->primary_number= 0;
458
0
    cs->binary_number= 0;
459
0
    cs->coll_name.str= 0;
460
0
    cs->coll_name.length= 0;
461
0
    cs->state= 0;
462
0
    cs->sort_order= NULL;
463
0
    cs->tailoring= NULL;
464
0
  }
465
0
  return MY_XML_OK;
466
0
}
467
468
469
/**
470
  Report character set initialization errors and warnings.
471
  Be silent by default: no warnings on the client side.
472
*/
473
ATTRIBUTE_FORMAT(printf, 2, 3) static void
474
default_reporter(enum loglevel level  __attribute__ ((unused)),
475
                 const char *format  __attribute__ ((unused)),
476
                 ...)
477
0
{
478
0
}
479
my_error_reporter my_charset_error_reporter= default_reporter;
480
481
482
/**
483
  Wrappers for memory functions my_malloc (and friends)
484
  with C-compatible API without extra "myf" argument.
485
*/
486
static void *
487
my_once_alloc_c(size_t size)
488
0
{ return my_once_alloc(size, MYF(MY_WME)); }
489
490
491
static void *
492
my_malloc_c(size_t size)
493
0
{ return my_malloc(key_memory_charset_loader, size, MYF(MY_WME)); }
494
495
496
static void *
497
my_realloc_c(void *old, size_t size)
498
0
{ return my_realloc(key_memory_charset_loader, old, size, MYF(MY_WME|MY_ALLOW_ZERO_PTR)); }
499
500
501
/**
502
  Initialize character set loader to use mysys memory management functions.
503
  @param loader  Loader to initialize
504
*/
505
void
506
my_charset_loader_init_mysys(MY_CHARSET_LOADER *loader)
507
0
{
508
0
  loader->error[0]= '\0';
509
0
  loader->once_alloc= my_once_alloc_c;
510
0
  loader->malloc= my_malloc_c;
511
0
  loader->realloc= my_realloc_c;
512
0
  loader->free= my_free;
513
0
  loader->reporter= my_charset_error_reporter;
514
0
  loader->add_collation= add_collation;
515
0
}
516
517
518
0
/*
  Largest charset definition file, in bytes, that my_read_charset_file()
  accepts. The value is parenthesized so that the macro expands as one
  operand next to any operator (e.g. "x / MY_MAX_ALLOWED_BUF").
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* File name appended to the charsets directory by init_available_charsets() */
#define MY_CHARSET_INDEX "Index.xml"

/* When not NULL, get_charsets_dir() uses this path as the charsets directory */
const char *charsets_dir= NULL;
522
523
524
static my_bool
525
my_read_charset_file(MY_CHARSET_LOADER *loader,
526
                     const char *filename,
527
                     myf myflags)
528
0
{
529
0
  uchar *buf;
530
0
  int  fd;
531
0
  size_t len, tmp_len;
532
0
  MY_STAT stat_info;
533
  
534
0
  if (!my_stat(filename, &stat_info, MYF(myflags)) ||
535
0
       ((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
536
0
       !(buf= (uchar*) my_malloc(key_memory_charset_loader,len,myflags)))
537
0
    return TRUE;
538
  
539
0
  if ((fd= mysql_file_open(key_file_charset, filename, O_RDONLY, myflags)) < 0)
540
0
    goto error;
541
0
  tmp_len= mysql_file_read(fd, buf, len, myflags);
542
0
  mysql_file_close(fd, myflags);
543
0
  if (tmp_len != len)
544
0
    goto error;
545
  
546
0
  if (my_parse_charset_xml(loader, (char *) buf, len))
547
0
  {
548
0
    my_printf_error(EE_UNKNOWN_CHARSET, "Error while parsing '%s': %s\n",
549
0
                    MYF(0), filename, loader->error);
550
0
    goto error;
551
0
  }
552
  
553
0
  my_free(buf);
554
0
  return FALSE;
555
556
0
error:
557
0
  my_free(buf);
558
0
  return TRUE;
559
0
}
560
561
562
char *get_charsets_dir(char *buf)
563
0
{
564
0
  const char *sharedir= SHAREDIR;
565
0
  char *res;
566
0
  DBUG_ENTER("get_charsets_dir");
567
568
0
  if (charsets_dir != NULL)
569
0
    strmake(buf, charsets_dir, FN_REFLEN-1);
570
0
  else
571
0
  {
572
0
    if (test_if_hard_path(sharedir) ||
573
0
  is_prefix(sharedir, DEFAULT_CHARSET_HOME))
574
0
      strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
575
0
    else
576
0
      strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
577
0
        NullS);
578
0
  }
579
0
  res= convert_dirname(buf,buf,NullS);
580
0
  DBUG_PRINT("info",("charsets dir: '%s'", buf));
581
0
  DBUG_RETURN(res);
582
0
}
583
584
CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE]={NULL};
585
CHARSET_INFO *default_charset_info = &my_charset_latin1;
586
587
588
/*
589
  Add standard character set compiled into the application
590
  All related character sets should share the same csname
591
*/
592
593
int add_compiled_collation(struct charset_info_st *cs)
594
0
{
595
0
  DBUG_ASSERT(cs->number < array_elements(all_charsets));
596
0
  all_charsets[cs->number]= cs;
597
0
  cs->state|= MY_CS_AVAILABLE;
598
0
  if ((my_hash_insert(&charset_name_hash, (uchar*) cs)))
599
0
  {
600
#ifndef DBUG_OFF
601
    CHARSET_INFO *org= (CHARSET_INFO*) my_hash_search(&charset_name_hash,
602
                                                      (uchar*) cs->cs_name.str,
603
                                                      cs->cs_name.length);
604
    DBUG_ASSERT(org);
605
    DBUG_ASSERT(org->cs_name.str == cs->cs_name.str);
606
    DBUG_ASSERT(org->cs_name.length == strlen(cs->cs_name.str));
607
#endif
608
0
  }
609
0
  if (cs->coll_name.str)
610
0
    my_hash_insert(&collation_name_hash, (uchar*) cs);
611
0
  return 0;
612
0
}
613
614
615
/*
616
  Add optional character sets from ctype-extra.c
617
618
  If csname is already in use, replace csname in the new object with a pointer to
619
  the already used csname to ensure that all csnames point to the same string
620
  for the same character set.
621
*/
622
623
624
void add_compiled_extra_collation(struct charset_info_st *cs)
625
0
{
626
0
  DBUG_ASSERT(cs->number < array_elements(all_charsets));
627
0
  all_charsets[cs->number]= cs;
628
0
  cs->state|= MY_CS_AVAILABLE;
629
0
  if ((my_hash_insert(&charset_name_hash, (uchar*) cs)))
630
0
  {
631
0
    CHARSET_INFO *org= (CHARSET_INFO*) my_hash_search(&charset_name_hash,
632
0
                                                      (uchar*) cs->cs_name.str,
633
0
                                                      cs->cs_name.length);
634
0
    cs->cs_name= org->cs_name;
635
0
  }
636
0
  if (cs->coll_name.str)
637
0
    my_hash_insert(&collation_name_hash, (uchar*) cs);
638
0
}
639
640
641
static my_pthread_once_t charsets_initialized= MY_PTHREAD_ONCE_INIT;
642
static my_pthread_once_t charsets_template= MY_PTHREAD_ONCE_INIT;
643
644
typedef struct
645
{
646
  ulonglong use_count;
647
} MY_COLLATION_STATISTICS;
648
649
650
static MY_COLLATION_STATISTICS my_collation_statistics[MY_ALL_CHARSETS_SIZE];
651
652
653
/**
  Check whether a collation ID refers to a registered collation.

  @param id  Collation ID.
  @return    TRUE if id is non-zero, within all_charsets[] and the slot
             is not empty; FALSE otherwise.
*/
my_bool my_collation_is_known_id(uint id)
{
  if (id == 0 || id >= array_elements(all_charsets))
    return FALSE;
  return all_charsets[id] ? TRUE : FALSE;
}
658
659
660
/*
661
  Collation use statistics functions do not lock
662
  counters to avoid mutex contention. This can lose
663
  some counter increments with high thread concurrency.
664
  But this should be Ok, as we don't need exact numbers.
665
*/
666
/* Add one to the use counter of a known collation ID (no locking) */
static inline void my_collation_statistics_inc_use_count(uint id)
{
  MY_COLLATION_STATISTICS *stats= &my_collation_statistics[id];
  DBUG_ASSERT(my_collation_is_known_id(id));
  stats->use_count++;
}
671
672
673
/**
  Read the use counter of a collation.

  @param id  Collation ID; must be a known ID (checked by assertion only).
  @return    The current counter value.
*/
ulonglong my_collation_statistics_get_use_count(uint id)
{
  const MY_COLLATION_STATISTICS *stats= &my_collation_statistics[id];
  DBUG_ASSERT(my_collation_is_known_id(id));
  return stats->use_count;
}
678
679
680
/**
  Return the tailoring string of a collation.

  @param id  Collation ID; must be a known ID (checked by assertion only).
  @return    all_charsets[id]->tailoring, which may be NULL.
*/
const char *my_collation_get_tailoring(uint id)
{
  CHARSET_INFO *cs;

  DBUG_ASSERT(my_collation_is_known_id(id));
  /* all_charsets[id]->tailoring is never changed after server startup. */
  cs= all_charsets[id];
  return cs->tailoring;
}
686
687
688
static const uchar *get_charset_key(const void *object, size_t *size,
689
                                    my_bool not_used __attribute__((unused)))
690
0
{
691
0
  CHARSET_INFO *cs= object;
692
0
  *size= cs->cs_name.length;
693
0
  return (const uchar*) cs->cs_name.str;
694
0
}
695
696
static const uchar *get_collation_key(const void *object, size_t *length,
697
                                      my_bool not_used __attribute__((unused)))
698
0
{
699
0
  CHARSET_INFO *cs= (CHARSET_INFO*) object;
700
0
  *length= cs->coll_name.length;
701
0
  return (const uchar*) cs->coll_name.str;
702
0
}
703
704
static void init_available_charsets(void)
705
0
{
706
0
  char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
707
0
  struct charset_info_st **cs;
708
0
  MY_CHARSET_LOADER loader;
709
0
  DBUG_ENTER("init_available_charsets");
710
711
0
  bzero((char*) &all_charsets,sizeof(all_charsets));
712
0
  bzero((char*) &my_collation_statistics, sizeof(my_collation_statistics));
713
714
0
  my_hash_init2(key_memory_charsets, &charset_name_hash, 16,
715
0
                &my_charset_latin1, 64, 0, 0, get_charset_key,
716
0
                0, 0, HASH_UNIQUE);
717
718
0
  my_hash_init2(key_memory_charsets, &collation_name_hash, 16,
719
0
                &my_charset_latin1, 64, 0, 0, get_collation_key,
720
0
                0, 0, HASH_UNIQUE);
721
722
0
  init_compiled_charsets(MYF(0));
723
724
  /* Copy compiled charsets */
725
0
  for (cs= (struct charset_info_st**) all_charsets;
726
0
       cs < (struct charset_info_st**) all_charsets +
727
0
            array_elements(all_charsets)-1 ;
728
0
       cs++)
729
0
  {
730
0
    if (*cs)
731
0
    {
732
0
      DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
733
0
      if (cs[0]->m_ctype && !cs[0]->state_map)
734
0
        if (init_state_maps(*cs))
735
0
          *cs= NULL;
736
0
    }
737
0
  }
738
739
0
  my_charset_loader_init_mysys(&loader);
740
0
  strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
741
0
  my_read_charset_file(&loader, fname, MYF(0));
742
0
  DBUG_VOID_RETURN;
743
0
}
744
745
746
void free_charsets(void)
747
0
{
748
0
  charsets_initialized= charsets_template;
749
0
  my_hash_free(&charset_name_hash);
750
0
  my_hash_free(&collation_name_hash);
751
0
}
752
753
754
static const char*
755
get_collation_name_alias(const char *name, char *buf, size_t bufsize, myf flags)
756
0
{
757
0
  if (!strncasecmp(name, "utf8_", 5))
758
0
  {
759
0
    my_snprintf(buf, bufsize, "utf8mb%c_%s",
760
0
       flags & MY_UTF8_IS_UTF8MB3 ? '3' : '4', name + 5);
761
0
    return buf;
762
0
  }
763
0
  return NULL;
764
0
}
765
766
767
/**
  Find a collation ID by collation name.

  The name is first looked up as given. If that fails and the name has a
  "utf8_" alias form, the aliased name is looked up instead.

  @param name   Collation name.
  @param flags  Passed to get_collation_name_alias().
  @return       The collation ID, or 0 if the collation is not found.
*/
uint get_collation_number(const char *name, myf flags)
{
  uint id;
  char alias[64];
  const char *alias_name;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  id= get_collation_number_internal(name);
  if (id)
    return id;

  alias_name= get_collation_name_alias(name, alias, sizeof(alias),flags);
  if (!alias_name)
    return 0;
  return get_collation_number_internal(alias_name);
}
778
779
780
static uint
781
get_charset_number_internal(const char *charset_name, uint cs_flags)
782
0
{
783
0
  CHARSET_INFO **cs;
784
  
785
0
  for (cs= all_charsets;
786
0
       cs < all_charsets + array_elements(all_charsets);
787
0
       cs++)
788
0
  {
789
0
    if ( cs[0] && cs[0]->cs_name.str && (cs[0]->state & cs_flags) &&
790
0
         !my_strcasecmp_latin1(cs[0]->cs_name.str, charset_name))
791
0
      return cs[0]->number;
792
0
  }  
793
0
  return 0;
794
0
}
795
796
797
/**
  Find a collation ID by character set name and state flags.

  If the name is not found as given and it is "utf8" (compared case
  insensitively), the lookup is repeated with "utf8mb3" or "utf8mb4",
  depending on MY_UTF8_IS_UTF8MB3 in flags.

  @param charset_name  Character set name.
  @param cs_flags      State flags passed to the internal lookup.
  @param flags         MY_UTF8_IS_UTF8MB3 selects the meaning of "utf8".
  @return              The collation ID, or 0 if nothing matches.
*/
uint get_charset_number(const char *charset_name, uint cs_flags, myf flags)
{
  uint id;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  id= get_charset_number_internal(charset_name, cs_flags);
  if (id)
    return id;

  /* Only "utf8" has an alias */
  if (my_strcasecmp_latin1(charset_name, "utf8"))
    return 0;

  if (flags & MY_UTF8_IS_UTF8MB3)
    return get_charset_number_internal("utf8mb3", cs_flags);
  return get_charset_number_internal("utf8mb4", cs_flags);
}
810
                  
811
812
/**
  Return the name stored for a collation ID.

  Note that the string returned is the collation name (coll_name) of the
  entry.

  @param charset_number  Collation ID.
  @return                The collation name, or "?" if the ID is out of
                         range, the slot is empty, the entry carries a
                         different ID, or it has no collation name.
*/
const char *get_charset_name(uint charset_number)
{
  CHARSET_INFO *cs;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  if (charset_number >= array_elements(all_charsets))
    return "?";   /* this mimics find_type() */

  cs= all_charsets[charset_number];
  if (!cs || cs->number != charset_number || !cs->coll_name.str)
    return "?";   /* this mimics find_type() */

  return cs->coll_name.str;
}
826
827
828
/**
  Get the collation to inherit data from, by its ID.

  @param cs     The collation that wants to inherit.
  @param refid  ID of the candidate source. NOTE(review): it is used as an
                index into all_charsets[] without a range check here.
  @return       all_charsets[refid] if refid is non-zero, differs from
                cs->number, and the entry exists and is flagged
                MY_CS_AVAILABLE; NULL otherwise.
*/
static CHARSET_INFO *inheritance_source_by_id(CHARSET_INFO *cs, uint refid)
{
  CHARSET_INFO *refcs;

  /* No source given, or the collation would inherit from itself */
  if (!refid || refid == cs->number)
    return NULL;

  refcs= all_charsets[refid];
  if (!refcs || !(refcs->state & MY_CS_AVAILABLE))
    return NULL;
  return refcs;
}
835
836
837
/**
  Find the collation named in an "[import NAME]" tailoring prefix.

  @param cs     Collation whose tailoring is examined.
  @param flags  Passed to get_collation_number().
  @return       The collation to inherit from, or NULL if the tailoring is
                not set, does not start with "[import ", has no closing
                ']', names a collation of MY_CS_COLLATION_NAME_SIZE or
                more characters, or names a collation that is not a valid
                inheritance source.
*/
static CHARSET_INFO *find_collation_data_inheritance_source(CHARSET_INFO *cs, myf flags)
{
  const char *beg, *end;
  size_t name_length;
  char name[MY_CS_COLLATION_NAME_SIZE + 1];

  if (!cs->tailoring || strncmp(cs->tailoring, "[import ", 8))
    return NULL;

  beg= cs->tailoring + 8;
  if (!(end= strchr(beg, ']')))
    return NULL;

  /*
    Compare lengths rather than computing beg + MY_CS_COLLATION_NAME_SIZE:
    that pointer could lie beyond the end of a short tailoring string.
  */
  name_length= (size_t) (end - beg);
  if (name_length >= (size_t) (MY_CS_COLLATION_NAME_SIZE))
    return NULL;

  memcpy(name, beg, name_length);
  name[name_length]= '\0';
  return inheritance_source_by_id(cs, get_collation_number(name,MYF(flags)));
}
852
853
854
/**
  Find the collation to inherit character set data from: the entry of the
  same character set name that is flagged MY_CS_PRIMARY.

  @param cs  Collation that lacks character set data.
  @return    The source collation, or NULL if there is no usable one.
*/
static CHARSET_INFO *find_charset_data_inheritance_source(CHARSET_INFO *cs)
{
  const char *csname= cs->cs_name.str;
  uint primary_id= get_charset_number_internal(csname, MY_CS_PRIMARY);

  return inheritance_source_by_id(cs, primary_id);
}
859
860
861
static CHARSET_INFO *
862
get_internal_charset(MY_CHARSET_LOADER *loader, uint cs_number, myf flags)
863
0
{
864
0
  char  buf[FN_REFLEN];
865
0
  struct charset_info_st *cs;
866
867
0
  DBUG_ASSERT(cs_number < array_elements(all_charsets));
868
869
0
  if ((cs= (struct charset_info_st*) all_charsets[cs_number]))
870
0
  {
871
0
    if (cs->state & MY_CS_READY)  /* if CS is already initialized */
872
0
    {
873
0
      my_collation_statistics_inc_use_count(cs_number);
874
0
      return cs;
875
0
    }
876
877
    /*
878
      To make things thread safe we are not allowing other threads to interfere
879
      while we may changing the cs_info_table
880
    */
881
0
    mysql_mutex_lock(&THR_LOCK_charset);
882
883
0
    if (!(cs->state & (MY_CS_COMPILED|MY_CS_LOADED))) /* if CS is not in memory */
884
0
    {
885
0
      MY_CHARSET_LOADER loader;
886
0
      strxmov(get_charsets_dir(buf), cs->cs_name.str, ".xml", NullS);
887
0
      my_charset_loader_init_mysys(&loader);
888
0
      my_read_charset_file(&loader, buf, flags);
889
0
    }
890
891
0
    if (cs->state & MY_CS_AVAILABLE)
892
0
    {
893
0
      if (!(cs->state & MY_CS_READY))
894
0
      {
895
0
        if (!simple_8bit_charset_data_is_full(cs))
896
0
        {
897
0
          CHARSET_INFO *refcs= find_charset_data_inheritance_source(cs);
898
0
          if (refcs)
899
0
            inherit_charset_data(cs, refcs);
900
0
        }
901
0
        if (!simple_8bit_collation_data_is_full(cs))
902
0
        {
903
0
          CHARSET_INFO *refcl= find_collation_data_inheritance_source(cs, flags);
904
0
          if (refcl)
905
0
            inherit_collation_data(cs, refcl);
906
0
        }
907
908
0
        if (my_ci_init_charset(cs, loader) ||
909
0
            my_ci_init_collation(cs, loader))
910
0
        {
911
0
          cs= NULL;
912
0
        }
913
0
        else
914
0
          cs->state|= MY_CS_READY;
915
0
      }
916
0
      my_collation_statistics_inc_use_count(cs_number);
917
0
    }
918
0
    else
919
0
      cs= NULL;
920
921
0
    mysql_mutex_unlock(&THR_LOCK_charset);
922
0
  }
923
0
  return cs;
924
0
}
925
926
927
/*
  Find a collation by its numeric ID.

  The default charset is returned directly when cs_number matches its
  number, before charset initialization is triggered. For any other ID
  within the all_charsets array the collation is fetched (and initialized
  if needed) through get_internal_charset().

  Returns NULL if nothing was found; with MY_WME set in flags,
  EE_UNKNOWN_CHARSET is reported with the ID formatted as "#<number>".
*/
CHARSET_INFO *get_charset(uint cs_number, myf flags)
{
  CHARSET_INFO *cs= NULL;
  char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
  char cs_string[23];

  if (default_charset_info->number == cs_number)
    return default_charset_info;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  if (cs_number < array_elements(all_charsets))
  {
    MY_CHARSET_LOADER loader;
    my_charset_loader_init_mysys(&loader);
    cs= get_internal_charset(&loader, cs_number, flags);
  }

  if (cs || !(flags & MY_WME))
    return cs;

  /* Not found and the caller asked for an error message */
  strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
  cs_string[0]='#';
  int10_to_str(cs_number, cs_string+1, 10);
  my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
  return cs;
}
953
954
955
/**
956
  Find collation by name: extended version of get_charset_by_name()
957
  to return error messages to the caller.
958
  @param   loader  Character set loader
959
  @param   name    Collation name
960
  @param   flags   Flags
961
  @return          NULL on error, pointer to collation on success
962
*/
963
964
CHARSET_INFO *
965
my_collation_get_by_name(MY_CHARSET_LOADER *loader,
966
                         const char *name, myf flags)
967
0
{
968
0
  uint cs_number;
969
0
  CHARSET_INFO *cs;
970
0
  my_pthread_once(&charsets_initialized, init_available_charsets);
971
972
0
  cs_number= get_collation_number(name,flags);
973
0
  my_charset_loader_init_mysys(loader);
974
0
  cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
975
976
0
  if (!cs && (flags & MY_WME))
977
0
  {
978
0
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
979
0
    strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
980
0
    my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), name, index_file);
981
0
  }
982
0
  return cs;
983
0
}
984
985
986
/*
  Find a collation by its name using a local mysys charset loader.
  Thin wrapper around my_collation_get_by_name(); returns NULL on error.
*/
CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
{
  MY_CHARSET_LOADER mysys_loader;

  my_charset_loader_init_mysys(&mysys_loader);
  return my_collation_get_by_name(&mysys_loader, cs_name, flags);
}
992
993
994
/**
995
  Find character set by name: extended version of get_charset_by_csname()
996
  to return error messages to the caller.
997
  @param   loader   Character set loader
998
  @param   name     Collation name
999
  @param   cs_flags Character set flags (e.g. default or binary collation)
1000
  @param   flags    Flags
1001
  @return           NULL on error, pointer to collation on success
1002
*/
1003
CHARSET_INFO *
1004
my_charset_get_by_name(MY_CHARSET_LOADER *loader,
1005
                       const char *cs_name, uint cs_flags, myf flags)
1006
0
{
1007
0
  uint cs_number;
1008
0
  CHARSET_INFO *cs;
1009
0
  DBUG_ENTER("get_charset_by_csname");
1010
0
  DBUG_PRINT("enter",("name: '%s'", cs_name));
1011
1012
0
  my_pthread_once(&charsets_initialized, init_available_charsets);
1013
1014
0
  cs_number= get_charset_number(cs_name, cs_flags, flags);
1015
0
  cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
1016
1017
0
  if (!cs && (flags & MY_WME))
1018
0
  {
1019
0
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
1020
0
    strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
1021
0
    my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
1022
0
  }
1023
1024
0
  DBUG_RETURN(cs);
1025
0
}
1026
1027
1028
CHARSET_INFO *
1029
get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
1030
0
{
1031
0
  MY_CHARSET_LOADER loader;
1032
0
  my_charset_loader_init_mysys(&loader);
1033
0
  return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
1034
0
}
1035
1036
1037
/**
1038
  Resolve character set by the character set name (utf8, latin1, ...).
1039
1040
  The function tries to resolve character set by the specified name. If
1041
  there is character set with the given name, it is assigned to the "cs"
1042
  parameter and FALSE is returned. If there is no such character set,
1043
  "default_cs" is assigned to the "cs" and TRUE is returned.
1044
1045
  @param[in] cs_name    Character set name.
1046
  @param[in] default_cs Default character set.
1047
  @param[out] cs        Variable to store character set.
1048
1049
  @return FALSE if character set was resolved successfully; TRUE if there
1050
  is no character set with given name.
1051
*/
1052
1053
my_bool resolve_charset(const char *cs_name,
1054
                        CHARSET_INFO *default_cs,
1055
                        CHARSET_INFO **cs,
1056
                        myf flags)
1057
0
{
1058
0
  *cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, flags);
1059
1060
0
  if (*cs == NULL)
1061
0
  {
1062
0
    *cs= default_cs;
1063
0
    return TRUE;
1064
0
  }
1065
1066
0
  return FALSE;
1067
0
}
1068
1069
1070
/**
1071
  Resolve collation by the collation name (utf8_general_ci, ...).
1072
1073
  The function tries to resolve collation by the specified name. If there
1074
  is collation with the given name, it is assigned to the "cl" parameter
1075
  and FALSE is returned. If there is no such collation, "default_cl" is
1076
  assigned to the "cl" and TRUE is returned.
1077
1078
  @param[out] cl        Variable to store collation.
1079
  @param[in] cl_name    Collation name.
1080
  @param[in] default_cl Default collation.
1081
1082
  @return FALSE if collation was resolved successfully; TRUE if there is no
1083
  collation with given name.
1084
*/
1085
1086
my_bool resolve_collation(const char *cl_name,
1087
                          CHARSET_INFO *default_cl,
1088
                          CHARSET_INFO **cl,
1089
                          myf my_flags)
1090
0
{
1091
0
  *cl= get_charset_by_name(cl_name, my_flags);
1092
1093
0
  if (*cl == NULL)
1094
0
  {
1095
0
    *cl= default_cl;
1096
0
    return TRUE;
1097
0
  }
1098
1099
0
  return FALSE;
1100
0
}
1101
1102
1103
/*
1104
  Escape string with backslashes (\)
1105
1106
  SYNOPSIS
1107
    escape_string_for_mysql()
1108
    charset_info        Charset of the strings
1109
    to                  Buffer for escaped string
1110
    to_length           Length of destination buffer, or 0
1111
    from                The string to escape
1112
    length              The length of the string to escape
1113
    overflow            Set to 1 if the escaped string did not fit in
1114
                        the to buffer
1115
1116
  DESCRIPTION
1117
    This escapes the contents of a string by adding backslashes before special
1118
    characters, and turning others into specific escape sequences, such as
1119
    turning newlines into \n and null bytes into \0.
1120
1121
  NOTE
1122
    To maintain compatibility with the old C API, to_length may be 0 to mean
1123
    "big enough"
1124
1125
  RETURN VALUES
1126
    #           The length of the escaped string
1127
*/
1128
1129
size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
1130
                               char *to, size_t to_length,
1131
                               const char *from, size_t length,
1132
                               my_bool *overflow)
1133
0
{
1134
0
  const char *to_start= to;
1135
0
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1136
0
  *overflow= FALSE;
1137
0
  for (end= from + length; from < end; from++)
1138
0
  {
1139
0
    char escape= 0;
1140
0
#ifdef USE_MB
1141
0
    int tmp_length= my_ci_charlen(charset_info, (const uchar *) from, (const uchar *) end);
1142
0
    if (tmp_length > 1)
1143
0
    {
1144
0
      if (to + tmp_length > to_end)
1145
0
      {
1146
0
        *overflow= TRUE;
1147
0
        break;
1148
0
      }
1149
0
      while (tmp_length--)
1150
0
  *to++= *from++;
1151
0
      from--;
1152
0
      continue;
1153
0
    }
1154
    /*
1155
     If the next character appears to begin a multi-byte character, we
1156
     escape that first byte of that apparent multi-byte character. (The
1157
     character just looks like a multi-byte character -- if it were actually
1158
     a multi-byte character, it would have been passed through in the test
1159
     above.)
1160
1161
     Without this check, we can create a problem by converting an invalid
1162
     multi-byte character into a valid one. For example, 0xbf27 is not
1163
     a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
1164
    */
1165
0
    if (tmp_length < 1) /* Bad byte sequence */
1166
0
      escape= *from;
1167
0
    else
1168
0
#endif
1169
0
    switch (*from) {
1170
0
    case 0:       /* Must be escaped for 'mysql' */
1171
0
      escape= '0';
1172
0
      break;
1173
0
    case '\n':        /* Must be escaped for logs */
1174
0
      escape= 'n';
1175
0
      break;
1176
0
    case '\r':
1177
0
      escape= 'r';
1178
0
      break;
1179
0
    case '\\':
1180
0
      escape= '\\';
1181
0
      break;
1182
0
    case '\'':
1183
0
      escape= '\'';
1184
0
      break;
1185
0
    case '"':       /* Better safe than sorry */
1186
0
      escape= '"';
1187
0
      break;
1188
0
    case '\032':      /* This gives problems on Win32 */
1189
0
      escape= 'Z';
1190
0
      break;
1191
0
    }
1192
0
    if (escape)
1193
0
    {
1194
0
      if (to + 2 > to_end)
1195
0
      {
1196
0
        *overflow= TRUE;
1197
0
        break;
1198
0
      }
1199
0
      *to++= '\\';
1200
0
      *to++= escape;
1201
0
    }
1202
0
    else
1203
0
    {
1204
0
      if (to + 1 > to_end)
1205
0
      {
1206
0
        *overflow= TRUE;
1207
0
        break;
1208
0
      }
1209
0
      *to++= *from;
1210
0
    }
1211
0
  }
1212
0
  *to= 0;
1213
0
  return (size_t) (to - to_start);
1214
0
}
1215
1216
1217
#ifdef BACKSLASH_MBTAIL
/*
  Return the character set used for file system names. The choice is made
  on the first call and cached: cp932 when it is compiled in and the
  ANSI code page is 932, the binary character set otherwise.
*/
CHARSET_INFO *fs_character_set()
{
  static CHARSET_INFO *fs_cset_cache;

  if (!fs_cset_cache)
  {
#ifdef HAVE_CHARSET_cp932
    if (GetACP() == 932)
      fs_cset_cache= &my_charset_cp932_japanese_ci;
    else
#endif
      fs_cset_cache= &my_charset_bin;
  }
  return fs_cset_cache;
}
#endif
1231
1232
/*
1233
  Escape apostrophes by doubling them up
1234
1235
  SYNOPSIS
1236
    escape_quotes_for_mysql()
1237
    charset_info        Charset of the strings
1238
    to                  Buffer for escaped string
1239
    to_length           Length of destination buffer, or 0
1240
    from                The string to escape
1241
    length              The length of the string to escape
1242
    overflow            Set to 1 if the buffer overflows
1243
1244
  DESCRIPTION
1245
    This escapes the contents of a string by doubling up any apostrophes that
1246
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
1247
    effect on the server.
1248
1249
  NOTE
1250
    To be consistent with escape_string_for_mysql(), to_length may be 0 to
1251
    mean "big enough"
1252
1253
  RETURN VALUES
1254
     The length of the escaped string
1255
*/
1256
1257
size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
1258
                               char *to, size_t to_length,
1259
                               const char *from, size_t length,
1260
                               my_bool *overflow)
1261
0
{
1262
0
  const char *to_start= to;
1263
0
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1264
0
#ifdef USE_MB
1265
0
  my_bool use_mb_flag= my_ci_use_mb(charset_info);
1266
0
#endif
1267
0
  *overflow= FALSE;
1268
0
  for (end= from + length; from < end; from++)
1269
0
  {
1270
0
#ifdef USE_MB
1271
0
    int tmp_length;
1272
0
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
1273
0
    {
1274
0
      if (to + tmp_length > to_end)
1275
0
      {
1276
0
        *overflow= TRUE;
1277
0
        break;
1278
0
      }
1279
0
      while (tmp_length--)
1280
0
  *to++= *from++;
1281
0
      from--;
1282
0
      continue;
1283
0
    }
1284
    /*
1285
      We don't have the same issue here with a non-multi-byte character being
1286
      turned into a multi-byte character by the addition of an escaping
1287
      character, because we are only escaping the ' character with itself.
1288
     */
1289
0
#endif
1290
0
    if (*from == '\'')
1291
0
    {
1292
0
      if (to + 2 > to_end)
1293
0
      {
1294
0
        *overflow= TRUE;
1295
0
        break;
1296
0
      }
1297
0
      *to++= '\'';
1298
0
      *to++= '\'';
1299
0
    }
1300
0
    else
1301
0
    {
1302
0
      if (to + 1 > to_end)
1303
0
      {
1304
0
        *overflow= TRUE;
1305
0
        break;
1306
0
      }
1307
0
      *to++= *from;
1308
0
    }
1309
0
  }
1310
0
  *to= 0;
1311
0
  return (size_t) (to - to_start);
1312
0
}
1313
1314
1315
/* How well an OS character set name corresponds to a MySQL one */
typedef enum my_cs_match_type_enum
{
  /* MySQL and OS charsets are fully compatible */
  my_cs_exact,
  /* MySQL charset is very close to OS charset  */
  my_cs_approx,
  /*
    MySQL knows this charset, but it is not supported as client character set.
  */
  my_cs_unsupp
} my_cs_match_type;


/* One row of the OS charset name -> MySQL charset name mapping */
typedef struct str2str_st
{
  const char* os_name;          /* Name as reported by the OS */
  const char* my_name;          /* Corresponding MySQL charset name */
  my_cs_match_type param;       /* Quality of the correspondence */
} MY_CSET_OS_NAME;

/*
  Mapping between OS character set names and MySQL character set names,
  terminated by an entry with a NULL os_name. On Windows the OS names
  have the form "cpNNNN"; elsewhere they are locale codeset names.
*/
static const MY_CSET_OS_NAME charsets[] =
{
#ifdef _WIN32
  {"cp437",          "cp850",    my_cs_approx},
  {"cp850",          "cp850",    my_cs_exact},
  {"cp852",          "cp852",    my_cs_exact},
  {"cp858",          "cp850",    my_cs_approx},
  {"cp866",          "cp866",    my_cs_exact},
  {"cp874",          "tis620",   my_cs_approx},
  {"cp932",          "cp932",    my_cs_exact},
  {"cp936",          "gbk",      my_cs_approx},
  {"cp949",          "euckr",    my_cs_approx},
  {"cp950",          "big5",     my_cs_exact},
  {"cp1200",         "utf16le",  my_cs_unsupp},
  {"cp1201",         "utf16",    my_cs_unsupp},
  {"cp1250",         "cp1250",   my_cs_exact},
  {"cp1251",         "cp1251",   my_cs_exact},
  {"cp1252",         "latin1",   my_cs_exact},
  {"cp1253",         "greek",    my_cs_exact},
  {"cp1254",         "latin5",   my_cs_exact},
  {"cp1255",         "hebrew",   my_cs_approx},
  {"cp1256",         "cp1256",   my_cs_exact},
  {"cp1257",         "cp1257",   my_cs_exact},
  {"cp10000",        "macroman", my_cs_exact},
  {"cp10001",        "sjis",     my_cs_approx},
  {"cp10002",        "big5",     my_cs_approx},
  {"cp10008",        "gb2312",   my_cs_approx},
  {"cp10021",        "tis620",   my_cs_approx},
  {"cp10029",        "macce",    my_cs_exact},
  {"cp12001",        "utf32",    my_cs_unsupp},
  {"cp20107",        "swe7",     my_cs_exact},
  {"cp20127",        "latin1",   my_cs_approx},
  {"cp20866",        "koi8r",    my_cs_exact},
  {"cp20932",        "ujis",     my_cs_exact},
  {"cp20936",        "gb2312",   my_cs_approx},
  {"cp20949",        "euckr",    my_cs_approx},
  {"cp21866",        "koi8u",    my_cs_exact},
  {"cp28591",        "latin1",   my_cs_approx},
  {"cp28592",        "latin2",   my_cs_exact},
  {"cp28597",        "greek",    my_cs_exact},
  {"cp28598",        "hebrew",   my_cs_exact},
  {"cp28599",        "latin5",   my_cs_exact},
  {"cp28603",        "latin7",   my_cs_exact},
#ifdef UNCOMMENT_THIS_WHEN_WL_4579_IS_DONE
  {"cp28605",        "latin9",   my_cs_exact},
#endif
  {"cp38598",        "hebrew",   my_cs_exact},
  {"cp51932",        "ujis",     my_cs_exact},
  {"cp51936",        "gb2312",   my_cs_exact},
  {"cp51949",        "euckr",    my_cs_exact},
  {"cp51950",        "big5",     my_cs_exact},
#ifdef UNCOMMENT_THIS_WHEN_WL_WL_4024_IS_DONE
  {"cp54936",        "gb18030",  my_cs_exact},
#endif
  {"cp65001",        "utf8mb4",  my_cs_exact},
  {"cp65001",        "utf8mb3",  my_cs_approx},
#else /* not Windows */

  {"646",            "latin1",   my_cs_approx}, /* Default on Solaris */
  {"ANSI_X3.4-1968", "latin1",   my_cs_approx},
  {"ansi1251",       "cp1251",   my_cs_exact},
  {"armscii8",       "armscii8", my_cs_exact},
  {"armscii-8",      "armscii8", my_cs_exact},
  {"ASCII",          "latin1",   my_cs_approx},
  {"Big5",           "big5",     my_cs_exact},
  {"cp1251",         "cp1251",   my_cs_exact},
  {"cp1255",         "hebrew",   my_cs_approx},
  {"CP866",          "cp866",    my_cs_exact},
  {"eucCN",          "gb2312",   my_cs_exact},
  {"euc-CN",         "gb2312",   my_cs_exact},
  {"eucJP",          "ujis",     my_cs_exact},
  {"euc-JP",         "ujis",     my_cs_exact},
  {"eucKR",          "euckr",    my_cs_exact},
  {"euc-KR",         "euckr",    my_cs_exact},
#ifdef UNCOMMENT_THIS_WHEN_WL_WL_4024_IS_DONE
  {"gb18030",        "gb18030",  my_cs_exact},
#endif
  {"gb2312",         "gb2312",   my_cs_exact},
  {"gbk",            "gbk",      my_cs_exact},
  {"georgianps",     "geostd8",  my_cs_exact},
  {"georgian-ps",    "geostd8",  my_cs_exact},
  /*
    There is no MySQL charset called "cp1252"; code page 1252 is served
    by "latin1", the same target the Windows "cp1252" entry uses.
  */
  {"IBM-1252",       "latin1",   my_cs_exact},

  {"iso88591",       "latin1",   my_cs_approx},
  {"ISO_8859-1",     "latin1",   my_cs_approx},
  {"ISO8859-1",      "latin1",   my_cs_approx},
  {"ISO-8859-1",     "latin1",   my_cs_approx},

  {"iso885913",      "latin7",   my_cs_exact},
  {"ISO_8859-13",    "latin7",   my_cs_exact},
  {"ISO8859-13",     "latin7",   my_cs_exact},
  {"ISO-8859-13",    "latin7",   my_cs_exact},

#ifdef UNCOMMENT_THIS_WHEN_WL_4579_IS_DONE
  {"iso885915",      "latin9",   my_cs_exact},
  {"ISO_8859-15",    "latin9",   my_cs_exact},
  {"ISO8859-15",     "latin9",   my_cs_exact},
  {"ISO-8859-15",    "latin9",   my_cs_exact},
#endif

  {"iso88592",       "latin2",   my_cs_exact},
  {"ISO_8859-2",     "latin2",   my_cs_exact},
  {"ISO8859-2",      "latin2",   my_cs_exact},
  {"ISO-8859-2",     "latin2",   my_cs_exact},

  {"iso88597",       "greek",    my_cs_exact},
  {"ISO_8859-7",     "greek",    my_cs_exact},
  {"ISO8859-7",      "greek",    my_cs_exact},
  {"ISO-8859-7",     "greek",    my_cs_exact},

  {"iso88598",       "hebrew",   my_cs_exact},
  {"ISO_8859-8",     "hebrew",   my_cs_exact},
  {"ISO8859-8",      "hebrew",   my_cs_exact},
  {"ISO-8859-8",     "hebrew",   my_cs_exact},

  {"iso88599",       "latin5",   my_cs_exact},
  {"ISO_8859-9",     "latin5",   my_cs_exact},
  {"ISO8859-9",      "latin5",   my_cs_exact},
  {"ISO-8859-9",     "latin5",   my_cs_exact},

  {"koi8r",          "koi8r",    my_cs_exact},
  {"KOI8-R",         "koi8r",    my_cs_exact},
  {"koi8u",          "koi8u",    my_cs_exact},
  {"KOI8-U",         "koi8u",    my_cs_exact},

  {"roman8",         "hp8",      my_cs_exact}, /* Default on HP UX */

  {"Shift_JIS",      "sjis",     my_cs_exact},
  {"SJIS",           "sjis",     my_cs_exact},
  {"shiftjisx0213",  "sjis",     my_cs_exact},

  {"tis620",         "tis620",   my_cs_exact},
  {"tis-620",        "tis620",   my_cs_exact},

  {"ujis",           "ujis",     my_cs_exact},

  {"US-ASCII",       "latin1",   my_cs_approx},

  {"utf8",           "utf8mb4",  my_cs_exact},
  {"utf-8",          "utf8mb4",  my_cs_exact},
#endif
  {NULL,             NULL,       0}
};
1478
1479
1480
static const char*
1481
my_os_charset_to_mysql_charset(const char* csname)
1482
0
{
1483
0
  const MY_CSET_OS_NAME* csp;
1484
0
  for (csp = charsets; csp->os_name; csp++)
1485
0
  {
1486
0
    if (!strcasecmp(csp->os_name, csname))
1487
0
    {
1488
0
      switch (csp->param)
1489
0
      {
1490
0
      case my_cs_exact:
1491
0
        return csp->my_name;
1492
1493
0
      case my_cs_approx:
1494
        /*
1495
          Maybe we should print a warning eventually:
1496
          character set correspondence is not exact.
1497
        */
1498
0
        return csp->my_name;
1499
1500
0
      default:
1501
0
        return NULL;
1502
0
      }
1503
0
    }
1504
0
  }
1505
0
  return NULL;
1506
0
}
1507
1508
const char* my_default_csname()
1509
0
{
1510
0
  const char* csname = NULL;
1511
#ifdef _WIN32
1512
  char cpbuf[64];
1513
  UINT cp;
1514
  if (GetACP() == CP_UTF8)
1515
    cp= CP_UTF8;
1516
  else
1517
  {
1518
    cp= GetConsoleCP();
1519
    if (cp == 0)
1520
      cp= GetACP();
1521
  }
1522
  snprintf(cpbuf, sizeof(cpbuf), "cp%d", (int)cp);
1523
  csname = my_os_charset_to_mysql_charset(cpbuf);
1524
#elif defined(HAVE_SETLOCALE) && defined(HAVE_NL_LANGINFO)
1525
0
  if (setlocale(LC_CTYPE, "") && (csname = nl_langinfo(CODESET)))
1526
0
    csname = my_os_charset_to_mysql_charset(csname);
1527
0
#endif
1528
0
  return csname ? csname : MYSQL_DEFAULT_CHARSET_NAME;
1529
0
}
1530
1531
1532
#ifdef _WIN32
1533
/**
1534
  Extract codepage number from "cpNNNN" string,
1535
  and check that this codepage is supported.
1536
1537
  @return 0 - invalid codepage(or unsupported)
1538
          > 0 - valid codepage number.
1539
*/
1540
static UINT get_codepage(const char *s)
1541
{
1542
  UINT cp;
1543
  if (s[0] != 'c' || s[1] != 'p')
1544
  {
1545
    DBUG_ASSERT(0);
1546
    return 0;
1547
  }
1548
  cp= strtoul(s + 2, NULL, 10);
1549
  if (!IsValidCodePage(cp))
1550
  {
1551
    /*
1552
     Can happen also with documented CP, i.e 51936
1553
     Perhaps differs from one machine to another.
1554
    */
1555
    return 0;
1556
  }
1557
  return cp;
1558
}
1559
1560
static UINT mysql_charset_to_codepage(const char *my_cs_name)
1561
{
1562
  const MY_CSET_OS_NAME *csp;
1563
  UINT cp=0,tmp;
1564
  for (csp= charsets; csp->os_name; csp++)
1565
  {
1566
    if (!strcasecmp(csp->my_name, my_cs_name))
1567
    {
1568
      switch (csp->param)
1569
      {
1570
      case my_cs_exact:
1571
        tmp= get_codepage(csp->os_name);
1572
        if (tmp)
1573
          return tmp;
1574
        break;
1575
      case my_cs_approx:
1576
        /*
1577
          don't return just yet, perhaps there is a better
1578
          (exact) match later.
1579
        */
1580
        if (!cp)
1581
          cp= get_codepage(csp->os_name);
1582
        continue;
1583
1584
      default:
1585
        return 0;
1586
      }
1587
    }
1588
  }
1589
  return cp;
1590
}
1591
1592
/** Set console codepage for MariaDB's charset name */
1593
int my_set_console_cp(const char *csname)
1594
{
1595
  UINT cp;
1596
  if (fileno(stdout) < 0 || !isatty(fileno(stdout)))
1597
    return 0;
1598
  cp= mysql_charset_to_codepage(csname);
1599
  if (!cp)
1600
  {
1601
    /* No compatible os charset.*/
1602
    return -1;
1603
  }
1604
1605
  if (GetConsoleOutputCP() != cp && !SetConsoleOutputCP(cp))
1606
  {
1607
    return -1;
1608
  }
1609
1610
  if (GetConsoleCP() != cp && !SetConsoleCP(cp))
1611
  {
1612
    return -1;
1613
  }
1614
  return 0;
1615
}
1616
#endif