Coverage Report

Created: 2024-06-18 07:03

/src/server/mysys/charset.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   Copyright (c) 2000, 2011, Oracle and/or its affiliates
3
   Copyright (c) 2009, 2020, MariaDB Corporation.
4
5
   This program is free software; you can redistribute it and/or modify
6
   it under the terms of the GNU General Public License as published by
7
   the Free Software Foundation; version 2 of the License.
8
9
   This program is distributed in the hope that it will be useful,
10
   but WITHOUT ANY WARRANTY; without even the implied warranty of
11
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
   GNU General Public License for more details.
13
14
   You should have received a copy of the GNU General Public License
15
   along with this program; if not, write to the Free Software
16
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
17
18
#include "mysys_priv.h"
19
#include "mysys_err.h"
20
#include <m_ctype.h>
21
#include <m_string.h>
22
#include <my_dir.h>
23
#include <hash.h>
24
#include <my_xml.h>
25
#ifdef HAVE_LANGINFO_H
26
#include <langinfo.h>
27
#endif
28
#ifdef HAVE_LOCALE_H
29
#include <locale.h>
30
#endif
31
32
extern HASH charset_name_hash;
33
34
/*
35
  The code below implements this functionality:
36
  
37
    - Initializing charset related structures
38
    - Loading dynamic charsets
39
    - Searching for a proper CHARSET_INFO 
40
      using charset name, collation name or collation ID
41
    - Setting server default character set
42
*/
43
44
static uint
45
get_collation_number_internal(const char *name)
46
0
{
47
48
0
  CHARSET_INFO **cs;
49
0
  for (cs= all_charsets;
50
0
       cs < all_charsets + array_elements(all_charsets);
51
0
       cs++)
52
0
  {
53
0
    if (cs[0] && cs[0]->coll_name.str &&
54
0
        !my_strcasecmp_latin1(cs[0]->coll_name.str, name))
55
0
      return cs[0]->number;
56
0
  }  
57
0
  return 0;
58
0
}
59
60
61
/**
  Check whether a single byte is reported as "too small" by the
  character length routine of a character set.

  @param cs  Character set whose my_ci_charlen() is consulted.
  @param ch  The byte to test.
  @return    TRUE if MY_CS_IS_TOOSMALL() holds for the length result,
             FALSE otherwise.
*/
static my_bool is_multi_byte_ident(CHARSET_INFO *cs, uchar ch)
{
  const uchar *first= &ch;
  /*
    Keep the result in a local: MY_CS_IS_TOOSMALL is a macro and may
    evaluate its argument more than once.
  */
  int length_rc= my_ci_charlen(cs, first, first + 1);
  if (MY_CS_IS_TOOSMALL(length_rc))
    return TRUE;
  return FALSE;
}
66
67
static my_bool init_state_maps(struct charset_info_st *cs)
68
0
{
69
0
  uint i;
70
0
  uchar *state_map;
71
0
  uchar *ident_map;
72
73
0
  if (!(cs->state_map= state_map= (uchar*) my_once_alloc(256*2, MYF(MY_WME))))
74
0
    return 1;
75
    
76
0
  cs->ident_map= ident_map= state_map + 256;
77
78
  /* Fill state_map with states to get a faster parser */
79
0
  for (i=0; i < 256 ; i++)
80
0
  {
81
0
    if (my_isalpha(cs,i))
82
0
      state_map[i]=(uchar) MY_LEX_IDENT;
83
0
    else if (my_isdigit(cs,i))
84
0
      state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
85
0
    else if (is_multi_byte_ident(cs, i))
86
0
      state_map[i]=(uchar) MY_LEX_IDENT;
87
0
    else if (my_isspace(cs,i))
88
0
      state_map[i]=(uchar) MY_LEX_SKIP;
89
0
    else
90
0
      state_map[i]=(uchar) MY_LEX_CHAR;
91
0
  }
92
0
  state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
93
0
  state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
94
0
  state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
95
0
  state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
96
0
  state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
97
0
  state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
98
0
  state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
99
0
  state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
100
0
  state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
101
0
  state_map[0]=(uchar) MY_LEX_EOL;
102
0
  state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
103
0
  state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
104
0
  state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
105
0
  state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
106
0
  state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
107
0
  state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
108
0
  state_map[(uchar)'-']= (uchar) MY_LEX_MINUS_OR_COMMENT;
109
0
  state_map[(uchar)',']= (uchar) MY_LEX_COMMA;
110
0
  state_map[(uchar)'?']= (uchar) MY_LEX_PLACEHOLDER;
111
112
  /*
113
    Create a second map to make it faster to find identifiers
114
  */
115
0
  for (i=0; i < 256 ; i++)
116
0
  {
117
0
    ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
118
0
         state_map[i] == MY_LEX_NUMBER_IDENT);
119
0
  }
120
121
  /* Special handling of hex and binary strings */
122
0
  state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
123
0
  state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
124
0
  state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
125
0
  return 0;
126
0
}
127
128
129
/**
  Choose the 8bit collation handler that matches a set of state flags.

  @param flags  Collation state flags; only MY_CS_BINSORT and
                MY_CS_NOPAD are examined.
  @return       Pointer to one of the four 8bit collation handlers
                (bin / nopad bin / simple ci / simple nopad ci).
*/
static MY_COLLATION_HANDLER *get_simple_collation_handler_by_flags(uint flags)
{
  if (flags & MY_CS_BINSORT)
  {
    if (flags & MY_CS_NOPAD)
      return &my_collation_8bit_nopad_bin_handler;
    return &my_collation_8bit_bin_handler;
  }
  if (flags & MY_CS_NOPAD)
    return &my_collation_8bit_simple_nopad_ci_handler;
  return &my_collation_8bit_simple_ci_handler;
}
139
140
141
static void simple_cs_init_functions(struct charset_info_st *cs)
142
0
{
143
0
  cs->coll= get_simple_collation_handler_by_flags(cs->state);
144
0
  cs->cset= &my_charset_8bit_handler;
145
0
}
146
147
148
149
/**
  Copy the data present in "from" into "to".

  Every string and table is duplicated with my_once_memdup() or
  my_once_strdup(). Members that are not set in "from" leave the
  corresponding member of "to" untouched. The charset name is copied
  only when "to" does not have one yet. After m_ctype is copied,
  init_state_maps() is run on "to".

  @param to    Destination character set.
  @param from  Source of the data.
  @return      0 on success, 1 if a duplication or init_state_maps()
               failed (members copied before the failure stay set).
*/
static int cs_copy_data(struct charset_info_st *to, CHARSET_INFO *from)
{
  if (from->number)
    to->number= from->number;

  /* Don't replace csname if already set */
  if (from->cs_name.str && !to->cs_name.str)
  {
    to->cs_name.str= my_once_memdup(from->cs_name.str,
                                    from->cs_name.length + 1,
                                    MYF(MY_WME));
    if (!to->cs_name.str)
      return 1;
    to->cs_name.length= from->cs_name.length;
  }

  if (from->coll_name.str)
  {
    to->coll_name.str= my_once_memdup(from->coll_name.str,
                                      from->coll_name.length + 1,
                                      MYF(MY_WME));
    if (!to->coll_name.str)
      return 1;
    to->coll_name.length= from->coll_name.length;
  }

  if (from->comment)
  {
    to->comment= my_once_strdup(from->comment, MYF(MY_WME));
    if (!to->comment)
      return 1;
  }

  if (from->m_ctype)
  {
    to->m_ctype= (uchar*) my_once_memdup((char*) from->m_ctype,
                                         MY_CS_CTYPE_TABLE_SIZE,
                                         MYF(MY_WME));
    if (!to->m_ctype)
      return 1;
    /* The lexer maps are derived from the ctype table just copied */
    if (init_state_maps(to))
      return 1;
  }

  if (from->to_lower)
  {
    to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
                                          MY_CS_TO_LOWER_TABLE_SIZE,
                                          MYF(MY_WME));
    if (!to->to_lower)
      return 1;
  }

  if (from->to_upper)
  {
    to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
                                          MY_CS_TO_UPPER_TABLE_SIZE,
                                          MYF(MY_WME));
    if (!to->to_upper)
      return 1;
  }

  if (from->sort_order)
  {
    to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
                                            MY_CS_SORT_ORDER_TABLE_SIZE,
                                            MYF(MY_WME));
    if (!to->sort_order)
      return 1;
  }

  if (from->tab_to_uni)
  {
    uint table_bytes= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
    to->tab_to_uni= (uint16*) my_once_memdup((char*) from->tab_to_uni,
                                             table_bytes, MYF(MY_WME));
    if (!to->tab_to_uni)
      return 1;
  }

  if (from->tailoring)
  {
    to->tailoring= my_once_strdup(from->tailoring, MYF(MY_WME));
    if (!to->tailoring)
      return 1;
  }

  return 0;
}
220
221
222
/**
  Tell whether all four charset-level tables of an 8bit character set
  (m_ctype, to_upper, to_lower, tab_to_uni) are present.

  @param cs  Character set to inspect.
  @return    TRUE if none of the four pointers is NULL, FALSE otherwise.
*/
static my_bool simple_8bit_charset_data_is_full(CHARSET_INFO *cs)
{
  if (!cs->m_ctype)
    return FALSE;
  if (!cs->to_upper)
    return FALSE;
  if (!cs->to_lower)
    return FALSE;
  if (!cs->tab_to_uni)
    return FALSE;
  return TRUE;
}
226
227
228
/**
229
  Inherit missing 8bit charset data from another collation.
230
  Arrays pointed by refcs must be in the permanent memory already,
231
  e.g. static memory, or allocated by my_once_xxx().
232
*/
233
static void
234
inherit_charset_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
235
0
{
236
0
  if (!cs->to_upper)
237
0
    cs->to_upper= refcs->to_upper;
238
0
  if (!cs->to_lower)
239
0
    cs->to_lower= refcs->to_lower;
240
0
  if (!cs->m_ctype)
241
0
    cs->m_ctype= refcs->m_ctype;
242
0
  if (!cs->tab_to_uni)
243
0
    cs->tab_to_uni= refcs->tab_to_uni;
244
0
}
245
246
247
/**
  Tell whether an 8bit collation has the data it needs for sorting:
  either a sort_order table or the MY_CS_BINSORT flag.

  @param cs  Collation to inspect.
  @return    TRUE if sort_order is set or MY_CS_BINSORT is on in
             cs->state, FALSE otherwise.
*/
static my_bool simple_8bit_collation_data_is_full(CHARSET_INFO *cs)
{
  if (cs->sort_order)
    return TRUE;
  if (cs->state & MY_CS_BINSORT)
    return TRUE;
  return FALSE;
}
251
252
253
/**
254
  Inherit 8bit simple collation data from another collation.
255
  refcs->sort_order must be in the permanent memory already,
256
  e.g. static memory, or allocated by my_once_xxx().
257
*/
258
static void
259
inherit_collation_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
260
0
{
261
0
  if (!simple_8bit_collation_data_is_full(cs))
262
0
    cs->sort_order= refcs->sort_order;
263
0
}
264
265
266
/**
  Tell whether a simple 8bit character set definition is complete.

  It is complete when it has a number, a charset name, a collation
  name, all charset-level tables, and either full collation data or a
  tailoring.

  @param cs  Character set to inspect.
  @return    Non-zero if the definition is complete, 0 otherwise.
*/
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
{
  if (!cs->number || !cs->cs_name.str || !cs->coll_name.str)
    return FALSE;
  if (!simple_8bit_charset_data_is_full(cs))
    return FALSE;
  if (simple_8bit_collation_data_is_full(cs))
    return TRUE;
  return cs->tailoring ? TRUE : FALSE;
}
272
273
274
/*
  copy_uca_collation() is called from add_collation() for ucs2, utf8mb3,
  utf8mb4, utf16 and utf32, so it must be compiled whenever UCA collations
  and at least one of these character sets are enabled.
*/
#if defined(HAVE_UCA_COLLATIONS) && \
    (defined(HAVE_CHARSET_ucs2)    || defined(HAVE_CHARSET_utf8mb3) || \
     defined(HAVE_CHARSET_utf8mb4) || defined(HAVE_CHARSET_utf16)   || \
     defined(HAVE_CHARSET_utf32))
/**
  Initialize a loaded collation.
  @param [OUT] to     - The new charset_info_st structure to initialize.
  @param [IN]  from   - A template collation, to fill the missing data from.
  @param [IN]  loaded - The collation data loaded from the LDML file.
                        Some data may be missing in "loaded".

  The handlers, sort character limits and mbminlen/mbmaxlen are taken
  from "from"; MY_CS_AVAILABLE, MY_CS_LOADED, MY_CS_STRNXFRM and
  MY_CS_UNICODE are added to to->state.
*/
static void
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
                   CHARSET_INFO *loaded)
{
  to->cset= from->cset;
  to->coll= from->coll;
  /*
    Single-level UCA collations have strxfrm_multiply=8.
    In case of a multi-level UCA collation we use strxfrm_multiply=4.
    That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
    to allocate a buffer of a smaller size for each level, for performance
    purposes, and to fit longer VARCHARs into @@max_sort_length.
    This makes filesort produce non-precise order for some rare Unicode
    characters that produce more than 4 weights (long expansions).
    UCA requires 2 bytes per weight multiplied by the number of levels.
    In case of a 2-level collation, each character requires 4*2=8 bytes.
    Therefore, the longest VARCHAR that fits into the default @@max_sort_length
    is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
    would fit.
    Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
    for the same purpose.
    TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
  */
  to->strxfrm_multiply= loaded->levels_for_order > 1 ?
                        4 : from->strxfrm_multiply;
  to->min_sort_char= from->min_sort_char;
  to->max_sort_char= from->max_sort_char;
  to->mbminlen= from->mbminlen;
  to->mbmaxlen= from->mbmaxlen;
  to->state|= MY_CS_AVAILABLE | MY_CS_LOADED |
              MY_CS_STRNXFRM  | MY_CS_UNICODE;
}
#endif
315
316
317
static int add_collation(struct charset_info_st *cs)
318
0
{
319
0
  if (cs->coll_name.str &&
320
0
      (cs->number ||
321
0
       (cs->number=get_collation_number_internal(cs->coll_name.str))) &&
322
0
      cs->number < array_elements(all_charsets))
323
0
  {
324
0
    struct charset_info_st *newcs;
325
0
    if (!(newcs= (struct charset_info_st*) all_charsets[cs->number]))
326
0
    {
327
0
      if (!(all_charsets[cs->number]= newcs=
328
0
         (struct charset_info_st*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
329
0
        return MY_XML_ERROR;
330
0
      bzero(newcs,sizeof(CHARSET_INFO));
331
0
    }
332
0
    else
333
0
    {
334
      /* Don't allow change of csname */
335
0
      if (newcs->cs_name.str && strcmp(newcs->cs_name.str, cs->cs_name.str))
336
0
      {
337
0
        my_error(EE_DUPLICATE_CHARSET, MYF(ME_WARNING),
338
0
                 cs->number, cs->cs_name.str, newcs->cs_name.str);
339
        /*
340
          Continue parsing rest of Index.xml. We got an warning in the log
341
          so the user can fix the wrong character set definition.
342
        */
343
0
        return MY_XML_OK;
344
0
      }
345
0
    }
346
347
0
    if (cs->primary_number == cs->number)
348
0
      cs->state |= MY_CS_PRIMARY;
349
      
350
0
    if (cs->binary_number == cs->number)
351
0
      cs->state |= MY_CS_BINSORT;
352
    
353
0
    newcs->state|= cs->state;
354
    
355
0
    if (!(newcs->state & MY_CS_COMPILED))
356
0
    {
357
0
      if (cs_copy_data(newcs,cs))
358
0
        return MY_XML_ERROR;
359
360
0
      newcs->levels_for_order= 1;
361
      
362
0
      if (!strcmp(cs->cs_name.str,"ucs2") )
363
0
      {
364
0
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
365
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
366
0
                                  &my_charset_ucs2_unicode_nopad_ci :
367
0
                                  &my_charset_ucs2_unicode_ci,
368
0
                                  cs);
369
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
370
0
#endif        
371
0
      }
372
0
      else if (!strcmp(cs->cs_name.str, "utf8") ||
373
0
               !strcmp(cs->cs_name.str, "utf8mb3"))
374
0
      {
375
0
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
376
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
377
0
                                  &my_charset_utf8mb3_unicode_nopad_ci :
378
0
                                  &my_charset_utf8mb3_unicode_ci,
379
0
                                  cs);
380
0
        newcs->m_ctype= my_charset_utf8mb3_unicode_ci.m_ctype;
381
0
        if (init_state_maps(newcs))
382
0
          return MY_XML_ERROR;
383
0
#endif
384
0
      }
385
0
      else if (!strcmp(cs->cs_name.str, "utf8mb4"))
386
0
      {
387
0
#if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
388
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
389
0
                                  &my_charset_utf8mb4_unicode_nopad_ci :
390
0
                                  &my_charset_utf8mb4_unicode_ci,
391
0
                                  cs);
392
0
        newcs->m_ctype= my_charset_utf8mb4_unicode_ci.m_ctype;
393
0
        if (init_state_maps(newcs))
394
0
          return MY_XML_ERROR;
395
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
396
0
#endif
397
0
      }
398
0
      else if (!strcmp(cs->cs_name.str, "utf16"))
399
0
      {
400
0
#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
401
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
402
0
                                  &my_charset_utf16_unicode_nopad_ci :
403
0
                                  &my_charset_utf16_unicode_ci,
404
0
                                  cs);
405
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
406
0
#endif
407
0
      }
408
0
      else if (!strcmp(cs->cs_name.str, "utf32"))
409
0
      {
410
0
#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
411
0
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
412
0
                                  &my_charset_utf32_unicode_nopad_ci :
413
0
                                  &my_charset_utf32_unicode_ci,
414
0
                                  cs);
415
0
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
416
0
#endif
417
0
      }
418
0
      else
419
0
      {
420
0
        simple_cs_init_functions(newcs);
421
0
        newcs->mbminlen= 1;
422
0
        newcs->mbmaxlen= 1;
423
0
        newcs->strxfrm_multiply= 1;
424
0
        if (simple_cs_is_full(newcs))
425
0
        {
426
0
          newcs->state |= MY_CS_LOADED;
427
0
        }
428
0
      }
429
0
      add_compiled_extra_collation(newcs);
430
0
    }
431
0
    else
432
0
    {
433
      /*
434
        We need the below to make get_charset_name()
435
        and get_charset_number() working even if a
436
        character set has not been really incompiled.
437
        The above functions are used for example
438
        in error message compiler extra/comp_err.c.
439
        If a character set was compiled, this information
440
        will get lost and overwritten in add_compiled_collation().
441
      */
442
0
      newcs->number= cs->number;
443
0
      if (cs->comment)
444
0
  if (!(newcs->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
445
0
    return MY_XML_ERROR;
446
0
      if (cs->cs_name.str && ! newcs->cs_name.str)
447
0
      {
448
0
        if (!(newcs->cs_name.str= my_once_memdup(cs->cs_name.str,
449
0
                                                 cs->cs_name.length+1,
450
0
                                                 MYF(MY_WME))))
451
0
    return MY_XML_ERROR;
452
0
        newcs->cs_name.length= cs->cs_name.length;
453
0
      }
454
0
      if (cs->coll_name.str)
455
0
      {
456
0
  if (!(newcs->coll_name.str= my_once_memdup(cs->coll_name.str,
457
0
                                                   cs->coll_name.length+1,
458
0
                                                  MYF(MY_WME))))
459
0
    return MY_XML_ERROR;
460
0
        newcs->coll_name.length= cs->coll_name.length;
461
0
      }
462
0
    }
463
0
    cs->number= 0;
464
0
    cs->primary_number= 0;
465
0
    cs->binary_number= 0;
466
0
    cs->coll_name.str= 0;
467
0
    cs->coll_name.length= 0;
468
0
    cs->state= 0;
469
0
    cs->sort_order= NULL;
470
0
    cs->tailoring= NULL;
471
0
  }
472
0
  return MY_XML_OK;
473
0
}
474
475
476
/**
477
  Report character set initialization errors and warnings.
478
  Be silent by default: no warnings on the client side.
479
*/
480
static void
481
default_reporter(enum loglevel level  __attribute__ ((unused)),
482
                 const char *format  __attribute__ ((unused)),
483
                 ...)
484
0
{
485
0
}
486
my_error_reporter my_charset_error_reporter= default_reporter;
487
488
489
/**
490
  Wrappers for memory functions my_malloc (and friends)
491
  with C-compatbile API without extra "myf" argument.
492
*/
493
static void *
494
my_once_alloc_c(size_t size)
495
0
{ return my_once_alloc(size, MYF(MY_WME)); }
496
497
498
static void *
499
my_malloc_c(size_t size)
500
0
{ return my_malloc(key_memory_charset_loader, size, MYF(MY_WME)); }
501
502
503
static void *
504
my_realloc_c(void *old, size_t size)
505
0
{ return my_realloc(key_memory_charset_loader, old, size, MYF(MY_WME|MY_ALLOW_ZERO_PTR)); }
506
507
508
/**
509
  Initialize character set loader to use mysys memory management functions.
510
  @param loader  Loader to initialize
511
*/
512
void
513
my_charset_loader_init_mysys(MY_CHARSET_LOADER *loader)
514
0
{
515
0
  loader->error[0]= '\0';
516
0
  loader->once_alloc= my_once_alloc_c;
517
0
  loader->malloc= my_malloc_c;
518
0
  loader->realloc= my_realloc_c;
519
0
  loader->free= my_free;
520
0
  loader->reporter= my_charset_error_reporter;
521
0
  loader->add_collation= add_collation;
522
0
}
523
524
525
0
/*
  Largest charset definition file that my_read_charset_file() accepts.
  Parenthesized, so that the macro expands correctly inside any expression.
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* Name of the charset index file inside the charsets directory */
#define MY_CHARSET_INDEX "Index.xml"

/*
  Charsets directory override: when not NULL, get_charsets_dir() uses it
  instead of the directory derived from SHAREDIR.
*/
const char *charsets_dir= NULL;
529
530
531
static my_bool
532
my_read_charset_file(MY_CHARSET_LOADER *loader,
533
                     const char *filename,
534
                     myf myflags)
535
0
{
536
0
  uchar *buf;
537
0
  int  fd;
538
0
  size_t len, tmp_len;
539
0
  MY_STAT stat_info;
540
  
541
0
  if (!my_stat(filename, &stat_info, MYF(myflags)) ||
542
0
       ((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
543
0
       !(buf= (uchar*) my_malloc(key_memory_charset_loader,len,myflags)))
544
0
    return TRUE;
545
  
546
0
  if ((fd= mysql_file_open(key_file_charset, filename, O_RDONLY, myflags)) < 0)
547
0
    goto error;
548
0
  tmp_len= mysql_file_read(fd, buf, len, myflags);
549
0
  mysql_file_close(fd, myflags);
550
0
  if (tmp_len != len)
551
0
    goto error;
552
  
553
0
  if (my_parse_charset_xml(loader, (char *) buf, len))
554
0
  {
555
0
    my_printf_error(EE_UNKNOWN_CHARSET, "Error while parsing '%s': %s\n",
556
0
                    MYF(0), filename, loader->error);
557
0
    goto error;
558
0
  }
559
  
560
0
  my_free(buf);
561
0
  return FALSE;
562
563
0
error:
564
0
  my_free(buf);
565
0
  return TRUE;
566
0
}
567
568
569
char *get_charsets_dir(char *buf)
570
0
{
571
0
  const char *sharedir= SHAREDIR;
572
0
  char *res;
573
0
  DBUG_ENTER("get_charsets_dir");
574
575
0
  if (charsets_dir != NULL)
576
0
    strmake(buf, charsets_dir, FN_REFLEN-1);
577
0
  else
578
0
  {
579
0
    if (test_if_hard_path(sharedir) ||
580
0
  is_prefix(sharedir, DEFAULT_CHARSET_HOME))
581
0
      strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
582
0
    else
583
0
      strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
584
0
        NullS);
585
0
  }
586
0
  res= convert_dirname(buf,buf,NullS);
587
0
  DBUG_PRINT("info",("charsets dir: '%s'", buf));
588
0
  DBUG_RETURN(res);
589
0
}
590
591
CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE]={NULL};
592
CHARSET_INFO *default_charset_info = &my_charset_latin1;
593
594
595
/*
596
  Add standard character set compiled into the application
597
  All related character sets should share same cname
598
*/
599
600
void add_compiled_collation(struct charset_info_st *cs)
601
0
{
602
0
  DBUG_ASSERT(cs->number < array_elements(all_charsets));
603
0
  all_charsets[cs->number]= cs;
604
0
  cs->state|= MY_CS_AVAILABLE;
605
0
  if ((my_hash_insert(&charset_name_hash, (uchar*) cs)))
606
0
  {
607
#ifndef DBUG_OFF
608
    CHARSET_INFO *org= (CHARSET_INFO*) my_hash_search(&charset_name_hash,
609
                                                      (uchar*) cs->cs_name.str,
610
                                                      cs->cs_name.length);
611
    DBUG_ASSERT(org);
612
    DBUG_ASSERT(org->cs_name.str == cs->cs_name.str);
613
    DBUG_ASSERT(org->cs_name.length == strlen(cs->cs_name.str));
614
#endif
615
0
  }
616
0
}
617
618
619
/*
620
  Add optional characters sets from ctype-extra.c
621
622
  If cname is already in use, replace csname in new object with a pointer to
623
  the already used csname to ensure that all csname's points to the same string
624
  for the same character set.
625
*/
626
627
628
void add_compiled_extra_collation(struct charset_info_st *cs)
629
0
{
630
0
  DBUG_ASSERT(cs->number < array_elements(all_charsets));
631
0
  all_charsets[cs->number]= cs;
632
0
  cs->state|= MY_CS_AVAILABLE;
633
0
  if ((my_hash_insert(&charset_name_hash, (uchar*) cs)))
634
0
  {
635
0
    CHARSET_INFO *org= (CHARSET_INFO*) my_hash_search(&charset_name_hash,
636
0
                                                      (uchar*) cs->cs_name.str,
637
0
                                                      cs->cs_name.length);
638
0
    cs->cs_name= org->cs_name;
639
0
  }
640
0
}
641
642
643
644
static my_pthread_once_t charsets_initialized= MY_PTHREAD_ONCE_INIT;
645
static my_pthread_once_t charsets_template= MY_PTHREAD_ONCE_INIT;
646
647
typedef struct
648
{
649
  ulonglong use_count;
650
} MY_COLLATION_STATISTICS;
651
652
653
static MY_COLLATION_STATISTICS my_collation_statistics[MY_ALL_CHARSETS_SIZE];
654
655
656
/**
  Tell whether a collation ID refers to a registered collation.

  @param id  Collation ID to check.
  @return    TRUE if id is non-zero, inside all_charsets, and its slot
             is not empty; FALSE otherwise.
*/
my_bool my_collation_is_known_id(uint id)
{
  if (id == 0 || id >= array_elements(all_charsets))
    return FALSE;
  if (!all_charsets[id])
    return FALSE;
  return TRUE;
}
661
662
663
/*
664
  Collation use statistics functions do not lock
665
  counters to avoid mutex contention. This can lose
666
  some counter increments with high thread concurrency.
667
  But this should be Ok, as we don't need exact numbers.
668
*/
669
static inline void my_collation_statistics_inc_use_count(uint id)
670
0
{
671
0
  DBUG_ASSERT(my_collation_is_known_id(id));
672
0
  my_collation_statistics[id].use_count++;
673
0
}
674
675
676
/**
  Read the use counter of a collation.

  @param id  Collation ID; asserted (in debug builds) to be known.
  @return    The current value of the counter.
*/
ulonglong my_collation_statistics_get_use_count(uint id)
{
  const MY_COLLATION_STATISTICS *entry;
  DBUG_ASSERT(my_collation_is_known_id(id));
  entry= &my_collation_statistics[id];
  return entry->use_count;
}
681
682
683
/**
  Return the tailoring string of a collation.

  @param id  Collation ID; asserted (in debug builds) to be known.
  @return    The "tailoring" member of all_charsets[id].
*/
const char *my_collation_get_tailoring(uint id)
{
  CHARSET_INFO *collation;
  /* all_charsets[id]->tailoring is never changed after server startup. */
  DBUG_ASSERT(my_collation_is_known_id(id));
  collation= all_charsets[id];
  return collation->tailoring;
}
689
690
691
HASH charset_name_hash;
692
693
static uchar *get_charset_key(const uchar *object,
694
                              size_t *size,
695
                              my_bool not_used __attribute__((unused)))
696
0
{
697
0
  CHARSET_INFO *cs= (CHARSET_INFO*) object;
698
0
  *size= cs->cs_name.length;
699
0
  return (uchar*) cs->cs_name.str;
700
0
}
701
702
static void init_available_charsets(void)
703
0
{
704
0
  char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
705
0
  struct charset_info_st **cs;
706
0
  MY_CHARSET_LOADER loader;
707
0
  DBUG_ENTER("init_available_charsets");
708
709
0
  bzero((char*) &all_charsets,sizeof(all_charsets));
710
0
  bzero((char*) &my_collation_statistics, sizeof(my_collation_statistics));
711
712
0
  my_hash_init2(key_memory_charsets, &charset_name_hash, 16,
713
0
                &my_charset_latin1, 64, 0, 0, get_charset_key,
714
0
                0, 0, HASH_UNIQUE);
715
716
0
  init_compiled_charsets(MYF(0));
717
718
  /* Copy compiled charsets */
719
0
  for (cs= (struct charset_info_st**) all_charsets;
720
0
       cs < (struct charset_info_st**) all_charsets +
721
0
            array_elements(all_charsets)-1 ;
722
0
       cs++)
723
0
  {
724
0
    if (*cs)
725
0
    {
726
0
      DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
727
0
      if (cs[0]->m_ctype)
728
0
        if (init_state_maps(*cs))
729
0
          *cs= NULL;
730
0
    }
731
0
  }
732
733
0
  my_charset_loader_init_mysys(&loader);
734
0
  strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
735
0
  my_read_charset_file(&loader, fname, MYF(0));
736
0
  DBUG_VOID_RETURN;
737
0
}
738
739
740
void free_charsets(void)
741
0
{
742
0
  charsets_initialized= charsets_template;
743
0
  my_hash_free(&charset_name_hash);
744
0
}
745
746
747
static const char*
748
get_collation_name_alias(const char *name, char *buf, size_t bufsize, myf flags)
749
0
{
750
0
  if (!strncasecmp(name, "utf8_", 5))
751
0
  {
752
0
    my_snprintf(buf, bufsize, "utf8mb%c_%s",
753
0
       flags & MY_UTF8_IS_UTF8MB3 ? '3' : '4', name + 5);
754
0
    return buf;
755
0
  }
756
0
  return NULL;
757
0
}
758
759
760
/**
  Find a collation ID by collation name, also trying the "utf8_" alias.

  Makes sure the charset registry is initialized, then searches for
  the name as given and, if that fails, for the name produced by
  get_collation_name_alias().

  @param name   Collation name.
  @param flags  Passed to get_collation_name_alias().
  @return       Collation ID, or 0 if the collation is not found.
*/
uint get_collation_number(const char *name, myf flags)
{
  char alias[64];
  const char *aliased;
  uint id;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  id= get_collation_number_internal(name);
  if (id)
    return id;

  aliased= get_collation_name_alias(name, alias, sizeof(alias),flags);
  if (!aliased)
    return 0;
  return get_collation_number_internal(aliased);
}
771
772
773
static uint
774
get_charset_number_internal(const char *charset_name, uint cs_flags)
775
0
{
776
0
  CHARSET_INFO **cs;
777
  
778
0
  for (cs= all_charsets;
779
0
       cs < all_charsets + array_elements(all_charsets);
780
0
       cs++)
781
0
  {
782
0
    if ( cs[0] && cs[0]->cs_name.str && (cs[0]->state & cs_flags) &&
783
0
         !my_strcasecmp_latin1(cs[0]->cs_name.str, charset_name))
784
0
      return cs[0]->number;
785
0
  }  
786
0
  return 0;
787
0
}
788
789
790
/**
  Find a collation ID by character set name and state flags, treating
  "utf8" as an alias.

  Makes sure the charset registry is initialized, then searches for
  the name as given. If nothing is found and the name is "utf8"
  (compared with my_strcasecmp_latin1()), the search is repeated with
  "utf8mb3" or "utf8mb4".

  @param charset_name  Character set name.
  @param cs_flags      State bits passed to the internal search.
  @param flags         If MY_UTF8_IS_UTF8MB3 is set, "utf8" stands for
                       "utf8mb3", otherwise for "utf8mb4".
  @return              Collation ID, or 0 if nothing is found.
*/
uint get_charset_number(const char *charset_name, uint cs_flags, myf flags)
{
  uint id;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  id= get_charset_number_internal(charset_name, cs_flags);
  if (id)
    return id;

  if (my_strcasecmp_latin1(charset_name, "utf8"))
    return 0;                                   /* Not the "utf8" alias */

  if (flags & MY_UTF8_IS_UTF8MB3)
    return get_charset_number_internal("utf8mb3", cs_flags);
  return get_charset_number_internal("utf8mb4", cs_flags);
}
803
                  
804
805
/**
  Return the collation name registered under an ID.

  Makes sure the charset registry is initialized first.

  @param charset_number  Collation ID.
  @return  The collation name (coll_name) of all_charsets[charset_number]
           if the ID is in range, the slot is filled, its "number"
           equals the ID and it has a collation name; "?" otherwise.
*/
const char *get_charset_name(uint charset_number)
{
  CHARSET_INFO *entry;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  if (charset_number >= array_elements(all_charsets))
    return "?";   /* this mimics find_type() */

  entry= all_charsets[charset_number];
  if (!entry || entry->number != charset_number || !entry->coll_name.str)
    return "?";   /* this mimics find_type() */

  return entry->coll_name.str;
}
819
820
821
/**
  Return the collation to inherit data from, given its ID.

  @param cs     Collation that wants to inherit.
  @param refid  ID of the candidate source.
  @return       all_charsets[refid] if refid is non-zero, differs from
                cs->number, and the entry exists and is MY_CS_AVAILABLE;
                NULL otherwise.
*/
static CHARSET_INFO *inheritance_source_by_id(CHARSET_INFO *cs, uint refid)
{
  CHARSET_INFO *source;

  /* No source, or the collation would inherit from itself */
  if (!refid || refid == cs->number)
    return NULL;

  source= all_charsets[refid];
  if (!source || !(source->state & MY_CS_AVAILABLE))
    return NULL;
  return source;
}
828
829
830
/**
  Find the collation named in an "[import NAME]" tailoring.

  The tailoring of "cs" must start with "[import " and contain a
  closing ']' fewer than MY_CS_COLLATION_NAME_SIZE characters after
  the prefix. The text in between is looked up as a collation name.

  @param cs     Collation whose tailoring is examined.
  @param flags  Passed to get_collation_number().
  @return       The source collation as accepted by
                inheritance_source_by_id(), or NULL.
*/
static CHARSET_INFO *find_collation_data_inheritance_source(CHARSET_INFO *cs, myf flags)
{
  char name[MY_CS_COLLATION_NAME_SIZE + 1];
  const char *beg, *end;
  size_t length;

  if (!cs->tailoring || strncmp(cs->tailoring, "[import ", 8))
    return NULL;

  beg= cs->tailoring + 8;
  end= strchr(beg, ']');
  if (!end)
    return NULL;

  /* The name must fit into the local buffer */
  if (beg + MY_CS_COLLATION_NAME_SIZE <= end)
    return NULL;

  length= (size_t) (end - beg);
  memcpy(name, beg, length);
  name[length]= '\0';
  return inheritance_source_by_id(cs, get_collation_number(name,MYF(flags)));
}
845
846
847
/**
  Find the MY_CS_PRIMARY collation of the character set of "cs", to
  inherit charset-level data from.

  @param cs  Collation that wants to inherit.
  @return    The source collation as accepted by
             inheritance_source_by_id(), or NULL.
*/
static CHARSET_INFO *find_charset_data_inheritance_source(CHARSET_INFO *cs)
{
  const char *charset_name= cs->cs_name.str;
  uint primary_id= get_charset_number_internal(charset_name, MY_CS_PRIMARY);
  return inheritance_source_by_id(cs, primary_id);
}
852
853
854
static CHARSET_INFO *
855
get_internal_charset(MY_CHARSET_LOADER *loader, uint cs_number, myf flags)
856
0
{
857
0
  char  buf[FN_REFLEN];
858
0
  struct charset_info_st *cs;
859
860
0
  DBUG_ASSERT(cs_number < array_elements(all_charsets));
861
862
0
  if ((cs= (struct charset_info_st*) all_charsets[cs_number]))
863
0
  {
864
0
    if (cs->state & MY_CS_READY)  /* if CS is already initialized */
865
0
    {
866
0
      my_collation_statistics_inc_use_count(cs_number);
867
0
      return cs;
868
0
    }
869
870
    /*
871
      To make things thread safe we are not allowing other threads to interfere
872
      while we may changing the cs_info_table
873
    */
874
0
    mysql_mutex_lock(&THR_LOCK_charset);
875
876
0
    if (!(cs->state & (MY_CS_COMPILED|MY_CS_LOADED))) /* if CS is not in memory */
877
0
    {
878
0
      MY_CHARSET_LOADER loader;
879
0
      strxmov(get_charsets_dir(buf), cs->cs_name.str, ".xml", NullS);
880
0
      my_charset_loader_init_mysys(&loader);
881
0
      my_read_charset_file(&loader, buf, flags);
882
0
    }
883
884
0
    if (cs->state & MY_CS_AVAILABLE)
885
0
    {
886
0
      if (!(cs->state & MY_CS_READY))
887
0
      {
888
0
        if (!simple_8bit_charset_data_is_full(cs))
889
0
        {
890
0
          CHARSET_INFO *refcs= find_charset_data_inheritance_source(cs);
891
0
          if (refcs)
892
0
            inherit_charset_data(cs, refcs);
893
0
        }
894
0
        if (!simple_8bit_collation_data_is_full(cs))
895
0
        {
896
0
          CHARSET_INFO *refcl= find_collation_data_inheritance_source(cs, flags);
897
0
          if (refcl)
898
0
            inherit_collation_data(cs, refcl);
899
0
        }
900
901
0
        if (my_ci_init_charset(cs, loader) ||
902
0
            my_ci_init_collation(cs, loader))
903
0
        {
904
0
          cs= NULL;
905
0
        }
906
0
        else
907
0
          cs->state|= MY_CS_READY;
908
0
      }
909
0
      my_collation_statistics_inc_use_count(cs_number);
910
0
    }
911
0
    else
912
0
      cs= NULL;
913
914
0
    mysql_mutex_unlock(&THR_LOCK_charset);
915
0
  }
916
0
  return cs;
917
0
}
918
919
920
/*
  Find a collation by its number.

  default_charset_info is returned directly when its number matches.
  Otherwise the charset tables are initialized if needed and, for numbers
  inside all_charsets[], the entry is obtained via get_internal_charset().

  If nothing was found and MY_WME is set in flags, EE_UNKNOWN_CHARSET is
  reported with "#<number>" and the path of the charset index file.

  Returns the collation, or NULL if it could not be found.
*/
CHARSET_INFO *get_charset(uint cs_number, myf flags)
{
  CHARSET_INFO *found= NULL;

  if (default_charset_info->number == cs_number)
    return default_charset_info;

  my_pthread_once(&charsets_initialized, init_available_charsets);

  if (cs_number < array_elements(all_charsets))
  {
    MY_CHARSET_LOADER loader;
    my_charset_loader_init_mysys(&loader);
    found= get_internal_charset(&loader, cs_number, flags);
  }

  if (found || !(flags & MY_WME))
    return found;

  {
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[23];
    strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
    cs_string[0]='#';
    int10_to_str(cs_number, cs_string+1, 10);
    my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
  }
  return found;
}
946
947
948
/**
949
  Find collation by name: extended version of get_charset_by_name()
950
  to return error messages to the caller.
951
  @param   loader  Character set loader
952
  @param   name    Collation name
953
  @param   flags   Flags
954
  @return          NULL on error, pointer to collation on success
955
*/
956
957
CHARSET_INFO *
958
my_collation_get_by_name(MY_CHARSET_LOADER *loader,
959
                         const char *name, myf flags)
960
0
{
961
0
  uint cs_number;
962
0
  CHARSET_INFO *cs;
963
0
  my_pthread_once(&charsets_initialized, init_available_charsets);
964
965
0
  cs_number= get_collation_number(name,flags);
966
0
  my_charset_loader_init_mysys(loader);
967
0
  cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
968
969
0
  if (!cs && (flags & MY_WME))
970
0
  {
971
0
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
972
0
    strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
973
0
    my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), name, index_file);
974
0
  }
975
0
  return cs;
976
0
}
977
978
979
/*
  Find a collation by its name using a locally created mysys loader.
  See my_collation_get_by_name() for the lookup itself.
*/
CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
{
  MY_CHARSET_LOADER mysys_loader;
  CHARSET_INFO *collation;

  my_charset_loader_init_mysys(&mysys_loader);
  collation= my_collation_get_by_name(&mysys_loader, cs_name, flags);
  return collation;
}
985
986
987
/**
988
  Find character set by name: extended version of get_charset_by_csname()
989
  to return error messages to the caller.
990
  @param   loader   Character set loader
991
  @param   name     Collation name
992
  @param   cs_flags Character set flags (e.g. default or binary collation)
993
  @param   flags    Flags
994
  @return           NULL on error, pointer to collation on success
995
*/
996
CHARSET_INFO *
997
my_charset_get_by_name(MY_CHARSET_LOADER *loader,
998
                       const char *cs_name, uint cs_flags, myf flags)
999
0
{
1000
0
  uint cs_number;
1001
0
  CHARSET_INFO *cs;
1002
0
  DBUG_ENTER("get_charset_by_csname");
1003
0
  DBUG_PRINT("enter",("name: '%s'", cs_name));
1004
1005
0
  my_pthread_once(&charsets_initialized, init_available_charsets);
1006
1007
0
  cs_number= get_charset_number(cs_name, cs_flags, flags);
1008
0
  cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
1009
1010
0
  if (!cs && (flags & MY_WME))
1011
0
  {
1012
0
    char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
1013
0
    strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
1014
0
    my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
1015
0
  }
1016
1017
0
  DBUG_RETURN(cs);
1018
0
}
1019
1020
1021
CHARSET_INFO *
1022
get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
1023
0
{
1024
0
  MY_CHARSET_LOADER loader;
1025
0
  my_charset_loader_init_mysys(&loader);
1026
0
  return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
1027
0
}
1028
1029
1030
/**
1031
  Resolve character set by the character set name (utf8, latin1, ...).
1032
1033
  The function tries to resolve character set by the specified name. If
1034
  there is character set with the given name, it is assigned to the "cs"
1035
  parameter and FALSE is returned. If there is no such character set,
1036
  "default_cs" is assigned to the "cs" and TRUE is returned.
1037
1038
  @param[in] cs_name    Character set name.
1039
  @param[in] default_cs Default character set.
1040
  @param[out] cs        Variable to store character set.
1041
1042
  @return FALSE if character set was resolved successfully; TRUE if there
1043
  is no character set with given name.
1044
*/
1045
1046
my_bool resolve_charset(const char *cs_name,
1047
                        CHARSET_INFO *default_cs,
1048
                        CHARSET_INFO **cs,
1049
                        myf flags)
1050
0
{
1051
0
  *cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, flags);
1052
1053
0
  if (*cs == NULL)
1054
0
  {
1055
0
    *cs= default_cs;
1056
0
    return TRUE;
1057
0
  }
1058
1059
0
  return FALSE;
1060
0
}
1061
1062
1063
/**
1064
  Resolve collation by the collation name (utf8_general_ci, ...).
1065
1066
  The function tries to resolve collation by the specified name. If there
1067
  is collation with the given name, it is assigned to the "cl" parameter
1068
  and FALSE is returned. If there is no such collation, "default_cl" is
1069
  assigned to the "cl" and TRUE is returned.
1070
1071
  @param[out] cl        Variable to store collation.
1072
  @param[in] cl_name    Collation name.
1073
  @param[in] default_cl Default collation.
1074
1075
  @return FALSE if collation was resolved successfully; TRUE if there is no
1076
  collation with given name.
1077
*/
1078
1079
my_bool resolve_collation(const char *cl_name,
1080
                          CHARSET_INFO *default_cl,
1081
                          CHARSET_INFO **cl,
1082
                          myf my_flags)
1083
0
{
1084
0
  *cl= get_charset_by_name(cl_name, my_flags);
1085
1086
0
  if (*cl == NULL)
1087
0
  {
1088
0
    *cl= default_cl;
1089
0
    return TRUE;
1090
0
  }
1091
1092
0
  return FALSE;
1093
0
}
1094
1095
1096
/*
1097
  Escape string with backslashes (\)
1098
1099
  SYNOPSIS
1100
    escape_string_for_mysql()
1101
    charset_info        Charset of the strings
1102
    to                  Buffer for escaped string
1103
    to_length           Length of destination buffer, or 0
1104
    from                The string to escape
1105
    length              The length of the string to escape
1106
    overflow            Set to 1 if the escaped string did not fit in
1107
                        the to buffer
1108
1109
  DESCRIPTION
1110
    This escapes the contents of a string by adding backslashes before special
1111
    characters, and turning others into specific escape sequences, such as
1112
    turning newlines into \n and null bytes into \0.
1113
1114
  NOTE
1115
    To maintain compatibility with the old C API, to_length may be 0 to mean
1116
    "big enough"
1117
1118
  RETURN VALUES
1119
    #           The length of the escaped string
1120
*/
1121
1122
size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
1123
                               char *to, size_t to_length,
1124
                               const char *from, size_t length,
1125
                               my_bool *overflow)
1126
0
{
1127
0
  const char *to_start= to;
1128
0
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1129
0
  *overflow= FALSE;
1130
0
  for (end= from + length; from < end; from++)
1131
0
  {
1132
0
    char escape= 0;
1133
0
#ifdef USE_MB
1134
0
    int tmp_length= my_ci_charlen(charset_info, (const uchar *) from, (const uchar *) end);
1135
0
    if (tmp_length > 1)
1136
0
    {
1137
0
      if (to + tmp_length > to_end)
1138
0
      {
1139
0
        *overflow= TRUE;
1140
0
        break;
1141
0
      }
1142
0
      while (tmp_length--)
1143
0
  *to++= *from++;
1144
0
      from--;
1145
0
      continue;
1146
0
    }
1147
    /*
1148
     If the next character appears to begin a multi-byte character, we
1149
     escape that first byte of that apparent multi-byte character. (The
1150
     character just looks like a multi-byte character -- if it were actually
1151
     a multi-byte character, it would have been passed through in the test
1152
     above.)
1153
1154
     Without this check, we can create a problem by converting an invalid
1155
     multi-byte character into a valid one. For example, 0xbf27 is not
1156
     a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
1157
    */
1158
0
    if (tmp_length < 1) /* Bad byte sequence */
1159
0
      escape= *from;
1160
0
    else
1161
0
#endif
1162
0
    switch (*from) {
1163
0
    case 0:       /* Must be escaped for 'mysql' */
1164
0
      escape= '0';
1165
0
      break;
1166
0
    case '\n':        /* Must be escaped for logs */
1167
0
      escape= 'n';
1168
0
      break;
1169
0
    case '\r':
1170
0
      escape= 'r';
1171
0
      break;
1172
0
    case '\\':
1173
0
      escape= '\\';
1174
0
      break;
1175
0
    case '\'':
1176
0
      escape= '\'';
1177
0
      break;
1178
0
    case '"':       /* Better safe than sorry */
1179
0
      escape= '"';
1180
0
      break;
1181
0
    case '\032':      /* This gives problems on Win32 */
1182
0
      escape= 'Z';
1183
0
      break;
1184
0
    }
1185
0
    if (escape)
1186
0
    {
1187
0
      if (to + 2 > to_end)
1188
0
      {
1189
0
        *overflow= TRUE;
1190
0
        break;
1191
0
      }
1192
0
      *to++= '\\';
1193
0
      *to++= escape;
1194
0
    }
1195
0
    else
1196
0
    {
1197
0
      if (to + 1 > to_end)
1198
0
      {
1199
0
        *overflow= TRUE;
1200
0
        break;
1201
0
      }
1202
0
      *to++= *from;
1203
0
    }
1204
0
  }
1205
0
  *to= 0;
1206
0
  return (size_t) (to - to_start);
1207
0
}
1208
1209
1210
#ifdef BACKSLASH_MBTAIL
1211
CHARSET_INFO *fs_character_set()
1212
{
1213
  static CHARSET_INFO *fs_cset_cache;
1214
  if (fs_cset_cache)
1215
    return fs_cset_cache;
1216
#ifdef HAVE_CHARSET_cp932
1217
  else if (GetACP() == 932)
1218
    return fs_cset_cache= &my_charset_cp932_japanese_ci;
1219
#endif
1220
  else
1221
    return fs_cset_cache= &my_charset_bin;
1222
}
1223
#endif
1224
1225
/*
1226
  Escape apostrophes by doubling them up
1227
1228
  SYNOPSIS
1229
    escape_quotes_for_mysql()
1230
    charset_info        Charset of the strings
1231
    to                  Buffer for escaped string
1232
    to_length           Length of destination buffer, or 0
1233
    from                The string to escape
1234
    length              The length of the string to escape
1235
    overflow            Set to 1 if the buffer overflows
1236
1237
  DESCRIPTION
1238
    This escapes the contents of a string by doubling up any apostrophes that
1239
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
1240
    effect on the server.
1241
1242
  NOTE
1243
    To be consistent with escape_string_for_mysql(), to_length may be 0 to
1244
    mean "big enough"
1245
1246
  RETURN VALUES
1247
     The length of the escaped string
1248
*/
1249
1250
size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
1251
                               char *to, size_t to_length,
1252
                               const char *from, size_t length,
1253
                               my_bool *overflow)
1254
0
{
1255
0
  const char *to_start= to;
1256
0
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1257
0
#ifdef USE_MB
1258
0
  my_bool use_mb_flag= my_ci_use_mb(charset_info);
1259
0
#endif
1260
0
  *overflow= FALSE;
1261
0
  for (end= from + length; from < end; from++)
1262
0
  {
1263
0
#ifdef USE_MB
1264
0
    int tmp_length;
1265
0
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
1266
0
    {
1267
0
      if (to + tmp_length > to_end)
1268
0
      {
1269
0
        *overflow= TRUE;
1270
0
        break;
1271
0
      }
1272
0
      while (tmp_length--)
1273
0
  *to++= *from++;
1274
0
      from--;
1275
0
      continue;
1276
0
    }
1277
    /*
1278
      We don't have the same issue here with a non-multi-byte character being
1279
      turned into a multi-byte character by the addition of an escaping
1280
      character, because we are only escaping the ' character with itself.
1281
     */
1282
0
#endif
1283
0
    if (*from == '\'')
1284
0
    {
1285
0
      if (to + 2 > to_end)
1286
0
      {
1287
0
        *overflow= TRUE;
1288
0
        break;
1289
0
      }
1290
0
      *to++= '\'';
1291
0
      *to++= '\'';
1292
0
    }
1293
0
    else
1294
0
    {
1295
0
      if (to + 1 > to_end)
1296
0
      {
1297
0
        *overflow= TRUE;
1298
0
        break;
1299
0
      }
1300
0
      *to++= *from;
1301
0
    }
1302
0
  }
1303
0
  *to= 0;
1304
0
  return (size_t) (to - to_start);
1305
0
}
1306
1307
1308
/* How well an OS character set name corresponds to a MySQL character set */
typedef enum my_cs_match_type_enum
{
  my_cs_exact= 0,   /* MySQL and OS charsets are fully compatible */
  my_cs_approx= 1,  /* MySQL charset is very close to OS charset */
  my_cs_unsupp= 2   /*
                      MySQL knows this charset, but it is not supported
                      as client character set.
                    */
} my_cs_match_type;
1319
1320
1321
typedef struct str2str_st
1322
{
1323
  const char* os_name;
1324
  const char* my_name;
1325
  my_cs_match_type param;
1326
} MY_CSET_OS_NAME;
1327
1328
static const MY_CSET_OS_NAME charsets[] =
1329
{
1330
#ifdef _WIN32
1331
  {"cp437",          "cp850",    my_cs_approx},
1332
  {"cp850",          "cp850",    my_cs_exact},
1333
  {"cp852",          "cp852",    my_cs_exact},
1334
  {"cp858",          "cp850",    my_cs_approx},
1335
  {"cp866",          "cp866",    my_cs_exact},
1336
  {"cp874",          "tis620",   my_cs_approx},
1337
  {"cp932",          "cp932",    my_cs_exact},
1338
  {"cp936",          "gbk",      my_cs_approx},
1339
  {"cp949",          "euckr",    my_cs_approx},
1340
  {"cp950",          "big5",     my_cs_exact},
1341
  {"cp1200",         "utf16le",  my_cs_unsupp},
1342
  {"cp1201",         "utf16",    my_cs_unsupp},
1343
  {"cp1250",         "cp1250",   my_cs_exact},
1344
  {"cp1251",         "cp1251",   my_cs_exact},
1345
  {"cp1252",         "latin1",   my_cs_exact},
1346
  {"cp1253",         "greek",    my_cs_exact},
1347
  {"cp1254",         "latin5",   my_cs_exact},
1348
  {"cp1255",         "hebrew",   my_cs_approx},
1349
  {"cp1256",         "cp1256",   my_cs_exact},
1350
  {"cp1257",         "cp1257",   my_cs_exact},
1351
  {"cp10000",        "macroman", my_cs_exact},
1352
  {"cp10001",        "sjis",     my_cs_approx},
1353
  {"cp10002",        "big5",     my_cs_approx},
1354
  {"cp10008",        "gb2312",   my_cs_approx},
1355
  {"cp10021",        "tis620",   my_cs_approx},
1356
  {"cp10029",        "macce",    my_cs_exact},
1357
  {"cp12001",        "utf32",    my_cs_unsupp},
1358
  {"cp20107",        "swe7",     my_cs_exact},
1359
  {"cp20127",        "latin1",   my_cs_approx},
1360
  {"cp20866",        "koi8r",    my_cs_exact},
1361
  {"cp20932",        "ujis",     my_cs_exact},
1362
  {"cp20936",        "gb2312",   my_cs_approx},
1363
  {"cp20949",        "euckr",    my_cs_approx},
1364
  {"cp21866",        "koi8u",    my_cs_exact},
1365
  {"cp28591",        "latin1",   my_cs_approx},
1366
  {"cp28592",        "latin2",   my_cs_exact},
1367
  {"cp28597",        "greek",    my_cs_exact},
1368
  {"cp28598",        "hebrew",   my_cs_exact},
1369
  {"cp28599",        "latin5",   my_cs_exact},
1370
  {"cp28603",        "latin7",   my_cs_exact},
1371
#ifdef UNCOMMENT_THIS_WHEN_WL_4579_IS_DONE
1372
  {"cp28605",        "latin9",   my_cs_exact},
1373
#endif
1374
  {"cp38598",        "hebrew",   my_cs_exact},
1375
  {"cp51932",        "ujis",     my_cs_exact},
1376
  {"cp51936",        "gb2312",   my_cs_exact},
1377
  {"cp51949",        "euckr",    my_cs_exact},
1378
  {"cp51950",        "big5",     my_cs_exact},
1379
#ifdef UNCOMMENT_THIS_WHEN_WL_WL_4024_IS_DONE
1380
  {"cp54936",        "gb18030",  my_cs_exact},
1381
#endif
1382
  {"cp65001",        "utf8mb4",  my_cs_exact},
1383
  {"cp65001",        "utf8mb3",  my_cs_approx},
1384
#else /* not Windows */
1385
1386
  {"646",            "latin1",   my_cs_approx}, /* Default on Solaris */
1387
  {"ANSI_X3.4-1968", "latin1",   my_cs_approx},
1388
  {"ansi1251",       "cp1251",   my_cs_exact},
1389
  {"armscii8",       "armscii8", my_cs_exact},
1390
  {"armscii-8",      "armscii8", my_cs_exact},
1391
  {"ASCII",          "latin1",   my_cs_approx},
1392
  {"Big5",           "big5",     my_cs_exact},
1393
  {"cp1251",         "cp1251",   my_cs_exact},
1394
  {"cp1255",         "hebrew",   my_cs_approx},
1395
  {"CP866",          "cp866",    my_cs_exact},
1396
  {"eucCN",          "gb2312",   my_cs_exact},
1397
  {"euc-CN",         "gb2312",   my_cs_exact},
1398
  {"eucJP",          "ujis",     my_cs_exact},
1399
  {"euc-JP",         "ujis",     my_cs_exact},
1400
  {"eucKR",          "euckr",    my_cs_exact},
1401
  {"euc-KR",         "euckr",    my_cs_exact},
1402
#ifdef UNCOMMENT_THIS_WHEN_WL_WL_4024_IS_DONE
1403
  {"gb18030",        "gb18030",  my_cs_exact},
1404
#endif
1405
  {"gb2312",         "gb2312",   my_cs_exact},
1406
  {"gbk",            "gbk",      my_cs_exact},
1407
  {"georgianps",     "geostd8",  my_cs_exact},
1408
  {"georgian-ps",    "geostd8",  my_cs_exact},
1409
  {"IBM-1252",       "cp1252",   my_cs_exact},
1410
1411
  {"iso88591",       "latin1",   my_cs_approx},
1412
  {"ISO_8859-1",     "latin1",   my_cs_approx},
1413
  {"ISO8859-1",      "latin1",   my_cs_approx},
1414
  {"ISO-8859-1",     "latin1",   my_cs_approx},
1415
1416
  {"iso885913",      "latin7",   my_cs_exact},
1417
  {"ISO_8859-13",    "latin7",   my_cs_exact},
1418
  {"ISO8859-13",     "latin7",   my_cs_exact},
1419
  {"ISO-8859-13",    "latin7",   my_cs_exact},
1420
1421
#ifdef UNCOMMENT_THIS_WHEN_WL_4579_IS_DONE
1422
  {"iso885915",      "latin9",   my_cs_exact},
1423
  {"ISO_8859-15",    "latin9",   my_cs_exact},
1424
  {"ISO8859-15",     "latin9",   my_cs_exact},
1425
  {"ISO-8859-15",    "latin9",   my_cs_exact},
1426
#endif
1427
1428
  {"iso88592",       "latin2",   my_cs_exact},
1429
  {"ISO_8859-2",     "latin2",   my_cs_exact},
1430
  {"ISO8859-2",      "latin2",   my_cs_exact},
1431
  {"ISO-8859-2",     "latin2",   my_cs_exact},
1432
1433
  {"iso88597",       "greek",    my_cs_exact},
1434
  {"ISO_8859-7",     "greek",    my_cs_exact},
1435
  {"ISO8859-7",      "greek",    my_cs_exact},
1436
  {"ISO-8859-7",     "greek",    my_cs_exact},
1437
1438
  {"iso88598",       "hebrew",   my_cs_exact},
1439
  {"ISO_8859-8",     "hebrew",   my_cs_exact},
1440
  {"ISO8859-8",      "hebrew",   my_cs_exact},
1441
  {"ISO-8859-8",     "hebrew",   my_cs_exact},
1442
1443
  {"iso88599",       "latin5",   my_cs_exact},
1444
  {"ISO_8859-9",     "latin5",   my_cs_exact},
1445
  {"ISO8859-9",      "latin5",   my_cs_exact},
1446
  {"ISO-8859-9",     "latin5",   my_cs_exact},
1447
1448
  {"koi8r",          "koi8r",    my_cs_exact},
1449
  {"KOI8-R",         "koi8r",    my_cs_exact},
1450
  {"koi8u",          "koi8u",    my_cs_exact},
1451
  {"KOI8-U",         "koi8u",    my_cs_exact},
1452
1453
  {"roman8",         "hp8",      my_cs_exact}, /* Default on HP UX */
1454
1455
  {"Shift_JIS",      "sjis",     my_cs_exact},
1456
  {"SJIS",           "sjis",     my_cs_exact},
1457
  {"shiftjisx0213",  "sjis",     my_cs_exact},
1458
1459
  {"tis620",         "tis620",   my_cs_exact},
1460
  {"tis-620",        "tis620",   my_cs_exact},
1461
1462
  {"ujis",           "ujis",     my_cs_exact},
1463
1464
  {"US-ASCII",       "latin1",   my_cs_approx},
1465
1466
  {"utf8",           "utf8",     my_cs_exact},
1467
  {"utf-8",          "utf8",     my_cs_exact},
1468
#endif
1469
  {NULL,             NULL,       0}
1470
};
1471
1472
1473
static const char*
1474
my_os_charset_to_mysql_charset(const char* csname)
1475
0
{
1476
0
  const MY_CSET_OS_NAME* csp;
1477
0
  for (csp = charsets; csp->os_name; csp++)
1478
0
  {
1479
0
    if (!strcasecmp(csp->os_name, csname))
1480
0
    {
1481
0
      switch (csp->param)
1482
0
      {
1483
0
      case my_cs_exact:
1484
0
        return csp->my_name;
1485
1486
0
      case my_cs_approx:
1487
        /*
1488
          Maybe we should print a warning eventually:
1489
          character set correspondence is not exact.
1490
        */
1491
0
        return csp->my_name;
1492
1493
0
      default:
1494
0
        return NULL;
1495
0
      }
1496
0
    }
1497
0
  }
1498
0
  return NULL;
1499
0
}
1500
1501
const char* my_default_csname()
1502
0
{
1503
0
  const char* csname = NULL;
1504
#ifdef _WIN32
1505
  char cpbuf[64];
1506
  UINT cp;
1507
  if (GetACP() == CP_UTF8)
1508
    cp= CP_UTF8;
1509
  else
1510
  {
1511
    cp= GetConsoleCP();
1512
    if (cp == 0)
1513
      cp= GetACP();
1514
  }
1515
  snprintf(cpbuf, sizeof(cpbuf), "cp%d", (int)cp);
1516
  csname = my_os_charset_to_mysql_charset(cpbuf);
1517
#elif defined(HAVE_SETLOCALE) && defined(HAVE_NL_LANGINFO)
1518
0
  if (setlocale(LC_CTYPE, "") && (csname = nl_langinfo(CODESET)))
1519
0
    csname = my_os_charset_to_mysql_charset(csname);
1520
0
#endif
1521
0
  return csname ? csname : MYSQL_DEFAULT_CHARSET_NAME;
1522
0
}
1523
1524
1525
#ifdef _WIN32
1526
/**
1527
  Extract codepage number from "cpNNNN" string,
1528
  and check that this codepage is supported.
1529
1530
  @return 0 - invalid codepage(or unsupported)
1531
          > 0 - valid codepage number.
1532
*/
1533
static UINT get_codepage(const char *s)
1534
{
1535
  UINT cp;
1536
  if (s[0] != 'c' || s[1] != 'p')
1537
  {
1538
    DBUG_ASSERT(0);
1539
    return 0;
1540
  }
1541
  cp= strtoul(s + 2, NULL, 10);
1542
  if (!IsValidCodePage(cp))
1543
  {
1544
    /*
1545
     Can happen also with documented CP, i.e 51936
1546
     Perhaps differs from one machine to another.
1547
    */
1548
    return 0;
1549
  }
1550
  return cp;
1551
}
1552
1553
static UINT mysql_charset_to_codepage(const char *my_cs_name)
1554
{
1555
  const MY_CSET_OS_NAME *csp;
1556
  UINT cp=0,tmp;
1557
  for (csp= charsets; csp->os_name; csp++)
1558
  {
1559
    if (!strcasecmp(csp->my_name, my_cs_name))
1560
    {
1561
      switch (csp->param)
1562
      {
1563
      case my_cs_exact:
1564
        tmp= get_codepage(csp->os_name);
1565
        if (tmp)
1566
          return tmp;
1567
        break;
1568
      case my_cs_approx:
1569
        /*
1570
          don't return just yet, perhaps there is a better
1571
          (exact) match later.
1572
        */
1573
        if (!cp)
1574
          cp= get_codepage(csp->os_name);
1575
        continue;
1576
1577
      default:
1578
        return 0;
1579
      }
1580
    }
1581
  }
1582
  return cp;
1583
}
1584
1585
/** Set console codepage for MariaDB's charset name */
1586
int my_set_console_cp(const char *csname)
1587
{
1588
  UINT cp;
1589
  if (fileno(stdout) < 0 || !isatty(fileno(stdout)))
1590
    return 0;
1591
  cp= mysql_charset_to_codepage(csname);
1592
  if (!cp)
1593
  {
1594
    /* No compatible os charset.*/
1595
    return -1;
1596
  }
1597
1598
  if (GetConsoleOutputCP() != cp && !SetConsoleOutputCP(cp))
1599
  {
1600
    return -1;
1601
  }
1602
1603
  if (GetConsoleCP() != cp && !SetConsoleCP(cp))
1604
  {
1605
    return -1;
1606
  }
1607
  return 0;
1608
}
1609
#endif