Coverage Report

Created: 2024-06-18 07:03

/src/server/strings/ctype-mb.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (c) 2000, 2014, Oracle and/or its affiliates.
2
   Copyright (c) 2009, 2021, MariaDB Corporation.
3
4
   This program is free software; you can redistribute it and/or modify
5
   it under the terms of the GNU General Public License as published by
6
   the Free Software Foundation; version 2 of the License.
7
8
   This program is distributed in the hope that it will be useful,
9
   but WITHOUT ANY WARRANTY; without even the implied warranty of
10
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
   GNU General Public License for more details.
12
13
   You should have received a copy of the GNU General Public License
14
   along with this program; if not, write to the Free Software
15
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
16
17
#include "strings_def.h"
18
#include <m_ctype.h>
19
#include "ctype-mb.h"
20
21
#ifdef USE_MB
22
23
24
static inline const MY_CASEFOLD_CHARACTER*
25
get_case_info_for_ch(CHARSET_INFO *cs, uint page, uint offs)
26
0
{
27
0
  const MY_CASEFOLD_CHARACTER *p;
28
0
  return cs->casefold && (p= cs->casefold->page[page]) ? &p[offs] : NULL;
29
0
}
30
31
32
/*
33
  Case folding functions for CJK character set.
34
  Case conversion can optionally reduce string octet length.
35
  For example, in EUCKR,
36
    _euckr 0xA9A5 == "LATIN LETTER DOTLESS I" (Turkish letter)
37
  is upper-cased to to
38
    _euckr 0x49 "LATIN CAPITAL LETTER I"  ('usual' letter I)
39
  Length is reduced in this example from two bytes to one byte.
40
*/
41
static size_t
42
my_casefold_mb(CHARSET_INFO *cs,
43
               const char *src, size_t srclen,
44
               char *dst, size_t dstlen __attribute__((unused)),
45
               const uchar *map,
46
               size_t is_upper)
47
0
{
48
0
  const char *srcend= src + srclen;
49
0
  char *dst0= dst;
50
51
0
  DBUG_ASSERT(cs->mbmaxlen == 2);
52
53
0
  while (src < srcend)
54
0
  {
55
0
    size_t mblen= my_ismbchar(cs, src, srcend);
56
0
    if (mblen)
57
0
    {
58
0
      const MY_CASEFOLD_CHARACTER *ch;
59
0
      if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1])))
60
0
      {
61
0
        int code= is_upper ? ch->toupper : ch->tolower;
62
0
        src+= 2;
63
0
        if (code > 0xFF)
64
0
          *dst++= code >> 8;
65
0
        *dst++= code & 0xFF;
66
0
      }
67
0
      else
68
0
      {
69
0
        *dst++= *src++;
70
0
        *dst++= *src++;
71
0
      }
72
0
    }
73
0
    else
74
0
    {
75
0
      *dst++= (char) map[(uchar) *src++];
76
0
    }
77
0
  }
78
0
  return (size_t) (dst - dst0);
79
0
}
80
81
82
size_t
83
my_casedn_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
84
                    char *dst, size_t dstlen)
85
0
{
86
0
  DBUG_ASSERT(dstlen >= srclen * cs->cset->casedn_multiply(cs));
87
0
  DBUG_ASSERT(src != dst || cs->cset->casedn_multiply(cs) == 1);
88
0
  return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_lower, 0);
89
0
}
90
91
92
size_t
93
my_caseup_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
94
             char *dst, size_t dstlen)
95
0
{
96
0
  DBUG_ASSERT(dstlen >= srclen * cs->cset->caseup_multiply(cs));
97
0
  DBUG_ASSERT(src != dst || cs->cset->caseup_multiply(cs) == 1);
98
0
  return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_upper, 1);
99
0
}
100
101
102
/*
103
** Compare string against string with wildcard
104
**  0 if matched
105
**  -1 if not matched with wildcard
106
**   1 if not matched (mismatch on a literal pattern character)
107
*/
108
109
0
/*
  Advance pointer A by one character: by the length my_ismbchar(cs,A,B)
  reports, or by one byte when it reports 0.
  Note: A and the my_ismbchar() call are expanded twice, so A must not
  have side effects.
*/
#define INC_PTR(cs,A,B) A+=(my_ismbchar(cs,A,B) ? my_ismbchar(cs,A,B) : 1)
110
111
0
/* Map byte A (taken as uchar) through the sort_order[] table of charset s */
#define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)]
112
113
static
114
int my_wildcmp_mb_impl(CHARSET_INFO *cs,
115
                       const char *str,const char *str_end,
116
                       const char *wildstr,const char *wildend,
117
                       int escape, int w_one, int w_many, int recurse_level)
118
0
{
119
0
  int result= -1;       /* Not found, using wildcards */
120
121
0
  if (my_string_stack_guard && my_string_stack_guard(recurse_level))
122
0
    return 1;
123
0
  while (wildstr != wildend)
124
0
  {
125
0
    while (*wildstr != w_many && *wildstr != w_one)
126
0
    {
127
0
      int l;
128
0
      if (*wildstr == escape && wildstr+1 != wildend)
129
0
  wildstr++;
130
0
      if ((l = my_ismbchar(cs, wildstr, wildend)))
131
0
      {
132
0
    if (str+l > str_end || memcmp(str, wildstr, l) != 0)
133
0
        return 1;
134
0
    str += l;
135
0
    wildstr += l;
136
0
      }
137
0
      else
138
0
      if (str == str_end || likeconv(cs,*wildstr++) != likeconv(cs,*str++))
139
0
  return(1);       /* No match */
140
0
      if (wildstr == wildend)
141
0
  return (str != str_end);   /* Match if both are at end */
142
0
      result=1;         /* Found an anchor char */
143
0
    }
144
0
    if (*wildstr == w_one)
145
0
    {
146
0
      do
147
0
      {
148
0
  if (str == str_end)     /* Skip one char if possible */
149
0
    return (result);
150
0
  INC_PTR(cs,str,str_end);
151
0
      } while (++wildstr < wildend && *wildstr == w_one);
152
0
      if (wildstr == wildend)
153
0
  break;
154
0
    }
155
0
    if (*wildstr == w_many)
156
0
    {           /* Found w_many */
157
0
      uchar cmp;
158
0
      const char* mb = wildstr;
159
0
      int mb_len=0;
160
      
161
0
      wildstr++;
162
      /* Remove any '%' and '_' from the wild search string */
163
0
      for (; wildstr != wildend ; wildstr++)
164
0
      {
165
0
  if (*wildstr == w_many)
166
0
    continue;
167
0
  if (*wildstr == w_one)
168
0
  {
169
0
    if (str == str_end)
170
0
      return (-1);
171
0
    INC_PTR(cs,str,str_end);
172
0
    continue;
173
0
  }
174
0
  break;         /* Not a wild character */
175
0
      }
176
0
      if (wildstr == wildend)
177
0
  return(0);       /* Ok if w_many is last */
178
0
      if (str == str_end)
179
0
  return -1;
180
      
181
0
      if ((cmp= *wildstr) == escape && wildstr+1 != wildend)
182
0
  cmp= *++wildstr;
183
  
184
0
      mb=wildstr;
185
0
      mb_len= my_ismbchar(cs, wildstr, wildend);
186
0
      INC_PTR(cs,wildstr,wildend);   /* This is compared trough cmp */
187
0
      cmp=likeconv(cs,cmp);   
188
0
      do
189
0
      {
190
0
        for (;;)
191
0
        {
192
0
          if (str >= str_end)
193
0
            return -1;
194
0
          if (mb_len)
195
0
          {
196
0
            if (str+mb_len <= str_end && memcmp(str, mb, mb_len) == 0)
197
0
            {
198
0
              str += mb_len;
199
0
              break;
200
0
            }
201
0
          }
202
0
          else if (!my_ismbchar(cs, str, str_end) &&
203
0
                   likeconv(cs,*str) == cmp)
204
0
          {
205
0
            str++;
206
0
            break;
207
0
          }
208
0
          INC_PTR(cs,str, str_end);
209
0
        }
210
0
  {
211
0
    int tmp=my_wildcmp_mb_impl(cs,str,str_end,wildstr,wildend,escape,w_one,
212
0
                                     w_many, recurse_level + 1);
213
0
    if (tmp <= 0)
214
0
      return (tmp);
215
0
  }
216
0
      } while (str != str_end);
217
0
      return(-1);
218
0
    }
219
0
  }
220
0
  return (str != str_end ? 1 : 0);
221
0
}
222
223
/*
  Compare a string against a wildcard pattern.

  Entry point that starts my_wildcmp_mb_impl() at recursion level 1.

  @return The result of my_wildcmp_mb_impl(): 0 on match, non-zero otherwise
*/
int my_wildcmp_mb(CHARSET_INFO *cs,
                  const char *str,const char *str_end,
                  const char *wildstr,const char *wildend,
                  int escape, int w_one, int w_many)
{
  const int first_level= 1;

  return my_wildcmp_mb_impl(cs, str, str_end, wildstr, wildend,
                            escape, w_one, w_many, first_level);
}
232
233
234
/*
  Count the characters in [pos, end).

  Every position where my_ismbchar() reports a length counts as one
  character of that many bytes; any other byte counts as one character.

  @return Number of characters
*/
size_t my_numchars_mb(CHARSET_INFO *cs __attribute__((unused)),
          const char *pos, const char *end)
{
  size_t nchars;

  for (nchars= 0; pos < end; nchars++)
  {
    uint step= my_ismbchar(cs, pos, end);
    pos+= step ? step : 1;
  }
  return nchars;
}
246
247
248
/*
  Return the byte offset reached after skipping "length" characters.

  @param cs      Character set
  @param pos     Start of the string
  @param end     End of the string
  @param length  Number of characters to skip

  @return Byte offset from the start of the string after "length"
          characters. If the string has fewer characters than that,
          the string length in bytes plus 2 is returned, i.e. a value
          larger than the string length.
*/
size_t my_charpos_mb(CHARSET_INFO *cs __attribute__((unused)),
         const char *pos, const char *end, size_t length)
{
  const char *start= pos;

  for ( ; length && pos < end; length--)
  {
    uint mb_len= my_ismbchar(cs, pos, end);
    pos+= mb_len ? mb_len : 1;
  }

  /*
    The offset is computed on integers: forming the pointer "end + 2"
    would point past the end of the buffer, which is undefined behaviour.
  */
  if (length)
    return (size_t) (end - start) + 2;
  return (size_t) (pos - start);
}
261
262
263
/*
264
  Append a badly formed piece of string.
265
  Bad bytes are fixed to '?'.
266
  
267
  @param to        The destination string
268
  @param to_end    The end of the destination string
269
  @param from      The source string
270
  @param from_end  The end of the source string
271
  @param nchars    Write not more than "nchars" characters.
272
  @param status    Copying status, must be previously initialized,
273
                   e.g. using well_formed_char_length() on the original
274
                   full source string.
275
*/
276
static size_t
277
my_append_fix_badly_formed_tail(CHARSET_INFO *cs,
278
                                char *to, char *to_end,
279
                                const char *from, const char *from_end,
280
                                size_t nchars,
281
                                MY_STRCOPY_STATUS *status)
282
0
{
283
0
  char *to0= to;
284
285
0
  for ( ; nchars; nchars--)
286
0
  {
287
0
    int chlen;
288
0
    if ((chlen= my_ci_charlen(cs, (const uchar*) from,
289
0
                                  (const uchar *) from_end)) > 0)
290
0
    {
291
      /* Found a valid character */         /* chlen == 1..MBMAXLEN  */
292
0
      DBUG_ASSERT(chlen <= (int) cs->mbmaxlen);
293
0
      if (to + chlen > to_end)
294
0
        goto end;                           /* Does not fit to "to" */
295
0
      memcpy(to, from, (size_t) chlen);
296
0
      from+= chlen;
297
0
      to+= chlen;
298
0
      continue;
299
0
    }
300
0
    if (chlen == MY_CS_ILSEQ)              /* chlen == 0 */
301
0
    {
302
0
      DBUG_ASSERT(from < from_end);  /* Shouldn't get MY_CS_ILSEQ if empty */
303
0
      goto bad;
304
0
    }
305
    /* Got an incomplete character */       /* chlen == MY_CS_TOOSMALLXXX  */
306
0
    DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6); 
307
0
    DBUG_ASSERT(chlen <= MY_CS_TOOSMALL);
308
0
    if (from >= from_end)                   
309
0
      break;                                /* End of the source string    */
310
0
bad:
311
    /* Bad byte sequence, or incomplete character found */
312
0
    if (!status->m_well_formed_error_pos)
313
0
      status->m_well_formed_error_pos= from;
314
315
0
    if ((chlen= my_ci_wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0)
316
0
      break; /* Question mark does not fit into the destination */
317
0
    to+= chlen;
318
0
    from++;
319
0
  }
320
0
end:
321
0
  status->m_source_end_pos= from;
322
0
  return to - to0;
323
0
}
324
325
326
size_t
327
my_copy_fix_mb(CHARSET_INFO *cs,
328
               char *dst, size_t dst_length,
329
               const char *src, size_t src_length,
330
               size_t nchars, MY_STRCOPY_STATUS *status)
331
0
{
332
0
  size_t well_formed_nchars;
333
0
  size_t well_formed_length;
334
0
  size_t fixed_length;
335
0
  size_t min_length= MY_MIN(src_length, dst_length);
336
337
0
  well_formed_nchars= my_ci_well_formed_char_length(cs, src, src + min_length,
338
0
                                                        nchars, status);
339
0
  DBUG_ASSERT(well_formed_nchars <= nchars);
340
0
  well_formed_length= status->m_source_end_pos - src;
341
0
  if (well_formed_length)
342
0
    memmove(dst, src, well_formed_length);
343
0
  if (!status->m_well_formed_error_pos)
344
0
    return well_formed_length;
345
346
0
  fixed_length= my_append_fix_badly_formed_tail(cs,
347
0
                                                dst + well_formed_length,
348
0
                                                dst + dst_length,
349
0
                                                src + well_formed_length,
350
0
                                                src + src_length,
351
0
                                                nchars - well_formed_nchars,
352
0
                                                status);
353
0
  return well_formed_length + fixed_length;
354
0
}
355
356
357
uint my_instr_mb(CHARSET_INFO *cs,
358
                 const char *b, size_t b_length, 
359
                 const char *s, size_t s_length,
360
                 my_match_t *match, uint nmatch)
361
0
{
362
0
  register const char *end, *b0;
363
0
  int res= 0;
364
  
365
0
  if (s_length <= b_length)
366
0
  {
367
0
    if (!s_length)
368
0
    {
369
0
      if (nmatch)
370
0
      {
371
0
        match->beg= 0;
372
0
        match->end= 0;
373
0
        match->mb_len= 0;
374
0
      }
375
0
      return 1;   /* Empty string is always found */
376
0
    }
377
    
378
0
    b0= b;
379
0
    end= b+b_length-s_length+1;
380
    
381
0
    while (b < end)
382
0
    {
383
0
      int mb_len;
384
      
385
0
      if (!my_ci_strnncoll(cs, (const uchar *) b, s_length,
386
0
                               (const uchar *) s, s_length, 0))
387
0
      {
388
0
        if (nmatch)
389
0
        {
390
0
          match[0].beg= 0;
391
0
          match[0].end= (uint) (b-b0);
392
0
          match[0].mb_len= res;
393
0
          if (nmatch > 1)
394
0
          {
395
0
            match[1].beg= match[0].end;
396
0
            match[1].end= (uint)(match[0].end+s_length);
397
0
            match[1].mb_len= 0; /* Not computed */
398
0
          }
399
0
        }
400
0
        return 2;
401
0
      }
402
0
      mb_len= (mb_len= my_ismbchar(cs, b, end)) ? mb_len : 1;
403
0
      b+= mb_len;
404
0
      b_length-= mb_len;
405
0
      res++;
406
0
    }
407
0
  }
408
0
  return 0;
409
0
}
410
411
412
/*
  Copy one non-ascii character from "src" to "dst", advancing both.
  "dst" must have enough room for the character.
  Note, we don't use sort_order[] in this macro.
  This is correct even for case insensitive collations:
  - basic Latin letters are processed outside this macro;
  - for other characters sort_order[x] is equal to x.

  The switch falls through on purpose: a character of N bytes enters at
  "case N" and executes N copy statements. A result of 0 from
  my_ismbchar() copies a single byte.
*/
#define my_strnxfrm_mb_non_ascii_char(cs, dst, src, se)                  \
{                                                                        \
  switch (my_ismbchar(cs, (const char *) src, (const char *) se))        \
  {                                                                      \
  case 4: *dst++= *src++; /* fall through */                             \
  case 3: *dst++= *src++; /* fall through */                             \
  case 2: *dst++= *src++; /* fall through */                             \
  case 0: *dst++= *src++; /* 0x80..0xFF byte which is not an MB head */  \
  }                                                                      \
}
436
437
438
/*
439
  For character sets with two or three byte multi-byte
440
  characters having multibyte weights *equal* to their codes:
441
  cp932, euckr, gb2312, sjis, eucjpms, ujis.
442
*/
443
size_t my_strnxfrm_mb_internal(CHARSET_INFO *cs, uchar *dst, uchar *de,
444
                               uint *nweights, const uchar *src, size_t srclen)
445
0
{
446
0
  uchar *d0= dst;
447
0
  const uchar *se= src + srclen;
448
0
  const uchar *sort_order= cs->sort_order;
449
450
0
  DBUG_ASSERT(cs->mbmaxlen <= 4);
451
452
  /*
453
    If "srclen" is smaller than both "dstlen" and "nweights"
454
    then we can run a simplified loop -
455
    without checking "nweights" and "de".
456
  */
457
0
  if (de >= d0 + srclen && *nweights >= srclen)
458
0
  {
459
0
    if (sort_order)
460
0
    {
461
      /* Optimized version for a case insensitive collation */
462
0
      for (; src < se; (*nweights)--)
463
0
      {
464
0
        if (*src < 128) /* quickly catch ASCII characters */
465
0
          *dst++= sort_order[*src++];
466
0
        else
467
0
          my_strnxfrm_mb_non_ascii_char(cs, dst, src, se);
468
0
      }
469
0
    }
470
0
    else
471
0
    {
472
      /* Optimized version for a case sensitive collation (no sort_order) */
473
0
      for (; src < se; (*nweights)--)
474
0
      {
475
0
        if (*src < 128) /* quickly catch ASCII characters */
476
0
          *dst++= *src++;
477
0
        else
478
0
          my_strnxfrm_mb_non_ascii_char(cs, dst, src, se);
479
0
      }
480
0
    }
481
0
    goto end;
482
0
  }
483
484
  /*
485
    A thourough loop, checking all possible limits:
486
    "se", "nweights" and "de".
487
  */
488
0
  for (; src < se && *nweights && dst < de; (*nweights)--)
489
0
  {
490
0
    int chlen;
491
0
    if (*src < 128 || !(chlen= my_ismbchar(cs, (const char *) src,
492
0
                                               (const char *) se)))
493
0
    {
494
      /* Single byte character */
495
0
      *dst++= sort_order ? sort_order[*src++] : *src++;
496
0
    }
497
0
    else
498
0
    {
499
      /* Multi-byte character */
500
0
      size_t len= (dst + chlen <= de) ? chlen : de - dst;
501
0
      memcpy(dst, src, len);
502
0
      dst+= len;
503
0
      src+= len;
504
0
    }
505
0
  }
506
507
0
end:
508
0
  return dst - d0;
509
0
}
510
511
512
size_t
513
my_strnxfrm_mb(CHARSET_INFO *cs,
514
               uchar *dst, size_t dstlen, uint nweights,
515
               const uchar *src, size_t srclen, uint flags)
516
0
{
517
0
  uchar *de= dst + dstlen;
518
0
  uchar *d0= dst;
519
0
  dst= d0 + my_strnxfrm_mb_internal(cs, dst, de, &nweights, src, srclen);
520
0
  return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0);
521
0
}
522
523
524
size_t
525
my_strnxfrm_mb_nopad(CHARSET_INFO *cs,
526
                     uchar *dst, size_t dstlen, uint nweights,
527
                     const uchar *src, size_t srclen, uint flags)
528
0
{
529
0
  uchar *de= dst + dstlen;
530
0
  uchar *d0= dst;
531
0
  dst= d0 + my_strnxfrm_mb_internal(cs, dst, de, &nweights, src, srclen);
532
0
  return my_strxfrm_pad_desc_and_reverse_nopad(cs, d0, dst, de, nweights,
533
0
                                               flags, 0);
534
0
}
535
536
537
void
538
my_hash_sort_mb_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
539
                          const uchar *key, size_t len,ulong *nr1, ulong *nr2)
540
0
{
541
0
  register ulong m1= *nr1, m2= *nr2;
542
0
  const uchar *end= key + len;
543
0
  for (; key < end ; key++)
544
0
  {
545
0
    MY_HASH_ADD(m1, m2, (uint)*key);
546
0
  }
547
0
  *nr1= m1;
548
0
  *nr2= m2;
549
0
}
550
551
552
void
553
my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
554
                    const uchar *key, size_t len,ulong *nr1, ulong *nr2)
555
0
{
556
  /*
557
     Remove trailing spaces. We have to do this to be able to compare
558
    'A ' and 'A' as identical
559
  */
560
0
  const uchar *end= skip_trailing_space(key, len);
561
0
  my_hash_sort_mb_nopad_bin(cs, key, end - key, nr1, nr2);
562
0
}
563
564
565
static inline size_t
566
my_repeat_char_native(CHARSET_INFO *cs,
567
                      uchar *dst, size_t dst_size, size_t nchars,
568
                      my_wc_t native_code)
569
0
{
570
0
  uchar *dst0= dst;
571
0
  uchar *dstend= dst + dst_size;
572
0
  int chlen= my_ci_native_to_mb(cs, native_code, dst, dstend);
573
0
  if (chlen < 1 /* Not enough space */ || !nchars)
574
0
    return 0;
575
0
  for (dst+= chlen, nchars--;
576
0
       dst + chlen <= dstend && nchars > 0;
577
0
       dst+= chlen, nchars--)
578
0
    memcpy(dst, dst0, chlen);
579
0
  return dst - dst0;
580
0
}
581
582
583
size_t my_min_str_mb_simple(CHARSET_INFO *cs,
584
                            uchar *dst, size_t dst_size, size_t nchars)
585
0
{
586
0
  return my_repeat_char_native(cs, dst, dst_size, nchars, cs->min_sort_char);
587
0
}
588
589
590
size_t my_min_str_mb_simple_nopad(CHARSET_INFO *cs,
591
                                  uchar *dst, size_t dst_size, size_t nchars)
592
0
{
593
  /* For NOPAD collations, the empty string is the smallest possible */
594
0
  return 0;
595
0
}
596
597
598
size_t my_max_str_mb_simple(CHARSET_INFO *cs,
599
                            uchar *dst, size_t dst_size, size_t nchars)
600
0
{
601
0
  return my_repeat_char_native(cs, dst, dst_size, nchars, cs->max_sort_char);
602
0
}
603
604
605
/* 
606
  Fill the given buffer with 'maximum character' for given charset
607
  SYNOPSIS
608
      pad_max_char()
609
      cs   Character set
610
      str  Start of buffer to fill
611
      end  End of buffer to fill
612
613
  DESCRIPTION
614
      Write max key:
615
      - for non-Unicode character sets:
616
        just bfill using max_sort_char if max_sort_char is one byte.
617
        In case when max_sort_char is two bytes, fill with double-byte pairs
618
        and optionally pad with a single space character.
619
      - for Unicode character set (utf-8):
620
        create a buffer with multibyte representation of the max_sort_char
621
        character, and copy it into max_str in a loop. 
622
*/
623
static void pad_max_char(CHARSET_INFO *cs, char *str, char *end)
624
0
{
625
0
  char buf[10];
626
0
  char buflen= my_ci_native_to_mb(cs, cs->max_sort_char, (uchar*) buf,
627
0
                                      (uchar*) buf + sizeof(buf));
628
0
  DBUG_ASSERT(buflen > 0);
629
0
  do
630
0
  {
631
0
    if ((str + buflen) <= end)
632
0
    {
633
      /* Enough space for the character */
634
0
      memcpy(str, buf, buflen);
635
0
      str+= buflen;
636
0
    }
637
0
    else
638
0
    {
639
      /* 
640
        There is no space for whole multibyte
641
        character, then add trailing spaces.
642
      */  
643
0
      *str++= ' ';
644
0
    }
645
0
  } while (str < end);
646
0
}
647
648
/*
649
** Calculate min_str and max_str that ranges a LIKE string.
650
** Arguments:
651
** ptr    Pointer to LIKE string.
652
** ptr_length Length of LIKE string.
653
** escape Escape character in LIKE.  (Normally '\').
654
**    All escape characters should be removed from min_str and max_str
655
** res_length Length of min_str and max_str.
656
** min_str  Smallest case sensitive string that ranges LIKE.
657
**    Should be space padded to res_length.
658
** max_str  Largest case sensitive string that ranges LIKE.
659
**    Normally padded with the biggest character sort value.
660
**
661
** The function should return 0 if ok and 1 if the LIKE string can't be
662
** optimized !
663
*/
664
665
my_bool my_like_range_mb(CHARSET_INFO *cs,
666
       const char *ptr,size_t ptr_length,
667
       pbool escape, pbool w_one, pbool w_many,
668
       size_t res_length,
669
       char *min_str,char *max_str,
670
       size_t *min_length,size_t *max_length)
671
0
{
672
0
  uint mb_len;
673
0
  const char *end= ptr + ptr_length;
674
0
  char *min_org= min_str;
675
0
  char *min_end= min_str + res_length;
676
0
  char *max_end= max_str + res_length;
677
0
  size_t maxcharlen= res_length / cs->mbmaxlen;
678
0
  const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0);
679
680
0
  for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
681
0
  {
682
    /* We assume here that escape, w_any, w_namy are one-byte characters */
683
0
    if (*ptr == escape && ptr+1 != end)
684
0
      ptr++;                                    /* Skip escape */
685
0
    else if (*ptr == w_one || *ptr == w_many)   /* '_' and '%' in SQL */
686
0
    {      
687
0
fill_max_and_min:
688
      /*
689
        Calculate length of keys:
690
        'a\0\0... is the smallest possible string when we have space expand
691
        a\ff\ff... is the biggest possible string
692
      */
693
0
      *min_length= (cs->state & (MY_CS_BINSORT | MY_CS_NOPAD)) ?
694
0
                    (size_t) (min_str - min_org) :
695
0
                    res_length;
696
      /* Create min key  */
697
0
      do
698
0
      {
699
0
  *min_str++= (char) cs->min_sort_char;
700
0
      } while (min_str != min_end);
701
      
702
      /* 
703
        Write max key: create a buffer with multibyte
704
        representation of the max_sort_char character,
705
        and copy it into max_str in a loop. 
706
      */
707
0
      *max_length= res_length;
708
0
      pad_max_char(cs, max_str, max_end);
709
0
      return 0;
710
0
    }
711
0
    if ((mb_len= my_ismbchar(cs, ptr, end)) > 1)
712
0
    {
713
0
      if (ptr+mb_len > end || min_str+mb_len > min_end)
714
0
        break;
715
0
      while (mb_len--)
716
0
       *min_str++= *max_str++= *ptr++;
717
0
    }
718
0
    else
719
0
    {
720
      /*
721
        Special case for collations with contractions.
722
        For example, in Chezh, 'ch' is a separate letter
723
        which is sorted between 'h' and 'i'.
724
        If the pattern 'abc%', 'c' at the end can mean:
725
        - letter 'c' itself,
726
        - beginning of the contraction 'ch'.
727
728
        If we simply return this LIKE range:
729
730
         'abc\min\min\min' and 'abc\max\max\max'
731
732
        then this query: SELECT * FROM t1 WHERE a LIKE 'abc%'
733
        will only find values starting from 'abc[^h]',
734
        but won't find values starting from 'abch'.
735
736
        We must ignore contraction heads followed by w_one or w_many.
737
        ('Contraction head' means any letter which can be the first
738
        letter in a contraction)
739
740
        For example, for Czech 'abc%', we will return LIKE range,
741
        which is equal to LIKE range for 'ab%':
742
743
        'ab\min\min\min\min' and 'ab\max\max\max\max'.
744
745
      */
746
0
      if (contractions && ptr + 1 < end &&
747
0
          my_uca_can_be_contraction_head(contractions, (uchar) *ptr))
748
0
      {
749
        /* Ptr[0] is a contraction head. */
750
        
751
0
        if (ptr[1] == w_one || ptr[1] == w_many)
752
0
        {
753
          /* Contraction head followed by a wildcard, quit. */
754
0
          goto fill_max_and_min;
755
0
        }
756
        
757
        /*
758
          Some letters can be both contraction heads and contraction tails.
759
          For example, in Danish 'aa' is a separate single letter which
760
          is sorted after 'z'. So 'a' can be both head and tail.
761
          
762
          If ptr[0]+ptr[1] is a contraction,
763
          then put both letters together.
764
          
765
          If ptr[1] can be a contraction part, but ptr[0]+ptr[1]
766
          is not a contraction, then we put only ptr[0],
767
          and continue with ptr[1] on the next loop.
768
        */
769
0
        if (my_uca_can_be_contraction_tail(contractions, (uchar) ptr[1]) &&
770
0
            my_uca_contraction2_weight(contractions, (uchar) ptr[0], ptr[1]))
771
0
        {
772
          /* Contraction found */
773
0
          if (maxcharlen == 1 || min_str + 1 >= min_end)
774
0
          {
775
            /* Both contraction parts don't fit, quit */
776
0
            goto fill_max_and_min;
777
0
          }
778
          
779
          /* Put contraction head */
780
0
          *min_str++= *max_str++= *ptr++;
781
0
          maxcharlen--;
782
0
        }
783
0
      }
784
      /* Put contraction tail, or a single character */
785
0
      *min_str++= *max_str++= *ptr++;    
786
0
    }
787
0
  }
788
789
0
  *min_length= *max_length = (size_t) (min_str - min_org);
790
0
  while (min_str != min_end)
791
0
    *min_str++= *max_str++= ' ';           /* Because if key compression */
792
0
  return 0;
793
0
}
794
795
796
/**
797
   Calculate min_str and max_str that ranges a LIKE string.
798
   Generic function, currently used for ucs2, utf16, utf32,
799
   but should be suitable for any other character sets with
800
   cs->min_sort_char and cs->max_sort_char represented in
801
   Unicode code points.
802
803
   @param cs           Character set and collation pointer
804
   @param ptr          Pointer to LIKE pattern.
805
   @param ptr_length   Length of LIKE pattern.
806
   @param escape       Escape character pattern,  typically '\'.
807
   @param w_one        'One character' pattern,   typically '_'.
808
   @param w_many       'Many characters' pattern, typically '%'.
809
   @param res_length   Length of min_str and max_str.
810
811
   @param[out] min_str Smallest string that ranges LIKE.
812
   @param[out] max_str Largest string that ranges LIKE.
813
   @param[out] min_len Length of min_str
814
   @param[out] max_len Length of max_str
815
816
   @return Optimization status.
817
   @retval FALSE if LIKE pattern can be optimized
818
   @rerval TRUE if LIKE can't be optimized.
819
*/
820
my_bool
821
my_like_range_generic(CHARSET_INFO *cs,
822
                      const char *ptr, size_t ptr_length,
823
                      pbool escape, pbool w_one, pbool w_many,
824
                      size_t res_length,
825
                      char *min_str,char *max_str,
826
                      size_t *min_length,size_t *max_length)
827
0
{
828
0
  const char *end= ptr + ptr_length;
829
0
  const char *min_org= min_str;
830
0
  const char *max_org= max_str;
831
0
  char *min_end= min_str + res_length;
832
0
  char *max_end= max_str + res_length;
833
0
  size_t charlen= res_length / cs->mbmaxlen;
834
0
  size_t res_length_diff;
835
0
  const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0);
836
837
0
  for ( ; charlen > 0; charlen--)
838
0
  {
839
0
    my_wc_t wc, wc2;
840
0
    int res;
841
0
    if ((res= my_ci_mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0)
842
0
    {
843
0
      if (res == MY_CS_ILSEQ) /* Bad sequence */
844
0
        return TRUE; /* min_length and max_length are not important */
845
0
      break; /* End of the string */
846
0
    }
847
0
    ptr+= res;
848
849
0
    if (wc == (my_wc_t) escape)
850
0
    {
851
0
      if ((res= my_ci_mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0)
852
0
      {
853
0
        if (res == MY_CS_ILSEQ)
854
0
          return TRUE; /* min_length and max_length are not important */
855
        /*
856
           End of the string: Escape is the last character.
857
           Put escape as a normal character.
858
           We'll will leave the loop on the next iteration.
859
        */
860
0
      }
861
0
      else
862
0
        ptr+= res;
863
864
      /* Put escape character to min_str and max_str  */
865
0
      if ((res= my_ci_wc_mb(cs, wc, (uchar*) min_str, (uchar*) min_end)) <= 0)
866
0
        goto pad_set_lengths; /* No space */
867
0
      min_str+= res;
868
869
0
      if ((res= my_ci_wc_mb(cs, wc, (uchar*) max_str, (uchar*) max_end)) <= 0)
870
0
        goto pad_set_lengths; /* No space */
871
0
      max_str+= res;
872
0
      continue;
873
0
    }
874
0
    else if (wc == (my_wc_t) w_one)
875
0
    {
876
0
      if ((res= my_ci_wc_mb(cs, cs->min_sort_char,
877
0
                             (uchar*) min_str, (uchar*) min_end)) <= 0)
878
0
        goto pad_set_lengths;
879
0
      min_str+= res;
880
881
0
      if ((res= my_ci_wc_mb(cs, cs->max_sort_char,
882
0
                             (uchar*) max_str, (uchar*) max_end)) <= 0)
883
0
        goto pad_set_lengths;
884
0
      max_str+= res;
885
0
      continue;
886
0
    }
887
0
    else if (wc == (my_wc_t) w_many)
888
0
    {
889
      /*
890
        Calculate length of keys:
891
        a\min\min... is the smallest possible string
892
        a\max\max... is the biggest possible string
893
      */
894
0
      *min_length= (cs->state & (MY_CS_BINSORT | MY_CS_NOPAD)) ?
895
0
                    (size_t) (min_str - min_org) :
896
0
                    res_length;
897
0
      *max_length= res_length;
898
0
      goto pad_min_max;
899
0
    }
900
901
0
    if (contractions &&
902
0
        my_uca_can_be_contraction_head(contractions, wc) &&
903
0
        (res= my_ci_mb_wc(cs, &wc2, (uchar*) ptr, (uchar*) end)) > 0)
904
0
    {
905
0
      const uint16 *weight;
906
0
      if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many))
907
0
      {
908
        /* Contraction head followed by a wildcard */
909
0
        *min_length= *max_length= res_length;
910
0
        goto pad_min_max;
911
0
      }
912
913
0
      if (my_uca_can_be_contraction_tail(contractions, wc2) &&
914
0
          (weight= my_uca_contraction2_weight(contractions, wc, wc2)) && weight[0])
915
0
      {
916
        /* Contraction found */
917
0
        if (charlen == 1)
918
0
        {
919
          /* contraction does not fit to result */
920
0
          *min_length= *max_length= res_length;
921
0
          goto pad_min_max;
922
0
        }
923
924
0
        ptr+= res;
925
0
        charlen--;
926
927
        /* Put contraction head */
928
0
        if ((res= my_ci_wc_mb(cs, wc, (uchar*) min_str, (uchar*) min_end)) <= 0)
929
0
          goto pad_set_lengths;
930
0
        min_str+= res;
931
932
0
        if ((res= my_ci_wc_mb(cs, wc, (uchar*) max_str, (uchar*) max_end)) <= 0)
933
0
          goto pad_set_lengths;
934
0
        max_str+= res;
935
0
        wc= wc2; /* Prepare to put contraction tail */
936
0
      }
937
0
    }
938
939
    /* Normal character, or contraction tail */
940
0
    if ((res= my_ci_wc_mb(cs, wc, (uchar*) min_str, (uchar*) min_end)) <= 0)
941
0
      goto pad_set_lengths;
942
0
    min_str+= res;
943
0
    if ((res= my_ci_wc_mb(cs, wc, (uchar*) max_str, (uchar*) max_end)) <= 0)
944
0
      goto pad_set_lengths;
945
0
    max_str+= res;
946
0
  }
947
948
0
pad_set_lengths:
949
0
  *min_length= (size_t) (min_str - min_org);
950
0
  *max_length= (size_t) (max_str - max_org);
951
952
0
pad_min_max:
953
  /*
954
    Fill up max_str and min_str to res_length.
955
    fill() cannot set incomplete characters and
956
    requires that "length" argument is divisible to mbminlen.
957
    Make sure to call fill() with proper "length" argument.
958
  */
959
0
  res_length_diff= res_length % cs->mbminlen;
960
0
  my_ci_fill(cs, min_str, min_end - min_str - res_length_diff,
961
0
                 cs->min_sort_char);
962
0
  my_ci_fill(cs, max_str, max_end - max_str - res_length_diff,
963
0
                 cs->max_sort_char);
964
965
  /* In case of incomplete characters set the remainder to 0x00's */
966
0
  if (res_length_diff)
967
0
  {
968
    /* Example: odd res_length for ucs2 */
969
0
    memset(min_end - res_length_diff, 0, res_length_diff);
970
0
    memset(max_end - res_length_diff, 0, res_length_diff);
971
0
  }
972
0
  return FALSE;
973
0
}
974
975
976
/*
  Recursive worker for my_wildcmp_mb_bin(): match the string [str, str_end)
  against the wildcard pattern [wildstr, wildend) using byte-wise (binary)
  comparison of characters, stepping over multi-byte characters as a unit.

  escape         pattern byte that makes the following character literal
  w_one          pattern byte matching exactly one character
  w_many         pattern byte matching any number of characters
  recurse_level  current recursion depth, passed to my_string_stack_guard

  Returns 0 when the string matches the pattern.
  Returns 1 when it does not match (also when the stack guard fires).
  Returns -1 when it does not match and the string ran out while
  wildcards were being processed.
*/
static int my_wildcmp_mb_bin_impl(CHARSET_INFO *cs,
                                  const char *str,const char *str_end,
                                  const char *wildstr,const char *wildend,
                                  int escape, int w_one, int w_many, int recurse_level)
{
  /* -1 until at least one literal character has been matched, then 1 */
  int miss_code= -1;

  if (my_string_stack_guard && my_string_stack_guard(recurse_level))
    return 1;

  while (wildstr != wildend)
  {
    /* Consume a run of literal (non-wildcard) pattern characters */
    while (*wildstr != w_many && *wildstr != w_one)
    {
      int char_len;

      /* An escape byte makes the next pattern character literal */
      if (*wildstr == escape && wildstr + 1 != wildend)
        wildstr++;

      char_len= my_ismbchar(cs, wildstr, wildend);
      if (char_len)
      {
        /* Multi-byte pattern character: all of its bytes must match */
        if (str + char_len > str_end ||
            memcmp(str, wildstr, char_len) != 0)
          return 1;
        str+= char_len;
        wildstr+= char_len;
      }
      else if (str == str_end || *wildstr++ != *str++)
        return 1;                       /* Single byte differs */

      if (wildstr == wildend)
        return (str != str_end);        /* Match only if both are exhausted */
      miss_code= 1;                     /* A literal has been matched */
    }

    if (*wildstr == w_one)
    {
      /* Every w_one in a row consumes exactly one character of str */
      do
      {
        if (str == str_end)
          return miss_code;
        INC_PTR(cs, str, str_end);
      } while (++wildstr < wildend && *wildstr == w_one);

      if (wildstr == wildend)
        break;
    }

    if (*wildstr == w_many)
    {
      int anchor_byte;          /* First byte of the literal after w_many */
      const char *anchor;       /* Start of that literal in the pattern */
      int anchor_len;           /* Its multi-byte length, 0 if single byte */

      /*
        Collapse the wildcards that follow: repeated w_many are ignored,
        each w_one consumes one character of str.
      */
      for (wildstr++; wildstr != wildend; wildstr++)
      {
        if (*wildstr == w_many)
          continue;
        if (*wildstr != w_one)
          break;                        /* Not a wildcard character */
        if (str == str_end)
          return -1;
        INC_PTR(cs, str, str_end);
      }

      if (wildstr == wildend)
        return 0;                       /* Trailing w_many matches the rest */
      if (str == str_end)
        return -1;

      anchor_byte= *wildstr;
      if (anchor_byte == escape && wildstr + 1 != wildend)
        anchor_byte= *++wildstr;

      anchor= wildstr;
      anchor_len= my_ismbchar(cs, wildstr, wildend);
      /* The anchor character itself is compared inside the scan below */
      INC_PTR(cs, wildstr, wildend);

      do
      {
        int sub_result;

        /* Scan forward in str to the next occurrence of the anchor */
        for (;;)
        {
          if (str >= str_end)
            return -1;
          if (anchor_len)
          {
            if (str + anchor_len <= str_end &&
                memcmp(str, anchor, anchor_len) == 0)
            {
              str+= anchor_len;
              break;
            }
          }
          else if (!my_ismbchar(cs, str, str_end) && *str == anchor_byte)
          {
            str++;
            break;
          }
          INC_PTR(cs, str, str_end);
        }

        /* Try to match the remaining pattern right after this occurrence */
        sub_result= my_wildcmp_mb_bin_impl(cs, str, str_end,
                                           wildstr, wildend, escape,
                                           w_one, w_many, recurse_level + 1);
        if (sub_result <= 0)
          return sub_result;
      } while (str != str_end);

      return -1;
    }
  }

  return (str != str_end ? 1 : 0);
}
1083
1084
int
1085
my_wildcmp_mb_bin(CHARSET_INFO *cs,
1086
                  const char *str,const char *str_end,
1087
                  const char *wildstr,const char *wildend,
1088
                  int escape, int w_one, int w_many)
1089
0
{
1090
0
  return my_wildcmp_mb_bin_impl(cs, str, str_end,
1091
0
                                wildstr, wildend,
1092
0
                                escape, w_one, w_many, 1);
1093
0
}
1094
1095
1096
/*
1097
  Data was produced from EastAsianWidth.txt 
1098
  using utr11-dump utility.
1099
*/
1100
/*
  Per-character flags for wide-character values 0x1100..0x11FF,
  referenced from utr11_data[0x11]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg11[256]=
1101
{
1102
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1103
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1104
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,
1105
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1106
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1107
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1108
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1109
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1110
};
1111
1112
/*
  Per-character flags for wide-character values 0x2300..0x23FF,
  referenced from utr11_data[0x23]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg23[256]=
1113
{
1114
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1115
0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1116
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1117
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1118
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1119
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1120
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1121
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1122
};
1123
1124
/*
  Per-character flags for wide-character values 0x2E00..0x2EFF,
  referenced from utr11_data[0x2E]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg2E[256]=
1125
{
1126
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1127
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1128
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1129
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1130
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,
1131
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1132
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1133
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
1134
};
1135
1136
/*
  Per-character flags for wide-character values 0x2F00..0x2FFF,
  referenced from utr11_data[0x2F]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg2F[256]=
1137
{
1138
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1139
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1140
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1141
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1142
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1143
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1144
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
1145
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0
1146
};
1147
1148
/*
  Per-character flags for wide-character values 0x3000..0x30FF,
  referenced from utr11_data[0x30]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg30[256]=
1149
{
1150
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1151
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
1152
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1153
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1154
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,
1155
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1156
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1157
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1158
};
1159
1160
/*
  Per-character flags for wide-character values 0x3100..0x31FF,
  referenced from utr11_data[0x31]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg31[256]=
1161
{
1162
0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1163
1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1164
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1165
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1166
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1167
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
1168
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1169
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1170
};
1171
1172
/*
  Per-character flags for wide-character values 0x3200..0x32FF,
  referenced from utr11_data[0x32]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg32[256]=
1173
{
1174
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
1175
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1176
1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1177
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,
1178
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1179
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1180
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1181
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
1182
};
1183
1184
/*
  Per-character flags for wide-character values 0x4D00..0x4DFF,
  referenced from utr11_data[0x4D]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg4D[256]=
1185
{
1186
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1187
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1188
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1189
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1190
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1191
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
1192
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1193
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1194
};
1195
1196
/*
  Per-character flags for wide-character values 0x9F00..0x9FFF,
  referenced from utr11_data[0x9F]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pg9F[256]=
1197
{
1198
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1199
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1200
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1201
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1202
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1203
1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1204
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1205
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1206
};
1207
1208
/*
  Per-character flags for wide-character values 0xA400..0xA4FF,
  referenced from utr11_data[0xA4]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pgA4[256]=
1209
{
1210
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1211
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1212
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1213
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1214
1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1215
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1216
1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1217
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1218
};
1219
1220
/*
  Per-character flags for wide-character values 0xD700..0xD7FF,
  referenced from utr11_data[0xD7]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pgD7[256]=
1221
{
1222
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1223
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1224
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1225
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1226
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1227
1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1228
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1229
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1230
};
1231
1232
/*
  Per-character flags for wide-character values 0xFA00..0xFAFF,
  referenced from utr11_data[0xFA]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pgFA[256]=
1233
{
1234
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1235
1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1236
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1237
1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1238
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1239
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1240
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1241
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1242
};
1243
1244
/*
  Per-character flags for wide-character values 0xFE00..0xFEFF,
  referenced from utr11_data[0xFE]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pgFE[256]=
1245
{
1246
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1247
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1248
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,
1249
1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1250
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1251
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1252
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1253
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1254
};
1255
1256
/*
  Per-character flags for wide-character values 0xFF00..0xFFFF,
  referenced from utr11_data[0xFF]. my_numcells_mb() adds the entry
  (0 or 1) to the cell count on top of the one cell every character gets.
*/
static const char pgFF[256]=
1257
{
1258
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1259
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1260
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1261
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1262
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1263
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1264
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1265
1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1266
};
1267
1268
/*
  Lookup table used by my_numcells_mb() for wide characters <= 0xFFFF,
  indexed by bits 8..15 of the character (256 entries, one per page).
  If "p" is not NULL it points to a 256-entry table indexed by the low
  byte of the character; otherwise "page" (0 or 1) applies to every
  character of the page. The selected value is added to the cell count
  on top of the one cell every character gets.
*/
static const struct {int page; const char *p;} utr11_data[256]=
1269
{
1270
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1271
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1272
{0,NULL},{0,pg11},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1273
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1274
{0,NULL},{0,NULL},{0,NULL},{0,pg23},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1275
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,pg2E},{0,pg2F},
1276
{0,pg30},{0,pg31},{0,pg32},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1277
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1278
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1279
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pg4D},{1,NULL},{1,NULL},
1280
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1281
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1282
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1283
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1284
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1285
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1286
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1287
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1288
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1289
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pg9F},
1290
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pgA4},{0,NULL},{0,NULL},{0,NULL},
1291
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1292
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1293
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1294
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1295
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},
1296
{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pgD7},
1297
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1298
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1299
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1300
{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},
1301
{0,NULL},{1,NULL},{0,pgFA},{0,NULL},{0,NULL},{0,NULL},{0,pgFE},{0,pgFF}
1302
};
1303
1304
1305
/*
  Count the display cells occupied by the multi-byte string [b, e).

  Every successfully decoded character counts as one cell, plus one
  extra cell when it is flagged as wide: either through utr11_data
  (characters <= 0xFFFF) or by falling into 0x20000..0x3FFFD.
  A byte that cannot be decoded is skipped and counted as one cell.

  cs  character set used to decode the string
  b   start of the string
  e   end of the string (one past the last byte)

  Returns the total number of display cells.
*/
size_t my_numcells_mb(CHARSET_INFO *cs, const char *b, const char *e)
{
  my_wc_t wc;
  size_t clen= 0;

  while (b < e)
  {
    int mb_len;
    uint pg;
    if ((mb_len= my_ci_mb_wc(cs, &wc, (uchar*) b, (uchar*) e)) <= 0)
    {
      /*
        Let's think a wrong sequence takes 1 display cell:
        count it, then resynchronize on the next byte.
      */
      clen++;
      b++;
      continue;
    }
    b+= mb_len;
    if (wc > 0xFFFF)
    {
      if (wc >= 0x20000 && wc <= 0x3FFFD) /* CJK Ideograph Extension B, C */
        clen+= 1;
    }
    else
    {
      /* Per-page lookup: a per-character table if present, else a page-wide flag */
      pg= (wc >> 8) & 0xFF;
      clen+= utr11_data[pg].p ? utr11_data[pg].p[wc & 0xFF] : utr11_data[pg].page;
    }
    clen++; /* The base cell every decoded character occupies */
  }
  return clen;
}
1335
1336
1337
/*
  Decode one character from [s, e) and store its ctype in *ctype.

  *ctype is set to 0 when decoding fails (result <= 0) or when the decoded
  character is above 0xFFFF. Otherwise it is taken from my_uni_ctype: the
  per-character table of the character's page if that page has one, or the
  page-wide value if it does not.

  Returns the result of my_ci_mb_wc() unchanged.
*/
int my_mb_ctype_mb(CHARSET_INFO *cs, int *ctype,
                   const uchar *s, const uchar *e)
{
  my_wc_t wc;
  int res= my_ci_mb_wc(cs, &wc, s, e);

  if (res > 0 && wc <= 0xFFFF)
  {
    const my_wc_t page= wc >> 8;

    if (my_uni_ctype[page].ctype)
      *ctype= my_uni_ctype[page].ctype[wc & 0xFF];
    else
      *ctype= my_uni_ctype[page].pctype;
  }
  else
  {
    *ctype= 0;
  }
  return res;
}
1350
1351
1352
#endif