Coverage Report

Created: 2025-07-11 06:49

/src/oniguruma/src/sjis.c
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
  sjis.c -  Oniguruma (regular expression library)
3
**********************************************************************/
4
/*-
5
 * Copyright (c) 2002-2020  K.Kosako
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions
10
 * are met:
11
 * 1. Redistributions of source code must retain the above copyright
12
 *    notice, this list of conditions and the following disclaimer.
13
 * 2. Redistributions in binary form must reproduce the above copyright
14
 *    notice, this list of conditions and the following disclaimer in the
15
 *    documentation and/or other materials provided with the distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
 * SUCH DAMAGE.
28
 */
29
30
#include "regint.h"
31
32
/*
 * Byte length of a Shift_JIS character, indexed by its first byte.
 * Entries equal to 2 mark lead bytes (0x81-0x9f and 0xe0-0xfc);
 * every other byte value is treated as a one-byte character.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
50
51
/*
 * Flag table indexed by byte value: 1 when the byte may appear as the
 * second byte of a two-byte character (0x40-0x7e and 0x80-0xfc),
 * 0 otherwise.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
69
70
100k
/* True when `byte` starts a multi-byte character (EncLen_SJIS entry > 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Non-zero when `byte` is flagged in SJIS_CAN_BE_TRAIL_TABLE. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
72
73
static int
74
mbc_enc_len(const UChar* p)
75
696k
{
76
696k
  return EncLen_SJIS[*p];
77
696k
}
78
79
static int
80
is_valid_mbc_string(const UChar* p, const UChar* end)
81
9.06k
{
82
197k
  while (p < end) {
83
189k
    if (*p < 0x80) {
84
159k
      p++;
85
159k
    }
86
29.0k
    else if (*p < 0xa1) {
87
4.91k
      if (*p == 0xa0 || *p == 0x80)
88
8
        return FALSE;
89
4.91k
      p++;
90
4.91k
      if (p >= end) return FALSE;
91
4.87k
      if (*p < 0x40 || *p > 0xfc || *p == 0x7f)
92
29
        return FALSE;
93
4.84k
      p++;
94
4.84k
    }
95
24.1k
    else if (*p < 0xe0) {
96
21.3k
      p++;
97
21.3k
    }
98
2.86k
    else if (*p < 0xfd) {
99
2.84k
      p++;
100
2.84k
      if (p >= end) return FALSE;
101
2.80k
      if (*p < 0x40 || *p > 0xfc || *p == 0x7f)
102
12
        return FALSE;
103
2.79k
      p++;
104
2.79k
    }
105
20
    else
106
20
      return FALSE;
107
189k
  }
108
109
8.92k
  return TRUE;
110
9.06k
}
111
112
static int
113
code_to_mbclen(OnigCodePoint code)
114
319k
{
115
319k
  if (code < 256) {
116
316k
    if (EncLen_SJIS[(int )code] == 1)
117
264k
      return 1;
118
316k
  }
119
2.98k
  else if (code < 0x10000) {
120
2.82k
    if (EncLen_SJIS[(int )(code >>  8) & 0xff] == 2)
121
2.75k
      return 2;
122
2.82k
  }
123
124
53.0k
  return ONIGERR_INVALID_CODE_POINT_VALUE;
125
319k
}
126
127
static OnigCodePoint
128
mbc_to_code(const UChar* p, const UChar* end)
129
235k
{
130
235k
  int c, i, len;
131
235k
  OnigCodePoint n;
132
133
235k
  len = enclen(ONIG_ENCODING_SJIS, p);
134
235k
  c = *p++;
135
235k
  n = c;
136
235k
  if (len == 1) return n;
137
138
39.4k
  for (i = 1; i < len; i++) {
139
19.7k
    if (p >= end) break;
140
19.7k
    c = *p++;
141
19.7k
    n <<= 8;  n += c;
142
19.7k
  }
143
19.7k
  return n;
144
235k
}
145
146
static int
147
code_to_mbc(OnigCodePoint code, UChar *buf)
148
610
{
149
610
  UChar *p = buf;
150
151
610
  if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
152
610
  *p++ = (UChar )(code & 0xff);
153
154
610
  return (int )(p - buf);
155
610
}
156
157
static int
158
mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
159
              const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
160
3.03k
{
161
3.03k
  const UChar* p = *pp;
162
163
3.03k
  if (ONIGENC_IS_MBC_ASCII(p)) {
164
2.34k
    *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
165
2.34k
    (*pp)++;
166
2.34k
    return 1;
167
2.34k
  }
168
684
  else {
169
684
    int i;
170
684
    int len = enclen(ONIG_ENCODING_SJIS, p);
171
172
1.75k
    for (i = 0; i < len; i++) {
173
1.07k
      *lower++ = *p++;
174
1.07k
    }
175
684
    (*pp) += len;
176
684
    return len; /* return byte length of converted char to lower */
177
684
  }
178
3.03k
}
179
180
static UChar*
181
left_adjust_char_head(const UChar* start, const UChar* s)
182
133k
{
183
133k
  const UChar *p;
184
133k
  int len;
185
186
133k
  if (s <= start) return (UChar* )s;
187
122k
  p = s;
188
189
122k
  if (SJIS_ISMB_TRAIL(*p)) {
190
101k
    while (p > start) {
191
100k
      if (! SJIS_ISMB_FIRST(*--p)) {
192
83.7k
        p++;
193
83.7k
        break;
194
83.7k
      }
195
100k
    }
196
84.3k
  }
197
122k
  len = enclen(ONIG_ENCODING_SJIS, p);
198
122k
  if (p + len > s) return (UChar* )p;
199
954
  p += len;
200
954
  return (UChar* )(p + ((s - p) & ~1));
201
122k
}
202
203
static int
204
is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
205
515
{
206
515
  const UChar c = *s;
207
515
  return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
208
515
}
209
210
211
/* Code range table for Hiragana: the first element is the number of
   (from, to) pairs that follow. */
static const OnigCodePoint CR_Hiragana[] = {
  1,                /* number of ranges */
  0x829f, 0x82f1
}; /* CR_Hiragana */
215
216
/* Code range table for Katakana: the first element is the number of
   (from, to) pairs that follow. The first two pairs are one-byte
   codes, the last two are two-byte codes. */
static const OnigCodePoint CR_Katakana[] = {
  4,                /* number of ranges */
  0x00a6, 0x00af,
  0x00b1, 0x00dd,
  0x8340, 0x837e,
  0x8380, 0x8396,
}; /* CR_Katakana */
223
224
static const OnigCodePoint* PropertyList[] = {
225
  CR_Hiragana,
226
  CR_Katakana
227
};
228
229
230
static int
231
property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
232
166
{
233
166
  struct PropertyNameCtype* pc;
234
166
  int len = (int )(end - p);
235
166
  char q[32];
236
237
166
  if (len < sizeof(q) - 1) {
238
164
    xmemcpy(q, p, (size_t )len);
239
164
    q[len] = '\0';
240
164
    pc = onigenc_sjis_lookup_property_name(q, len);
241
164
    if (pc != 0)
242
140
      return pc->ctype;
243
164
  }
244
245
26
  return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
246
166
}
247
248
static int
249
is_code_ctype(OnigCodePoint code, unsigned int ctype)
250
242k
{
251
242k
  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
252
242k
    if (code < 128)
253
181k
      return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
254
61.0k
    else {
255
61.0k
      if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
256
43.8k
        return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
257
43.8k
      }
258
61.0k
    }
259
242k
  }
260
0
  else {
261
0
    ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
262
0
    if (ctype >= (unsigned int )(sizeof(PropertyList)/sizeof(PropertyList[0])))
263
0
      return ONIGERR_TYPE_BUG;
264
265
0
    return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
266
0
  }
267
268
17.1k
  return FALSE;
269
242k
}
270
271
static int
272
get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
273
                     const OnigCodePoint* ranges[])
274
1.03k
{
275
1.03k
  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
276
998
    return ONIG_NO_SUPPORT_CONFIG;
277
998
  }
278
32
  else {
279
32
    *sb_out = 0x80;
280
281
32
    ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
282
32
    if (ctype >= (OnigCtype )(sizeof(PropertyList)/sizeof(PropertyList[0])))
283
0
      return ONIGERR_TYPE_BUG;
284
285
32
    *ranges = PropertyList[ctype];
286
32
    return 0;
287
32
  }
288
1.03k
}
289
290
OnigEncodingType OnigEncodingSJIS = {
291
  mbc_enc_len,
292
  "Shift_JIS",   /* name */
293
  2,             /* max enc length */
294
  1,             /* min enc length */
295
  onigenc_is_mbc_newline_0x0a,
296
  mbc_to_code,
297
  code_to_mbclen,
298
  code_to_mbc,
299
  mbc_case_fold,
300
  onigenc_ascii_apply_all_case_fold,
301
  onigenc_ascii_get_case_fold_codes_by_str,
302
  property_name_to_ctype,
303
  is_code_ctype,
304
  get_ctype_code_range,
305
  left_adjust_char_head,
306
  is_allowed_reverse_match,
307
  NULL, /* init */
308
  NULL, /* is_initialized */
309
  is_valid_mbc_string,
310
  ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
311
  0, 0
312
};