/proc/self/cwd/external/utf8_range/naive.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdio.h> |
2 | | |
3 | | /* |
4 | | * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 |
5 | | * |
6 | | * Table 3-7. Well-Formed UTF-8 Byte Sequences |
7 | | * |
8 | | * +--------------------+------------+-------------+------------+-------------+ |
9 | | * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | |
10 | | * +--------------------+------------+-------------+------------+-------------+ |
11 | | * | U+0000..U+007F | 00..7F | | | | |
12 | | * +--------------------+------------+-------------+------------+-------------+ |
13 | | * | U+0080..U+07FF | C2..DF | 80..BF | | | |
14 | | * +--------------------+------------+-------------+------------+-------------+ |
15 | | * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | |
16 | | * +--------------------+------------+-------------+------------+-------------+ |
17 | | * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | |
18 | | * +--------------------+------------+-------------+------------+-------------+ |
19 | | * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | |
20 | | * +--------------------+------------+-------------+------------+-------------+ |
21 | | * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | |
22 | | * +--------------------+------------+-------------+------------+-------------+ |
23 | | * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | |
24 | | * +--------------------+------------+-------------+------------+-------------+ |
25 | | * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | |
26 | | * +--------------------+------------+-------------+------------+-------------+ |
27 | | * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | |
28 | | * +--------------------+------------+-------------+------------+-------------+ |
29 | | */ |
30 | | |
/*
 * Return non-zero when b is a UTF-8 continuation byte (0x80..0xBF).
 *
 * A mask test is used instead of a signed-char comparison because
 * converting values above 0x7F to signed char is implementation-defined.
 */
static int utf8_naive_is_cont(unsigned char b)
{
    return (b & 0xC0) == 0x80;
}

/*
 * Length in bytes (1..4) of the well-formed UTF-8 sequence starting at p,
 * or 0 when the bytes at p do not start a well-formed sequence.
 *
 * avail is the number of readable bytes at p and must be >= 1. No byte at
 * index >= avail is ever read. The accepted ranges follow Table 3-7 of the
 * Unicode standard: overlong encodings, UTF-16 surrogates (ED A0..BF ..)
 * and code points above U+10FFFF are all rejected.
 */
static int utf8_naive_seq_len(const unsigned char *p, int avail)
{
    const unsigned char byte1 = p[0];
    unsigned char byte2;

    /* 00..7F */
    if (byte1 <= 0x7F) {
        return 1;
    }

    /* Every multi-byte form needs a continuation byte in second place. */
    if (avail < 2 || !utf8_naive_is_cont(p[1])) {
        return 0;
    }
    byte2 = p[1];

    /* C2..DF, 80..BF */
    if (byte1 >= 0xC2 && byte1 <= 0xDF) {
        return 2;
    }

    if (avail < 3 || !utf8_naive_is_cont(p[2])) {
        return 0;
    }

    if (/* E0, A0..BF, 80..BF */
        (byte1 == 0xE0 && byte2 >= 0xA0) ||
        /* E1..EC, 80..BF, 80..BF */
        (byte1 >= 0xE1 && byte1 <= 0xEC) ||
        /* ED, 80..9F, 80..BF */
        (byte1 == 0xED && byte2 <= 0x9F) ||
        /* EE..EF, 80..BF, 80..BF */
        (byte1 >= 0xEE && byte1 <= 0xEF)) {
        return 3;
    }

    if (avail < 4 || !utf8_naive_is_cont(p[3])) {
        return 0;
    }

    if (/* F0, 90..BF, 80..BF, 80..BF */
        (byte1 == 0xF0 && byte2 >= 0x90) ||
        /* F1..F3, 80..BF, 80..BF, 80..BF */
        (byte1 >= 0xF1 && byte1 <= 0xF3) ||
        /* F4, 80..8F, 80..BF, 80..BF */
        (byte1 == 0xF4 && byte2 <= 0x8F)) {
        return 4;
    }

    /* 80..C1 and F5..FF never start a sequence; bad second-byte ranges too. */
    return 0;
}

/*
 * Validate that data[0..len) is well-formed UTF-8.
 *
 * data: bytes to check; not dereferenced when len <= 0.
 * len:  number of bytes at data. A zero or negative length is treated as
 *       empty input and reported as valid (guards against reading out of
 *       bounds on a negative length).
 *
 * Return 0 - success, >0 - index (1 based) of the first byte of the first
 * ill-formed sequence. A sequence cut short by the end of the buffer is
 * reported at its lead byte.
 */
int utf8_naive(const unsigned char *data, int len)
{
    int err_pos = 1;

    while (len > 0) {
        const int bytes = utf8_naive_seq_len(data, len);

        if (bytes == 0) {
            return err_pos;
        }

        len -= bytes;
        err_pos += bytes;
        data += bytes;
    }

    return 0;
}