/src/gettext-0.26/gettext-tools/libgettextpo/uniwidth/width.c
Line | Count | Source |
1 | | /* Determine display width of Unicode character. |
2 | | Copyright (C) 2001-2002, 2006-2025 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2002. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | #include <config.h> |
19 | | |
20 | | /* Specification. */ |
21 | | #include "uniwidth.h" |
22 | | |
23 | | #include "cjk.h" |
24 | | |
25 | | /* The non-spacing attribute table consists of: |
26 | | * Non-spacing characters; generated from PropList.txt or |
27 | | "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" |
28 | | * Format control characters; generated from |
29 | | "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" |
30 | | * Zero width characters; generated from |
31 | | "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" |
32 | | * Hangul Jamo characters that have conjoining behaviour: |
33 | | - jungseong = syllable-middle vowels |
34 | | - jongseong = syllable-final consonants |
35 | | Rationale: |
36 | | 1) These characters act like combining characters. They have no |
37 | | equivalent in legacy character sets. Therefore the EastAsianWidth.txt |
38 | | file does not really matter for them; UAX #11 East Asian Width |
39 | | <https://www.unicode.org/reports/tr11/> makes it clear that its focus |
40 | | is on compatibility with traditional Japanese layout. |
41 | | By contrast, the same glyphs without conjoining behaviour are available |
42 | | in the U+3130..U+318F block, and these characters are mapped to legacy |
43 | | character sets, and traditional Japanese layout matters for them. |
44 | | 2) glibc does the same thing, see |
45 | | <https://sourceware.org/bugzilla/show_bug.cgi?id=21750> |
46 | | <https://sourceware.org/bugzilla/show_bug.cgi?id=26120> |
47 | | */ |
48 | | #include "uniwidth/width0.h" |
49 | | |
50 | | #include "uniwidth/width2.h" |
51 | | #include "unictype/bitmap.h" |
52 | | |
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any expression of
   array type expands with the intended precedence.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
54 | | |
55 | | |
56 | | /* Determine number of column positions required for UC. */ |
57 | | int |
58 | | uc_width (ucs4_t uc, const char *encoding) |
59 | 4.33M | { |
60 | | /* Test for non-spacing or control character. */ |
61 | 4.33M | if ((uc >> 9) < SIZEOF (nonspacing_table_ind)) |
62 | 4.33M | { |
63 | 4.33M | int ind = nonspacing_table_ind[uc >> 9]; |
64 | 4.33M | if (ind >= 0) |
65 | 3.97M | if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1) |
66 | 2.99M | { |
67 | 2.99M | if (uc > 0 && uc < 0xa0) |
68 | 181k | return -1; |
69 | 2.80M | else |
70 | 2.80M | return 0; |
71 | 2.99M | } |
72 | 4.33M | } |
73 | 0 | else if ((uc >> 9) == (0xe0000 >> 9)) |
74 | 0 | { |
75 | 0 | if (uc >= 0xe0100) |
76 | 0 | { |
77 | 0 | if (uc <= 0xe01ef) |
78 | 0 | return 0; |
79 | 0 | } |
80 | 0 | else |
81 | 0 | { |
82 | 0 | if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001) |
83 | 0 | return 0; |
84 | 0 | } |
85 | 0 | } |
86 | | /* Test for double-width character. */ |
87 | 1.34M | if (bitmap_lookup (&u_width2, uc)) |
88 | 368k | return 2; |
89 | | /* In ancient CJK encodings, Cyrillic and most other characters are |
90 | | double-width as well. */ |
91 | 977k | if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9 |
92 | 12.1k | && is_cjk_encoding (encoding)) |
93 | 12.1k | return 2; |
94 | 965k | return 1; |
95 | 977k | } |