Coverage Report

Created: 2024-10-16 07:58

/rust/registry/src/index.crates.io-6f17d22bba15001f/unicode-width-0.1.13/src/lib.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2
// file at the top-level directory of this distribution and at
3
// http://rust-lang.org/COPYRIGHT.
4
//
5
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8
// option. This file may not be copied, modified, or distributed
9
// except according to those terms.
10
11
//! Determine displayed width of `char` and `str` types according to
12
//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
13
//! and other portions of the Unicode standard.
14
//! See the [Rules for determining width](#rules-for-determining-width) section
15
//! for the exact rules.
16
//!
17
//! This crate is `#![no_std]`.
18
//!
19
//! ```rust
20
//! use unicode_width::UnicodeWidthStr;
21
//!
22
//! let teststr = "Hello, world!";
23
//! let width = UnicodeWidthStr::width(teststr);
24
//! println!("{}", teststr);
25
//! println!("The above string is {} columns wide.", width);
26
//! let width = teststr.width_cjk();
27
//! println!("The above string is {} columns wide (CJK).", width);
28
//! ```
29
//!
30
//! # Rules for determining width
31
//!
32
//! This crate currently uses the following rules to determine the width of a
33
//! character or string, in order of decreasing precedence. These may be tweaked in the future.
34
//!
35
//! 1. [Emoji presentation sequences] have width 2.
36
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
37
//!    if their base character:
38
//!    - Has the [`Emoji_Presentation`] property, and
39
//!    - Is not in the [Enclosed Ideographic Supplement] block.
40
//! 3. The sequence `"\r\n"` has width 1.
41
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
42
//!    followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43
//! 5. In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
44
//! 6. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
45
//! 7. The following have width 0:
46
//!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
47
//!       with the [`Default_Ignorable_Code_Point`] property.
48
//!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
49
//!       with the [`Grapheme_Extend`] property.
50
//!    - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
51
//!      - [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0),
52
//!      - [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7),
53
//!      - [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8),
54
//!      - [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA),
55
//!      - [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB),
56
//!      - [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B),
57
//!      - [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and
58
//!      - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
59
//!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
60
//!       with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
61
//!    - The following [`Prepended_Concatenation_Mark`]s:
62
//!      - [`'\u{0605}'` NUMBER MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0605),
63
//!      - [`'\u{070F}'` SYRIAC ABBREVIATION MARK](https://util.unicode.org/UnicodeJsps/character.jsp?a=070F),
64
//!      - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
65
//!      - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
66
//!      - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
67
//!    - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
68
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
69
//!    with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
70
//! 9. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
71
//!    - Has an [`East_Asian_Width`] of [`Ambiguous`], or
72
//!      has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
73
//!      is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
74
//!    - Does not have a [`General_Category`] of `Modifier_Symbol`, and
75
//!    - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
76
//! 10. All other characters have width 1.
77
//!
78
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
79
//!
80
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
81
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
82
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
83
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
84
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
85
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
86
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
87
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
88
//!
89
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
90
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
91
//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
92
//!
93
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
94
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
95
//!
96
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
97
//!
98
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
99
//!
100
//! ## Canonical equivalence
101
//!
102
//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).
103
104
#![forbid(unsafe_code)]
105
#![deny(missing_docs)]
106
#![doc(
107
    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
108
    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
109
)]
110
#![no_std]
111
112
use tables::charwidth as cw;
113
pub use tables::UNICODE_VERSION;
114
115
mod tables;
116
117
/// Methods for determining displayed width of Unicode characters.
118
pub trait UnicodeWidthChar {
119
    /// Returns the character's displayed width in columns, or `None` if the
120
    /// character is a control character (for `char`: U+0000..U+0020 and U+007F..U+00A0, end-exclusive).
121
    ///
122
    /// Characters that
123
    /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) classifies as Ambiguous
124
    /// are treated as 1 column wide. This is consistent with the recommendations for non-CJK
125
    /// contexts, or when the context cannot be reliably determined.
126
    fn width(self) -> Option<usize>;
127
128
    /// Returns the character's displayed width in columns, or `None` if the
129
    /// character is a control character (for `char`: U+0000..U+0020 and U+007F..U+00A0, end-exclusive).
130
    ///
131
    /// Characters that
132
    /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) classifies as Ambiguous
133
    /// are treated as 2 columns wide. This is consistent with the recommendations for
134
    /// CJK contexts.
135
    fn width_cjk(self) -> Option<usize>;
136
}
137
138
impl UnicodeWidthChar for char { // both methods delegate to `single_char_width`
139
    #[inline]
140
0
    fn width(self) -> Option<usize> {
141
0
        single_char_width(self, false) // is_cjk = false: ambiguous-width characters count as 1
142
0
    }
143
144
    #[inline]
145
0
    fn width_cjk(self) -> Option<usize> {
146
0
        single_char_width(self, true) // is_cjk = true: ambiguous-width characters count as 2
147
0
    }
148
}
149
150
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
151
/// `None` if `c` is a control character (U+0000..U+0020 or U+007F..U+00A0, end-exclusive).
152
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
153
/// they're treated as single width.
154
#[inline]
155
0
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
156
0
    if c < '\u{7F}' {
157
0
        if c >= '\u{20}' {
158
            // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
159
0
            Some(1)
160
        } else {
161
            // U+0000 to U+0020 (exclusive) are control codes
162
0
            None
163
        }
164
0
    } else if c >= '\u{A0}' {
165
        // No characters >= U+00A0 are control codes, so we can consult the lookup tables
166
0
        Some(cw::lookup_width(c, is_cjk))
167
    } else {
168
        // U+007F to U+00A0 (exclusive) are control codes (DEL and the range up to, not including, U+00A0)
169
0
        None
170
    }
171
0
}
172
173
/// Methods for determining displayed width of Unicode strings.
174
pub trait UnicodeWidthStr {
175
    /// Returns the string's displayed width in columns.
176
    ///
177
    /// This function treats characters in the Ambiguous category according
178
    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
179
    /// as 1 column wide. This is consistent with the recommendations for
180
    /// non-CJK contexts, or when the context cannot be reliably determined.
181
    fn width(&self) -> usize; // in the `str` impl, control characters count as 1 column and "\r\n" counts as 1 in total
182
183
    /// Returns the string's displayed width in columns.
184
    ///
185
    /// This function treats characters in the Ambiguous category according
186
    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
187
    /// as 2 columns wide. This is consistent with the recommendations for
188
    /// CJK contexts.
189
    fn width_cjk(&self) -> usize; // in the `str` impl, control characters count as 1 column and "\r\n" counts as 1 in total
190
}
191
192
impl UnicodeWidthStr for str { // both methods delegate to `str_width`
193
    #[inline]
194
0
    fn width(&self) -> usize {
195
0
        str_width(self, false) // is_cjk = false: ambiguous-width characters count as 1
196
0
    }
Unexecuted instantiation: <str as unicode_width::UnicodeWidthStr>::width
Unexecuted instantiation: <str as unicode_width::UnicodeWidthStr>::width
197
198
    #[inline]
199
0
    fn width_cjk(&self) -> usize {
200
0
        str_width(self, true) // is_cjk = true: ambiguous-width characters count as 2
201
0
    }
202
}
203
204
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
205
enum NextCharInfo { // state carried by `str_width`'s right-to-left fold: what follows the character being measured
206
    #[default]
207
    Default, // fold's starting state (end of string), and the state after any character not listed below
208
    /// `'\n'`
209
    LineFeed,
210
    /// `'\u{0338}'` (COMBINING LONG SOLIDUS OVERLAY)
211
    /// For preserving canonical equivalence with CJK
212
    CombiningLongSolidusOverlay,
213
    /// `'\u{A4FC}'..='\u{A4FD}'`
214
    /// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
215
    TrailingLisuToneLetter,
216
    /// `'\u{FE0E}'`
217
    Vs15,
218
    /// `'\u{FE0F}'`
219
    Vs16,
220
}
221
222
0
fn str_width(s: &str, is_cjk: bool) -> usize { // sums the per-character widths of `s`
223
0
    s.chars()
224
0
        .rfold((0, NextCharInfo::Default), |(sum, next_info), c| { // back to front, so each char sees info about the one after it
225
0
            let (add, info) = width_in_str(c, is_cjk, next_info);
226
0
            (sum + add, info) // `info` becomes `next_info` for the character before `c`
227
0
        })
228
0
        .0 // keep the accumulated width, drop the carried state
229
0
}
230
231
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width that `c` contributes inside a string, plus the `NextCharInfo` to pass to the character before `c`.
232
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
233
/// they're treated as single width. `next_info` describes the character that follows `c` in the string.
234
#[inline]
235
0
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
236
0
    if (is_cjk
237
0
        && next_info == NextCharInfo::CombiningLongSolidusOverlay
238
0
        && matches!(c, '<' | '=' | '>'))
239
0
        || (next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c))
240
    {
241
0
        (2, NextCharInfo::Default) // `<`/`=`/`>` before U+0338 (CJK only), or an emoji presentation sequence (base + U+FE0F)
242
0
    } else if c <= '\u{A0}' {
243
0
        match c {
244
0
            '\n' => (1, NextCharInfo::LineFeed), // recorded so that a directly preceding '\r' adds nothing
245
0
            '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default), // "\r\n" totals 1: the '\n' already contributed it
246
0
            _ => (1, NextCharInfo::Default), // everything else up to U+00A0, control characters included, counts as 1
247
        }
248
    } else {
249
0
        match (c, next_info) {
250
0
            ('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
251
0
                (0, NextCharInfo::Default) // Lisu tone letter pair totals 1: the trailing letter already contributed it
252
            }
253
0
            ('\u{0338}', _) => (0, NextCharInfo::CombiningLongSolidusOverlay), // zero width itself; may widen a preceding `<`/`=`/`>` in CJK mode
254
0
            ('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
255
0
            ('\u{FE0E}', _) => (0, NextCharInfo::Vs15), // zero width itself; the preceding character decides what it means
256
0
            ('\u{FE0F}', _) => (0, NextCharInfo::Vs16), // zero width itself; the preceding character decides what it means
257
            (_, NextCharInfo::Vs15)
258
0
                if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
259
0
            {
260
0
                (1, NextCharInfo::Default) // text presentation sequence outside a CJK context: width 1
261
            }
262
0
            _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default), // no sequence rule applies: use the table width
263
        }
264
    }
265
0
}