/rust/registry/src/index.crates.io-1949cf8c6b5b557f/unicode-linebreak-0.1.5/src/lib.rs
Line | Count | Source |
1 | | //! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14]. |
2 | | //! |
3 | | //! Given an input text, locates "line break opportunities", or positions appropriate for wrapping |
4 | | //! lines when displaying text. |
5 | | //! |
6 | | //! # Example |
7 | | //! |
8 | | //! ``` |
9 | | //! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
10 | | //! |
11 | | //! let text = "a b \nc"; |
12 | | //! assert!(linebreaks(text).eq([ |
13 | | //! (2, Allowed), // May break after first space |
14 | | //! (5, Mandatory), // Must break after line feed |
15 | | //! (6, Mandatory) // Must break at end of text, so that there always is at least one LB |
16 | | //! ])); |
17 | | //! ``` |
18 | | //! |
19 | | //! [UAX14]: https://www.unicode.org/reports/tr14/ |
20 | | |
21 | | #![no_std] |
22 | | #![deny(missing_docs, missing_debug_implementations)] |
23 | | |
24 | | use core::iter::once; |
25 | | |
26 | | /// The [Unicode version](https://www.unicode.org/versions/) conformed to. |
27 | | pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); |
28 | | |
29 | | include!("shared.rs"); |
30 | | include!("tables.rs"); |
31 | | |
32 | | /// Returns the line break property of the specified code point. |
33 | | /// |
34 | | /// # Examples |
35 | | /// |
36 | | /// ``` |
37 | | /// use unicode_linebreak::{BreakClass, break_property}; |
38 | | /// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic); |
39 | | /// ``` |
40 | | #[inline(always)] |
41 | 42.8M | pub fn break_property(codepoint: u32) -> BreakClass { |
42 | | const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT; |
43 | | const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1; |
44 | | |
45 | 42.8M | let data_pos = if codepoint < BMP_LIMIT { |
46 | 42.7M | let i = codepoint >> BMP_SHIFT; |
47 | 42.7M | BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16 |
48 | 44.9k | } else if codepoint < BREAK_PROP_TRIE_HIGH_START { |
49 | 37.0k | let i1 = codepoint >> SHIFT_1; |
50 | 37.0k | let i2 = BREAK_PROP_TRIE_INDEX |
51 | 37.0k | [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize] |
52 | 37.0k | + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16; |
53 | 37.0k | let i3_block = BREAK_PROP_TRIE_INDEX[i2 as usize]; |
54 | 37.0k | let i3_pos = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16; |
55 | | |
56 | 37.0k | debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected"); |
57 | 37.0k | let data_block = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize]; |
58 | 37.0k | data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16 |
59 | | } else { |
60 | 7.87k | return XX; |
61 | | }; |
62 | 42.8M | BREAK_PROP_TRIE_DATA[data_pos as usize] |
63 | 42.8M | } |
64 | | |
/// The kind of line break opportunity found at a position in the text.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum BreakOpportunity {
    /// The line is required to break at this position.
    Mandatory,
    /// The line is permitted to end at this position.
    Allowed,
}
73 | | |
74 | | /// Returns an iterator over line break opportunities in the specified string. |
75 | | /// |
76 | | /// Break opportunities are given as tuples of the byte index of the character succeeding the break |
77 | | /// and the type. |
78 | | /// |
79 | | /// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent |
80 | | /// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of |
81 | | /// General_Category. |
82 | | /// |
83 | | /// # Examples |
84 | | /// |
85 | | /// ``` |
86 | | /// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
87 | | /// assert!(linebreaks("Hello world!").eq(vec![(6, Allowed), (12, Mandatory)])); |
88 | | /// ``` |
89 | 693k | pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ { |
90 | | use BreakOpportunity::{Allowed, Mandatory}; |
91 | | |
92 | 693k | s.char_indices() |
93 | 42.8M | .map(|(i, c)| (i, break_property(c as u32) as u8)) unicode_linebreak::linebreaks::{closure#0}Line | Count | Source | 93 | 42.8M | .map(|(i, c)| (i, break_property(c as u32) as u8)) |
Unexecuted instantiation: unicode_linebreak::linebreaks::{closure#0} |
94 | 693k | .chain(once((s.len(), eot))) |
95 | 43.5M | .scan((sot, false), |state, (i, cls)| { |
96 | | // ZWJ is handled outside the table to reduce its size |
97 | 43.5M | let val = PAIR_TABLE[state.0 as usize][cls as usize]; |
98 | 43.5M | let is_mandatory = val & MANDATORY_BREAK_BIT != 0; |
99 | 43.5M | let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory); |
100 | 43.5M | *state = ( |
101 | 43.5M | val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), |
102 | 43.5M | cls == BreakClass::ZeroWidthJoiner as u8, |
103 | 43.5M | ); |
104 | | |
105 | 43.5M | Some((i, is_break, is_mandatory)) |
106 | 43.5M | }) unicode_linebreak::linebreaks::{closure#1}Line | Count | Source | 95 | 43.5M | .scan((sot, false), |state, (i, cls)| { | 96 | | // ZWJ is handled outside the table to reduce its size | 97 | 43.5M | let val = PAIR_TABLE[state.0 as usize][cls as usize]; | 98 | 43.5M | let is_mandatory = val & MANDATORY_BREAK_BIT != 0; | 99 | 43.5M | let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory); | 100 | 43.5M | *state = ( | 101 | 43.5M | val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), | 102 | 43.5M | cls == BreakClass::ZeroWidthJoiner as u8, | 103 | 43.5M | ); | 104 | | | 105 | 43.5M | Some((i, is_break, is_mandatory)) | 106 | 43.5M | }) |
Unexecuted instantiation: unicode_linebreak::linebreaks::{closure#1} |
107 | 43.5M | .filter_map(|(i, is_break, is_mandatory)| { |
108 | 43.5M | if is_break { |
109 | 12.3M | Some((i, if is_mandatory { Mandatory } else { Allowed })) |
110 | | } else { |
111 | 31.1M | None |
112 | | } |
113 | 43.5M | }) unicode_linebreak::linebreaks::{closure#2}Line | Count | Source | 107 | 43.5M | .filter_map(|(i, is_break, is_mandatory)| { | 108 | 43.5M | if is_break { | 109 | 12.3M | Some((i, if is_mandatory { Mandatory } else { Allowed })) | 110 | | } else { | 111 | 31.1M | None | 112 | | } | 113 | 43.5M | }) |
Unexecuted instantiation: unicode_linebreak::linebreaks::{closure#2} |
114 | 693k | } |
115 | | |
116 | | /// Divides the string at the last index where further breaks do not depend on prior context. |
117 | | /// |
118 | | /// The trivial index at `eot` is excluded. |
119 | | /// |
120 | | /// A common optimization is to determine only the nearest line break opportunity before the first |
121 | | /// character that would cause the line to become overfull, requiring backward traversal, of which |
122 | | /// there are two approaches: |
123 | | /// |
124 | | /// * Cache breaks from forward traversals |
125 | | /// * Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly |
126 | | /// |
127 | | /// # Examples |
128 | | /// |
129 | | /// ``` |
130 | | /// use unicode_linebreak::{linebreaks, split_at_safe}; |
131 | | /// let s = "Not allowed to break within em dashes: — —"; |
132 | | /// let (prev, safe) = split_at_safe(s); |
133 | | /// let n = prev.len(); |
134 | | /// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x))))); |
135 | | /// ``` |
136 | 0 | pub fn split_at_safe(s: &str) -> (&str, &str) { |
137 | 0 | let mut chars = s.char_indices().rev().scan(None, |state, (i, c)| { |
138 | 0 | let cls = break_property(c as u32); |
139 | 0 | let is_safe_pair = state |
140 | 0 | .replace(cls) |
141 | 0 | .map_or(false, |prev| is_safe_pair(cls, prev)); // Reversed since iterating backwards |
142 | 0 | Some((i, is_safe_pair)) |
143 | 0 | }); |
144 | 0 | chars.find(|&(_, is_safe_pair)| is_safe_pair); |
145 | | // Include preceding char for `linebreaks` to pick up break before match (disallowed after sot) |
146 | 0 | s.split_at(chars.next().map_or(0, |(i, _)| i)) |
147 | 0 | } |
148 | | |
#[cfg(test)]
mod tests {
    use super::*;

    // Spot-checks `break_property` on one code point per expected class, including the
    // highest code point.
    #[test]
    fn it_works() {
        let expectations = [
            (0xA, BreakClass::LineFeed),
            (0xDB80, BreakClass::Surrogate),
            (0xe01ef, BreakClass::CombiningMark),
            (0x10ffff, BreakClass::Unknown),
        ];

        for &(codepoint, expected) in expectations.iter() {
            assert_eq!(break_property(codepoint), expected);
        }
    }
}