Coverage Report

Created: 2025-09-27 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/unicode-bidi-0.3.18/src/prepare.rs
Line
Count
Source
1
// Copyright 2015 The Servo Project Developers. See the
2
// COPYRIGHT file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
//! 3.3.3 Preparations for Implicit Processing
11
//!
12
//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14
use alloc::vec::Vec;
15
use core::cmp::max;
16
use core::ops::Range;
17
#[cfg(feature = "smallvec")]
18
use smallvec::{smallvec, SmallVec};
19
20
use super::level::Level;
21
use super::BidiClass::{self, *};
22
23
/// A maximal substring of characters with the same embedding level.
24
///
25
/// Represented as a range of byte indices.
26
pub type LevelRun = Range<usize>;
27
28
#[cfg(feature = "smallvec")]
29
pub type LevelRunVec = SmallVec<[LevelRun; 8]>;
30
#[cfg(not(feature = "smallvec"))]
31
pub type LevelRunVec = Vec<LevelRun>;
32
33
/// Output of `isolating_run_sequences` (steps X9-X10)
34
#[derive(Debug, PartialEq)]
35
pub struct IsolatingRunSequence {
36
    pub runs: Vec<LevelRun>,
37
    pub sos: BidiClass, // Start-of-sequence type.
38
    pub eos: BidiClass, // End-of-sequence type.
39
}
40
41
#[cfg(feature = "smallvec")]
42
pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; 8]>;
43
#[cfg(not(feature = "smallvec"))]
44
pub type IsolatingRunSequenceVec = Vec<IsolatingRunSequence>;
45
46
/// Compute the set of isolating run sequences.
47
///
48
/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
49
/// except the last one in the sequence, the last character of the run is an isolate initiator
50
/// whose matching PDI is the first character of the next level run in the sequence.
51
///
52
/// Note: This function does *not* return the sequences in order by their first characters.
53
#[cfg_attr(feature = "flame_it", flamer::flame)]
54
0
pub fn isolating_run_sequences(
55
0
    para_level: Level,
56
0
    original_classes: &[BidiClass],
57
0
    levels: &[Level],
58
0
    runs: LevelRunVec,
59
0
    has_isolate_controls: bool,
60
0
    isolating_run_sequences: &mut IsolatingRunSequenceVec,
61
0
) {
62
    // Per http://www.unicode.org/reports/tr9/#BD13:
63
    // "In the absence of isolate initiators, each isolating run sequence in a paragraph
64
    //  consists of exactly one level run, and each level run constitutes a separate
65
    //  isolating run sequence."
66
    // We can take a simplified path to handle this case.
67
0
    if !has_isolate_controls {
68
0
        isolating_run_sequences.reserve_exact(runs.len());
69
0
        for run in runs {
70
            // Determine the `sos` and `eos` class for the sequence.
71
            // <http://www.unicode.org/reports/tr9/#X10>
72
73
0
            let run_levels = &levels[run.clone()];
74
0
            let run_classes = &original_classes[run.clone()];
75
0
            let seq_level = run_levels[run_classes
76
0
                .iter()
77
0
                .position(|c| not_removed_by_x9(c))
78
0
                .unwrap_or(0)];
79
80
0
            let end_level = run_levels[run_classes
81
0
                .iter()
82
0
                .rposition(|c| not_removed_by_x9(c))
83
0
                .unwrap_or(run.end - run.start - 1)];
84
85
            // Get the level of the last non-removed char before the run.
86
0
            let pred_level = match original_classes[..run.start]
87
0
                .iter()
88
0
                .rposition(not_removed_by_x9)
89
            {
90
0
                Some(idx) => levels[idx],
91
0
                None => para_level,
92
            };
93
94
            // Get the level of the next non-removed char after the run.
95
0
            let succ_level = match original_classes[run.end..]
96
0
                .iter()
97
0
                .position(not_removed_by_x9)
98
            {
99
0
                Some(idx) => levels[run.end + idx],
100
0
                None => para_level,
101
            };
102
103
0
            isolating_run_sequences.push(IsolatingRunSequence {
104
0
                runs: vec![run],
105
0
                sos: max(seq_level, pred_level).bidi_class(),
106
0
                eos: max(end_level, succ_level).bidi_class(),
107
0
            });
108
        }
109
0
        return;
110
0
    }
111
112
    // Compute the set of isolating run sequences.
113
    // <http://www.unicode.org/reports/tr9/#BD13>
114
0
    let mut sequences = Vec::with_capacity(runs.len());
115
116
    // When we encounter an isolate initiator, we push the current sequence onto the
117
    // stack so we can resume it after the matching PDI.
118
    #[cfg(feature = "smallvec")]
119
    let mut stack: SmallVec<[Vec<Range<usize>>; 8]> = smallvec![vec![]];
120
    #[cfg(not(feature = "smallvec"))]
121
0
    let mut stack = vec![vec![]];
122
123
0
    for run in runs {
124
0
        assert!(!run.is_empty());
125
0
        assert!(!stack.is_empty());
126
127
0
        let start_class = original_classes[run.start];
128
        // > In rule X10, [..] skip over any BNs when [..].
129
        // > Do the same when determining if the last character of the sequence is an isolate initiator.
130
        //
131
        // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
132
0
        let end_class = original_classes[run.start..run.end]
133
0
            .iter()
134
0
            .copied()
135
0
            .rev()
136
0
            .find(not_removed_by_x9)
137
0
            .unwrap_or(start_class);
138
139
0
        let mut sequence = if start_class == PDI && stack.len() > 1 {
140
            // Continue a previous sequence interrupted by an isolate.
141
0
            stack.pop().unwrap()
142
        } else {
143
            // Start a new sequence.
144
0
            Vec::new()
145
        };
146
147
0
        sequence.push(run);
148
149
0
        if matches!(end_class, RLI | LRI | FSI) {
150
0
            // Resume this sequence after the isolate.
151
0
            stack.push(sequence);
152
0
        } else {
153
0
            // This sequence is finished.
154
0
            sequences.push(sequence);
155
0
        }
156
    }
157
    // Pop any remaining sequences off the stack.
158
0
    sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
159
160
    // Determine the `sos` and `eos` class for each sequence.
161
    // <http://www.unicode.org/reports/tr9/#X10>
162
0
    for sequence in sequences {
163
0
        assert!(!sequence.is_empty());
164
165
0
        let start_of_seq = sequence[0].start;
166
0
        let runs_len = sequence.len();
167
0
        let end_of_seq = sequence[runs_len - 1].end;
168
169
0
        let mut result = IsolatingRunSequence {
170
0
            runs: sequence,
171
0
            sos: L,
172
0
            eos: L,
173
0
        };
174
175
        // > (not counting characters removed by X9)
176
0
        let seq_level = levels[result
177
0
            .iter_forwards_from(start_of_seq, 0)
178
0
            .find(|i| not_removed_by_x9(&original_classes[*i]))
179
0
            .unwrap_or(start_of_seq)];
180
181
        // XXXManishearth the spec talks of a start and end level,
182
        // but for a given IRS the two should be equivalent, yes?
183
0
        let end_level = levels[result
184
0
            .iter_backwards_from(end_of_seq, runs_len - 1)
185
0
            .find(|i| not_removed_by_x9(&original_classes[*i]))
186
0
            .unwrap_or(end_of_seq - 1)];
187
188
        #[cfg(test)]
189
        for idx in result.runs.clone().into_iter().flatten() {
190
            if not_removed_by_x9(&original_classes[idx]) {
191
                assert_eq!(seq_level, levels[idx]);
192
            }
193
        }
194
195
        // Get the level of the last non-removed char before the runs.
196
0
        let pred_level = match original_classes[..start_of_seq]
197
0
            .iter()
198
0
            .rposition(not_removed_by_x9)
199
        {
200
0
            Some(idx) => levels[idx],
201
0
            None => para_level,
202
        };
203
204
        // Get the last non-removed character to check if it is an isolate initiator.
205
        // The spec calls for an unmatched one, but matched isolate initiators
206
        // will never be at the end of a level run (otherwise there would be more to the run).
207
        // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
208
0
        let last_non_removed = original_classes[..end_of_seq]
209
0
            .iter()
210
0
            .copied()
211
0
            .rev()
212
0
            .find(not_removed_by_x9)
213
0
            .unwrap_or(BN);
214
215
        // Get the level of the next non-removed char after the runs.
216
0
        let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) {
217
0
            para_level
218
        } else {
219
0
            match original_classes[end_of_seq..]
220
0
                .iter()
221
0
                .position(not_removed_by_x9)
222
            {
223
0
                Some(idx) => levels[end_of_seq + idx],
224
0
                None => para_level,
225
            }
226
        };
227
228
0
        result.sos = max(seq_level, pred_level).bidi_class();
229
0
        result.eos = max(end_level, succ_level).bidi_class();
230
231
0
        isolating_run_sequences.push(result);
232
    }
233
0
}
234
235
impl IsolatingRunSequence {
236
    /// Given a text-relative position `pos` and an index of the level run it is in,
237
    /// produce an iterator of all characters after and pos (`pos..`) that are in this
238
    /// run sequence
239
0
    pub(crate) fn iter_forwards_from(
240
0
        &self,
241
0
        pos: usize,
242
0
        level_run_index: usize,
243
0
    ) -> impl Iterator<Item = usize> + '_ {
244
0
        let runs = &self.runs[level_run_index..];
245
246
        // Check that it is in range
247
        // (we can't use contains() since we want an inclusive range)
248
        #[cfg(feature = "std")]
249
        debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
250
251
0
        (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
252
0
    }
253
254
    /// Given a text-relative position `pos` and an index of the level run it is in,
255
    /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
256
    /// run sequence
257
0
    pub(crate) fn iter_backwards_from(
258
0
        &self,
259
0
        pos: usize,
260
0
        level_run_index: usize,
261
0
    ) -> impl Iterator<Item = usize> + '_ {
262
0
        let prev_runs = &self.runs[..level_run_index];
263
0
        let current = &self.runs[level_run_index];
264
265
        // Check that it is in range
266
        // (we can't use contains() since we want an inclusive range)
267
        #[cfg(feature = "std")]
268
        debug_assert!(current.start <= pos && pos <= current.end);
269
270
0
        (current.start..pos)
271
0
            .rev()
272
0
            .chain(prev_runs.iter().rev().flat_map(Clone::clone))
273
0
    }
274
}
275
276
/// Finds the level runs in a paragraph.
277
///
278
/// <http://www.unicode.org/reports/tr9/#BD7>
279
///
280
/// This is only used by tests; normally level runs are identified during explicit::compute.
281
#[cfg(test)]
282
fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
283
    assert_eq!(levels.len(), original_classes.len());
284
285
    let mut runs = Vec::new();
286
    if levels.is_empty() {
287
        return runs;
288
    }
289
290
    let mut current_run_level = levels[0];
291
    let mut current_run_start = 0;
292
    for i in 1..levels.len() {
293
        if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
294
            // End the last run and start a new one.
295
            runs.push(current_run_start..i);
296
            current_run_level = levels[i];
297
            current_run_start = i;
298
        }
299
    }
300
    runs.push(current_run_start..levels.len());
301
302
    runs
303
}
304
305
/// Should this character be ignored in steps after X9?
306
///
307
/// <http://www.unicode.org/reports/tr9/#X9>
308
0
pub fn removed_by_x9(class: BidiClass) -> bool {
309
0
    matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
310
0
}
311
312
// For use as a predicate for `position` / `rposition`
313
0
pub fn not_removed_by_x9(class: &BidiClass) -> bool {
314
0
    !removed_by_x9(*class)
315
0
}
316
317
#[cfg(test)]
318
mod tests {
319
    use super::*;
320
321
    #[test]
322
    fn test_level_runs() {
323
        assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
324
        assert_eq!(
325
            level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
326
            &[0..3, 3..5, 5..6, 6..8]
327
        );
328
    }
329
330
    // From <http://www.unicode.org/reports/tr9/#BD13>
331
    #[rustfmt::skip]
332
    #[test]
333
    fn test_isolating_run_sequences() {
334
335
        // == Example 1 ==
336
        // text1·RLE·text2·PDF·RLE·text3·PDF·text4
337
        // index        0    1  2    3    4  5    6  7
338
        let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
339
        let levels =  &[0,   1, 1,   1,   1, 1,   1, 0];
340
        let para_level = Level::ltr();
341
        let mut sequences = IsolatingRunSequenceVec::new();
342
        isolating_run_sequences(
343
            para_level,
344
            classes,
345
            &Level::vec(levels),
346
            level_runs(&Level::vec(levels), classes).into(),
347
            false,
348
            &mut sequences);
349
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
350
        assert_eq!(
351
            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
352
            vec![vec![0..2], vec![2..7], vec![7..8]]
353
        );
354
355
        // == Example 2 ==
356
        // text1·RLI·text2·PDI·RLI·text3·PDI·text4
357
        // index        0    1  2    3    4  5    6  7
358
        let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
359
        let levels =  &[0,   0, 1,   0,   0, 1,   0, 0];
360
        let para_level = Level::ltr();
361
        let mut sequences = IsolatingRunSequenceVec::new();
362
        isolating_run_sequences(
363
            para_level,
364
            classes,
365
            &Level::vec(levels),
366
            level_runs(&Level::vec(levels), classes).into(),
367
            true,
368
            &mut sequences);
369
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
370
        assert_eq!(
371
            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
372
            vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
373
        );
374
375
        // == Example 3 ==
376
        // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
377
        // index        0    1  2    3  4    5  6    7  8    9  10  11  12
378
        let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI,  L];
379
        let levels =  &[0,   0, 1,   1, 2,   3, 3,   3, 2,   1, 1,   0,  0];
380
        let para_level = Level::ltr();
381
        let mut sequences = IsolatingRunSequenceVec::new();
382
        isolating_run_sequences(
383
            para_level,
384
            classes,
385
            &Level::vec(levels),
386
            level_runs(&Level::vec(levels), classes).into(),
387
            true,
388
            &mut sequences);
389
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
390
        assert_eq!(
391
            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
392
            vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
393
        );
394
    }
395
396
    // From <http://www.unicode.org/reports/tr9/#X10>
397
    #[rustfmt::skip]
398
    #[test]
399
    fn test_isolating_run_sequences_sos_and_eos() {
400
401
        // == Example 1 ==
402
        // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
403
        // index        0    1  2    3  4    5  6    7    8  9   10  11
404
        let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF,  L];
405
        let levels =  &[0,   1, 1,   2, 2,   2, 1,   1,   1, 1,   1,  0];
406
        let para_level = Level::ltr();
407
        let mut sequences = IsolatingRunSequenceVec::new();
408
        isolating_run_sequences(
409
            para_level,
410
            classes,
411
            &Level::vec(levels),
412
            level_runs(&Level::vec(levels), classes).into(),
413
            false,
414
            &mut sequences);
415
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
416
417
        // text1
418
        assert_eq!(
419
            &sequences[0],
420
            &IsolatingRunSequence {
421
                runs: vec![0..2],
422
                sos: L,
423
                eos: R,
424
            }
425
        );
426
427
        // text2
428
        assert_eq!(
429
            &sequences[1],
430
            &IsolatingRunSequence {
431
                runs: vec![2..4],
432
                sos: R,
433
                eos: L,
434
            }
435
        );
436
437
        // text3
438
        assert_eq!(
439
            &sequences[2],
440
            &IsolatingRunSequence {
441
                runs: vec![4..6],
442
                sos: L,
443
                eos: L,
444
            }
445
        );
446
447
        // text4 text5
448
        assert_eq!(
449
            &sequences[3],
450
            &IsolatingRunSequence {
451
                runs: vec![6..11],
452
                sos: L,
453
                eos: R,
454
            }
455
        );
456
457
        // text6
458
        assert_eq!(
459
            &sequences[4],
460
            &IsolatingRunSequence {
461
                runs: vec![11..12],
462
                sos: R,
463
                eos: L,
464
            }
465
        );
466
467
        // == Example 2 ==
468
        // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
469
        // index        0    1  2    3  4    5  6    7    8  9   10  11
470
        let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI,  L];
471
        let levels =  &[0,   0, 1,   1, 2,   1, 1,   0,   0, 1,   0,  0];
472
        let para_level = Level::ltr();
473
        let mut sequences = IsolatingRunSequenceVec::new();
474
        isolating_run_sequences(
475
            para_level,
476
            classes,
477
            &Level::vec(levels),
478
            level_runs(&Level::vec(levels), classes).into(),
479
            true,
480
            &mut sequences);
481
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
482
483
        // text1·RLI·PDI·RLI·PDI·text6
484
        assert_eq!(
485
            &sequences[0],
486
            &IsolatingRunSequence {
487
                runs: vec![0..2, 7..9, 10..12],
488
                sos: L,
489
                eos: L,
490
            }
491
        );
492
493
        // text2·LRI·PDI·text4
494
        assert_eq!(
495
            &sequences[1],
496
            &IsolatingRunSequence {
497
                runs: vec![2..4, 5..7],
498
                sos: R,
499
                eos: R,
500
            }
501
        );
502
503
        // text3
504
        assert_eq!(
505
            &sequences[2],
506
            &IsolatingRunSequence {
507
                runs: vec![4..5],
508
                sos: L,
509
                eos: L,
510
            }
511
        );
512
513
        // text5
514
        assert_eq!(
515
            &sequences[3],
516
            &IsolatingRunSequence {
517
                runs: vec![9..10],
518
                sos: R,
519
                eos: R,
520
            }
521
        );
522
    }
523
524
    #[test]
525
    fn test_removed_by_x9() {
526
        let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
527
        let not_classes = &[L, RLI, AL, LRI, PDI];
528
        for x in rem_classes {
529
            assert_eq!(removed_by_x9(*x), true);
530
        }
531
        for x in not_classes {
532
            assert_eq!(removed_by_x9(*x), false);
533
        }
534
    }
535
536
    #[test]
537
    fn test_not_removed_by_x9() {
538
        let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
539
        for x in non_x9_classes {
540
            assert_eq!(not_removed_by_x9(&x), true);
541
        }
542
    }
543
}