zng_ext_font/unicode_bidi_util.rs
1//! This module contains copies of `unicode_bidi` code, with the dependency on text strings removed.
2//!
3//! Text is only used in the source crate to handle character with multiple bytes, but we do bidi sorting at
4//! the "segment" level.
5
6use std::{collections::HashMap, ops::Range};
7
8use unicode_bidi::{BidiClass, BidiDataSource, Level};
9
10pub(super) fn visual_runs(
11 mut levels: Vec<unicode_bidi::Level>,
12 line_classes: Vec<unicode_bidi::BidiClass>,
13 para_level: unicode_bidi::Level,
14) -> (Vec<unicode_bidi::Level>, Vec<unicode_bidi::LevelRun>) {
15 use unicode_bidi::BidiClass::*;
16
17 let line_levels = &mut levels;
18
19 // Reset some whitespace chars to paragraph level.
20 // <http://www.unicode.org/reports/tr9/#L1>
21 let mut reset_from: Option<usize> = Some(0);
22 let mut reset_to: Option<usize> = None;
23 let mut prev_level = para_level;
24 for i in 0..line_classes.len() {
25 match line_classes[i] {
26 // Segment separator, Paragraph separator
27 B | S => {
28 assert_eq!(reset_to, None);
29 reset_to = Some(i + 1);
30 if reset_from.is_none() {
31 reset_from = Some(i);
32 }
33 }
34 // Whitespace, isolate formatting
35 WS | FSI | LRI | RLI | PDI => {
36 if reset_from.is_none() {
37 reset_from = Some(i);
38 }
39 }
40 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
41 // same as above + set the level
42 RLE | LRE | RLO | LRO | PDF | BN => {
43 if reset_from.is_none() {
44 reset_from = Some(i);
45 }
46 // also set the level to previous
47 line_levels[i] = prev_level;
48 }
49 _ => {
50 reset_from = None;
51 }
52 }
53 if let (Some(from), Some(to)) = (reset_from, reset_to) {
54 for level in &mut line_levels[from..to] {
55 *level = para_level;
56 }
57 reset_from = None;
58 reset_to = None;
59 }
60 prev_level = line_levels[i];
61 }
62 if let Some(from) = reset_from {
63 for level in &mut line_levels[from..] {
64 *level = para_level;
65 }
66 }
67
68 // Find consecutive level runs.
69 let mut runs = Vec::new();
70 let mut start = 0;
71 let mut run_level = levels[start];
72 let mut min_level = run_level;
73 let mut max_level = run_level;
74
75 for (i, &new_level) in levels.iter().enumerate().skip(1) {
76 if new_level != run_level {
77 // End of the previous run, start of a new one.
78 runs.push(start..i);
79 start = i;
80 run_level = new_level;
81 min_level = std::cmp::min(run_level, min_level);
82 max_level = std::cmp::max(run_level, max_level);
83 }
84 }
85 runs.push(start..line_classes.len());
86
87 let run_count = runs.len();
88
89 // Re-order the odd runs.
90 // <http://www.unicode.org/reports/tr9/#L2>
91
92 // Stop at the lowest *odd* level.
93 min_level = min_level.new_lowest_ge_rtl().expect("Level error");
94
95 while max_level >= min_level {
96 // Look for the start of a sequence of consecutive runs of max_level or higher.
97 let mut seq_start = 0;
98 while seq_start < run_count {
99 if levels[runs[seq_start].start] < max_level {
100 seq_start += 1;
101 continue;
102 }
103
104 // Found the start of a sequence. Now find the end.
105 let mut seq_end = seq_start + 1;
106 while seq_end < run_count {
107 if levels[runs[seq_end].start] < max_level {
108 break;
109 }
110 seq_end += 1;
111 }
112
113 // Reverse the runs within this sequence.
114 runs[seq_start..seq_end].reverse();
115
116 seq_start = seq_end;
117 }
118 max_level.lower(1).expect("Lowering embedding level below zero");
119 }
120
121 (levels, runs)
122}
123
124pub(super) fn explicit_compute(
125 para_level: Level,
126 original_classes: &[BidiClass],
127 levels: &mut [Level],
128 processing_classes: &mut [BidiClass],
129) {
130 // <http://www.unicode.org/reports/tr9/#X1>
131 let mut stack = DirectionalStatusStack::new();
132 stack.push(para_level, OverrideStatus::Neutral);
133
134 let mut overflow_isolate_count = 0u32;
135 let mut overflow_embedding_count = 0u32;
136 let mut valid_isolate_count = 0u32;
137
138 for i in 0..original_classes.len() {
139 use BidiClass::*;
140 match original_classes[i] {
141 // Rules X2-X5c
142 RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
143 let last_level = stack.last().level;
144
145 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
146 levels[i] = last_level;
147
148 // X5a-X5c: Isolate initiators get the level of the last entry on the stack.
149 let is_isolate = matches!(original_classes[i], RLI | LRI | FSI);
150 if is_isolate {
151 // Redundant due to "Retaining explicit formatting characters" step.
152 // levels[i] = last_level;
153 match stack.last().status {
154 OverrideStatus::RTL => processing_classes[i] = R,
155 OverrideStatus::LTR => processing_classes[i] = L,
156 _ => {}
157 }
158 }
159
160 let new_level = if is_rtl(original_classes[i]) {
161 last_level.new_explicit_next_rtl()
162 } else {
163 last_level.new_explicit_next_ltr()
164 };
165
166 if new_level.is_ok() && overflow_isolate_count == 0 && overflow_embedding_count == 0 {
167 let new_level = new_level.unwrap();
168 stack.push(
169 new_level,
170 match original_classes[i] {
171 RLO => OverrideStatus::RTL,
172 LRO => OverrideStatus::LTR,
173 RLI | LRI | FSI => OverrideStatus::Isolate,
174 _ => OverrideStatus::Neutral,
175 },
176 );
177 if is_isolate {
178 valid_isolate_count += 1;
179 } else {
180 // The spec doesn't explicitly mention this step, but it is necessary.
181 // See the reference implementations for comparison.
182 levels[i] = new_level;
183 }
184 } else if is_isolate {
185 overflow_isolate_count += 1;
186 } else if overflow_isolate_count == 0 {
187 overflow_embedding_count += 1;
188 }
189
190 if !is_isolate {
191 // X9 +
192 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
193 // (PDF handled below)
194 processing_classes[i] = BN;
195 }
196 }
197
198 // <http://www.unicode.org/reports/tr9/#X6a>
199 PDI => {
200 if overflow_isolate_count > 0 {
201 overflow_isolate_count -= 1;
202 } else if valid_isolate_count > 0 {
203 overflow_embedding_count = 0;
204 loop {
205 // Pop everything up to and including the last Isolate status.
206 match stack.vec.pop() {
207 None
208 | Some(Status {
209 status: OverrideStatus::Isolate,
210 ..
211 }) => break,
212 _ => continue,
213 }
214 }
215 valid_isolate_count -= 1;
216 }
217 let last = stack.last();
218 levels[i] = last.level;
219 match last.status {
220 OverrideStatus::RTL => processing_classes[i] = R,
221 OverrideStatus::LTR => processing_classes[i] = L,
222 _ => {}
223 }
224 }
225
226 // <http://www.unicode.org/reports/tr9/#X7>
227 PDF => {
228 if overflow_isolate_count > 0 {
229 // do nothing
230 } else if overflow_embedding_count > 0 {
231 overflow_embedding_count -= 1;
232 } else if stack.last().status != OverrideStatus::Isolate && stack.vec.len() >= 2 {
233 stack.vec.pop();
234 }
235 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
236 levels[i] = stack.last().level;
237 // X9 part of retaining explicit formatting characters.
238 processing_classes[i] = BN;
239 }
240
241 // Nothing.
242 // BN case moved down to X6, see <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
243 B => {}
244
245 // <http://www.unicode.org/reports/tr9/#X6>
246 _ => {
247 let last = stack.last();
248 levels[i] = last.level;
249 // This condition is not in the spec, but I am pretty sure that is a spec bug.
250 // https://www.unicode.org/L2/L2023/23014-amd-to-uax9.pdf
251 if original_classes[i] != BN {
252 match last.status {
253 OverrideStatus::RTL => processing_classes[i] = R,
254 OverrideStatus::LTR => processing_classes[i] = L,
255 _ => {}
256 }
257 }
258 }
259 }
260 }
261}
262
263pub(super) fn prepare_isolating_run_sequences(
264 para_level: Level,
265 original_classes: &[BidiClass],
266 levels: &[Level],
267) -> Vec<IsolatingRunSequence> {
268 let runs = level_runs(levels, original_classes);
269
270 // Compute the set of isolating run sequences.
271 // <http://www.unicode.org/reports/tr9/#BD13>
272 let mut sequences = Vec::with_capacity(runs.len());
273
274 // When we encounter an isolate initiator, we push the current sequence onto the
275 // stack so we can resume it after the matching PDI.
276 let mut stack = vec![Vec::new()];
277
278 use BidiClass::*;
279
280 for run in runs {
281 assert!(!run.is_empty());
282 assert!(!stack.is_empty());
283
284 let start_class = original_classes[run.start];
285 let end_class = original_classes[run.end - 1];
286
287 let mut sequence = if start_class == PDI && stack.len() > 1 {
288 // Continue a previous sequence interrupted by an isolate.
289 stack.pop().unwrap()
290 } else {
291 // Start a new sequence.
292 Vec::new()
293 };
294
295 sequence.push(run);
296
297 if let RLI | LRI | FSI = end_class {
298 // Resume this sequence after the isolate.
299 stack.push(sequence);
300 } else {
301 // This sequence is finished.
302 sequences.push(sequence);
303 }
304 }
305 // Pop any remaining sequences off the stack.
306 sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
307
308 // Determine the `sos` and `eos` class for each sequence.
309 // <http://www.unicode.org/reports/tr9/#X10>
310 sequences
311 .into_iter()
312 .map(|sequence: Vec<LevelRun>| {
313 assert!(!sequence.is_empty());
314
315 let mut result = IsolatingRunSequence {
316 runs: sequence,
317 sos: L,
318 eos: L,
319 };
320
321 let start_of_seq = result.runs[0].start;
322 let runs_len = result.runs.len();
323 let end_of_seq = result.runs[runs_len - 1].end;
324
325 // > (not counting characters removed by X9)
326 let seq_level = result
327 .iter_forwards_from(start_of_seq, 0)
328 .filter(|i| not_removed_by_x9(&original_classes[*i]))
329 .map(|i| levels[i])
330 .next()
331 .unwrap_or(levels[start_of_seq]);
332
333 let end_level = result
334 .iter_backwards_from(end_of_seq, runs_len - 1)
335 .filter(|i| not_removed_by_x9(&original_classes[*i]))
336 .map(|i| levels[i])
337 .next()
338 .unwrap_or(levels[end_of_seq - 1]);
339
340 #[cfg(test)]
341 for run in result.runs.clone() {
342 for idx in run {
343 if not_removed_by_x9(&original_classes[idx]) {
344 assert_eq!(seq_level, levels[idx]);
345 }
346 }
347 }
348
349 // Get the level of the last non-removed char before the runs.
350 let pred_level = match original_classes[..start_of_seq].iter().rposition(not_removed_by_x9) {
351 Some(idx) => levels[idx],
352 None => para_level,
353 };
354
355 // Get the last non-removed character to check if it is an isolate initiator.
356 // The spec calls for an unmatched one, but matched isolate initiators
357 // will never be at the end of a level run (otherwise there would be more to the run).
358 // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
359 let last_non_removed = original_classes[..end_of_seq]
360 .iter()
361 .copied()
362 .rev()
363 .find(not_removed_by_x9)
364 .unwrap_or(BN);
365
366 // Get the level of the next non-removed char after the runs.
367 let succ_level = if let RLI | LRI | FSI = last_non_removed {
368 para_level
369 } else {
370 match original_classes[end_of_seq..].iter().position(not_removed_by_x9) {
371 Some(idx) => levels[end_of_seq + idx],
372 None => para_level,
373 }
374 };
375
376 result.sos = std::cmp::max(seq_level, pred_level).bidi_class();
377 result.eos = std::cmp::max(end_level, succ_level).bidi_class();
378 result
379 })
380 .collect()
381}
382
383/// Entries in the directional status stack:
384struct Status {
385 level: Level,
386 status: OverrideStatus,
387}
388
389#[derive(PartialEq)]
390#[expect(clippy::upper_case_acronyms)]
391enum OverrideStatus {
392 Neutral,
393 RTL,
394 LTR,
395 Isolate,
396}
397
398struct DirectionalStatusStack {
399 vec: Vec<Status>,
400}
401
402impl DirectionalStatusStack {
403 fn new() -> Self {
404 DirectionalStatusStack {
405 vec: Vec::with_capacity(Level::max_explicit_depth() as usize + 2),
406 }
407 }
408
409 fn push(&mut self, level: Level, status: OverrideStatus) {
410 self.vec.push(Status { level, status });
411 }
412
413 fn last(&self) -> &Status {
414 self.vec.last().unwrap()
415 }
416}
417
418fn is_rtl(bidi_class: BidiClass) -> bool {
419 use BidiClass::*;
420 matches!(bidi_class, RLE | RLO | RLI)
421}
422
423pub(super) type LevelRun = Range<usize>;
424
425pub(super) struct IsolatingRunSequence {
426 pub runs: Vec<LevelRun>,
427 pub sos: BidiClass, // Start-of-sequence type.
428 pub eos: BidiClass, // End-of-sequence type.
429}
430
431/// Should this character be ignored in steps after X9?
432///
433/// <http://www.unicode.org/reports/tr9/#X9>
434fn removed_by_x9(class: BidiClass) -> bool {
435 use BidiClass::*;
436 matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
437}
438
439// For use as a predicate for `position` / `r_position`
440fn not_removed_by_x9(class: &BidiClass) -> bool {
441 !removed_by_x9(*class)
442}
443
444impl IsolatingRunSequence {
445 /// Given a text-relative position `pos` and an index of the level run it is in,
446 /// produce an iterator of all characters after and pos (`pos..`) that are in this
447 /// run sequence
448 pub(crate) fn iter_forwards_from(&self, pos: usize, level_run_index: usize) -> impl Iterator<Item = usize> + '_ {
449 let runs = &self.runs[level_run_index..];
450
451 // Check that it is in range
452 // (we can't use contains() since we want an inclusive range)
453 debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
454
455 (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
456 }
457
458 /// Given a text-relative position `pos` and an index of the level run it is in,
459 /// produce an iterator of all characters before and excluding pos (`..pos`) that are in this
460 /// run sequence
461 pub(crate) fn iter_backwards_from(&self, pos: usize, level_run_index: usize) -> impl Iterator<Item = usize> + '_ {
462 let prev_runs = &self.runs[..level_run_index];
463 let current = &self.runs[level_run_index];
464
465 // Check that it is in range
466 // (we can't use contains() since we want an inclusive range)
467 debug_assert!(current.start <= pos && pos <= current.end);
468
469 (current.start..pos).rev().chain(prev_runs.iter().rev().flat_map(Clone::clone))
470 }
471}
472
473/// Finds the level runs in a paragraph.
474///
475/// <http://www.unicode.org/reports/tr9/#BD7>
476fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
477 assert_eq!(levels.len(), original_classes.len());
478
479 let mut runs = Vec::new();
480 if levels.is_empty() {
481 return runs;
482 }
483
484 let mut current_run_level = levels[0];
485 let mut current_run_start = 0;
486 for i in 1..levels.len() {
487 if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
488 // End the last run and start a new one.
489 runs.push(current_run_start..i);
490 current_run_level = levels[i];
491 current_run_start = i;
492 }
493 }
494 runs.push(current_run_start..levels.len());
495
496 runs
497}
498
499/// 3.3.4 Resolving Weak Types
500///
501/// <http://www.unicode.org/reports/tr9/#Resolving_Weak_Types>
502pub(super) fn implicit_resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) {
503 use BidiClass::*;
504
505 // Note: The spec treats these steps as individual passes that are applied one after the other
506 // on the entire IsolatingRunSequence at once. We instead collapse it into a single iteration,
507 // which is straightforward for rules that are based on the state of the current character, but not
508 // for rules that care about surrounding characters. To deal with them, we retain additional state
509 // about previous character classes that may have since been changed by later rules.
510
511 // The previous class for the purposes of rule W4/W6, not tracking changes made after or during W4.
512 let mut prev_class_before_w4 = sequence.sos;
513 // The previous class for the purposes of rule W5.
514 let mut prev_class_before_w5 = sequence.sos;
515 // The previous class for the purposes of rule W1, not tracking changes from any other rules.
516 let mut prev_class_before_w1 = sequence.sos;
517 let mut last_strong_is_al = false;
518 let mut et_run_indices = Vec::new(); // for W5
519 let mut bn_run_indices = Vec::new(); // for W5 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
520
521 for (run_index, level_run) in sequence.runs.iter().enumerate() {
522 for i in &mut level_run.clone() {
523 if processing_classes[i] == BN {
524 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
525 // Keeps track of BN runs for W5 in case we see an ET.
526 bn_run_indices.push(i);
527 // BNs aren't real, skip over them.
528 continue;
529 }
530
531 // Store the processing class of all rules before W2/W1.
532 // Used to keep track of the last strong character for W2. W3 is able to insert new strong
533 // characters, so we don't want to be misled by it.
534 let mut w2_processing_class = processing_classes[i];
535
536 // <http://www.unicode.org/reports/tr9/#W1>
537 //
538
539 if processing_classes[i] == NSM {
540 processing_classes[i] = match prev_class_before_w1 {
541 RLI | LRI | FSI | PDI => ON,
542 _ => prev_class_before_w1,
543 };
544 // W1 occurs before W2, update this.
545 w2_processing_class = processing_classes[i];
546 }
547
548 prev_class_before_w1 = processing_classes[i];
549
550 // <http://www.unicode.org/reports/tr9/#W2>
551 // <http://www.unicode.org/reports/tr9/#W3>
552 //
553 match processing_classes[i] {
554 EN => {
555 if last_strong_is_al {
556 // W2. If previous strong char was AL, change EN to AN.
557 processing_classes[i] = AN;
558 }
559 }
560 // W3.
561 AL => processing_classes[i] = R,
562 _ => {}
563 }
564
565 // update last_strong_is_al.
566 match w2_processing_class {
567 L | R => {
568 last_strong_is_al = false;
569 }
570 AL => {
571 last_strong_is_al = true;
572 }
573 _ => {}
574 }
575
576 let class_before_w456 = processing_classes[i];
577
578 // <http://www.unicode.org/reports/tr9/#W4>
579 // <http://www.unicode.org/reports/tr9/#W5>
580 // <http://www.unicode.org/reports/tr9/#W6> (separators only)
581 // (see below for W6 terminator code)
582 //
583 match processing_classes[i] {
584 // <http://www.unicode.org/reports/tr9/#W6>
585 EN => {
586 // W5. If a run of ETs is adjacent to an EN, change the ETs to EN.
587 for j in &et_run_indices {
588 processing_classes[*j] = EN;
589 }
590 et_run_indices.clear();
591 }
592
593 // <http://www.unicode.org/reports/tr9/#W4>
594 // <http://www.unicode.org/reports/tr9/#W6>
595 ES | CS => {
596 let mut next_class = sequence
597 .iter_forwards_from(i, run_index)
598 .map(|j| processing_classes[j])
599 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
600 .find(not_removed_by_x9)
601 .unwrap_or(sequence.eos);
602 if next_class == EN && last_strong_is_al {
603 // Apply W2 to next_class. We know that last_strong_is_al
604 // has no chance of changing on this character so we can still assume its value
605 // will be the same by the time we get to it.
606 next_class = AN;
607 }
608 processing_classes[i] = match (prev_class_before_w4, processing_classes[i], next_class) {
609 // W4
610 (EN, ES, EN) | (EN, CS, EN) => EN,
611 // W4
612 (AN, CS, AN) => AN,
613 // W6 (separators only)
614 (_, _, _) => ON,
615 };
616
617 // W6 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
618 // We have to do this before W5 gets its grubby hands on these characters and thinks
619 // they're part of an ET run.
620 // We check for ON to ensure that we had hit the W6 branch above, since this `ES | CS` match
621 // arm handles both W4 and W6.
622 if processing_classes[i] == ON {
623 for idx in sequence.iter_backwards_from(i, run_index) {
624 let class = &mut processing_classes[idx];
625 if *class != BN {
626 break;
627 }
628 *class = ON;
629 }
630 for idx in sequence.iter_forwards_from(i, run_index) {
631 let class = &mut processing_classes[idx];
632 if *class != BN {
633 break;
634 }
635 *class = ON;
636 }
637 }
638 }
639 // <http://www.unicode.org/reports/tr9/#W5>
640 ET => {
641 match prev_class_before_w5 {
642 EN => processing_classes[i] = EN,
643 _ => {
644 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
645 // If there was a BN run before this, that's now a part of this ET run.
646 et_run_indices.extend(&bn_run_indices);
647
648 // In case this is followed by an EN.
649 et_run_indices.push(i);
650 }
651 }
652 }
653 _ => {}
654 }
655
656 // Common loop iteration code
657 //
658
659 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
660 // BN runs would have already continued the loop, clear them before we get to the next one.
661 bn_run_indices.clear();
662
663 // W6 above only deals with separators, so it doesn't change anything W5 cares about,
664 // so we still can update this after running that part of W6.
665 prev_class_before_w5 = processing_classes[i];
666
667 // <http://www.unicode.org/reports/tr9/#W6> (terminators only)
668 // (see above for W6 separator code)
669 //
670 if prev_class_before_w5 != ET {
671 // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
672 for j in &et_run_indices {
673 processing_classes[*j] = ON;
674 }
675 et_run_indices.clear();
676 }
677
678 // We stashed this before W4/5/6 could get their grubby hands on it, and it's not
679 // used in the W6 terminator code below so we can update it now.
680 prev_class_before_w4 = class_before_w456;
681 }
682 }
683 // Rerun this check in case we ended with a sequence of BNs (i.e., we'd never
684 // hit the end of the for loop above).
685 // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
686 for j in &et_run_indices {
687 processing_classes[*j] = ON;
688 }
689 et_run_indices.clear();
690
691 // W7. If the previous strong char was L, change EN to L.
692 let mut last_strong_is_l = sequence.sos == L;
693 for run in &sequence.runs {
694 for i in run.clone() {
695 match processing_classes[i] {
696 EN if last_strong_is_l => {
697 processing_classes[i] = L;
698 }
699 L => {
700 last_strong_is_l = true;
701 }
702 R | AL => {
703 last_strong_is_l = false;
704 }
705 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
706 // Already scanning past BN here.
707 _ => {}
708 }
709 }
710 }
711}
712
713/// 3.3.5 Resolving Neutral Types
714///
715/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
716pub(super) fn implicit_resolve_neutral(
717 sequence: &IsolatingRunSequence,
718 levels: &[Level],
719 original_classes: &[BidiClass],
720 processing_classes: &mut [BidiClass],
721 brackets: &HashMap<usize, char>,
722) {
723 use BidiClass::*;
724
725 // e = embedding direction
726 let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
727 let not_e = if e == BidiClass::L { BidiClass::R } else { BidiClass::L };
728 // N0. Process bracket pairs.
729
730 // > Identify the bracket pairs in the current isolating run sequence according to BD16.
731 // We use processing_classes, not original_classes, due to BD14/BD15
732 let bracket_pairs = identify_bracket_pairs(sequence, processing_classes, brackets);
733
734 // > For each bracket-pair element in the list of pairs of text positions
735 //
736 // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
737 // for indexing as it will include the opening bracket pair but not the closing one.
738 for pair in bracket_pairs {
739 debug_assert!(
740 pair.start < processing_classes.len(),
741 "identify_bracket_pairs returned a range that is out of bounds!"
742 );
743 debug_assert!(
744 pair.end < processing_classes.len(),
745 "identify_bracket_pairs returned a range that is out of bounds!"
746 );
747 let mut found_e = false;
748 let mut found_not_e = false;
749 let mut class_to_set = None;
750
751 // > Inspect the bidirectional types of the characters enclosed within the bracket pair.
752 //
753 // `pair` is [start, end) so we will end up processing the opening character but not the closing one.
754 //
755 for enclosed_i in sequence.iter_forwards_from(pair.start + 1, pair.start_run) {
756 if enclosed_i >= pair.end {
757 debug_assert!(enclosed_i == pair.end, "If we skipped past this, the iterator is broken");
758 break;
759 }
760 let class = processing_classes[enclosed_i];
761 if class == e {
762 found_e = true;
763 } else if class == not_e {
764 found_not_e = true;
765 } else if class == BidiClass::EN || class == BidiClass::AN {
766 // > Within this scope, bidirectional types EN and AN are treated as R.
767 if e == BidiClass::L {
768 found_not_e = true;
769 } else {
770 found_e = true;
771 }
772 }
773
774 // If we have found a character with the class of the embedding direction
775 // we can bail early.
776 if found_e {
777 break;
778 }
779 }
780 // > If any strong type (either L or R) matching the embedding direction is found
781 if found_e {
782 // > .. set the type for both brackets in the pair to match the embedding direction
783 class_to_set = Some(e);
784 // > Otherwise, if there is a strong type it must be opposite the embedding direction
785 } else if found_not_e {
786 // > Therefore, test for an established context with a preceding strong type by
787 // > checking backwards before the opening paired bracket
788 // > until the first strong type (L, R, or sos) is found.
789 // (see note above about processing_classes and character boundaries)
790 let mut previous_strong = sequence
791 .iter_backwards_from(pair.start, pair.start_run)
792 .map(|i| processing_classes[i])
793 .find(|class| *class == BidiClass::L || *class == BidiClass::R || *class == BidiClass::EN || *class == BidiClass::AN)
794 .unwrap_or(sequence.sos);
795
796 // > Within this scope, bidirectional types EN and AN are treated as R.
797 if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN {
798 previous_strong = BidiClass::R;
799 }
800
801 // > If the preceding strong type is also opposite the embedding direction,
802 // > context is established,
803 // > so set the type for both brackets in the pair to that direction.
804 // AND
805 // > Otherwise set the type for both brackets in the pair to the embedding direction.
806 // > Either way it gets set to previous_strong
807 //
808 // Both branches amount to setting the type to the strong type.
809 class_to_set = Some(previous_strong);
810 }
811
812 if let Some(class_to_set) = class_to_set {
813 // Update all processing classes corresponding to the start and end elements, as requested.
814 // We should include all bytes of the character, not the first one.
815 for class in &mut processing_classes[pair.start..pair.start + 1] {
816 *class = class_to_set;
817 }
818 for class in &mut processing_classes[pair.end..pair.end + 1] {
819 *class = class_to_set;
820 }
821 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
822 for idx in sequence.iter_backwards_from(pair.start, pair.start_run) {
823 let class = &mut processing_classes[idx];
824 if *class != BN {
825 break;
826 }
827 *class = class_to_set;
828 }
829 // > Any number of characters that had original bidirectional character type NSM prior to the application of
830 // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.
831
832 // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
833 // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
834 let nsm_start = pair.start + 1;
835 for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) {
836 let class = original_classes[idx];
837 if class == BidiClass::NSM || processing_classes[idx] == BN {
838 processing_classes[idx] = class_to_set;
839 } else {
840 break;
841 }
842 }
843 let nsm_end = pair.end + 1;
844 for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) {
845 let class = original_classes[idx];
846 if class == BidiClass::NSM || processing_classes[idx] == BN {
847 processing_classes[idx] = class_to_set;
848 } else {
849 break;
850 }
851 }
852 }
853 // > Otherwise, there are no strong types within the bracket pair
854 // > Therefore, do not set the type for that bracket pair
855 }
856
857 // N1 and N2.
858 // Indices of every byte in this isolating run sequence
859 let mut indices = sequence.runs.iter().flat_map(Clone::clone);
860 let mut prev_class = sequence.sos;
861 while let Some(mut i) = indices.next() {
862 // Process sequences of NI characters.
863 let mut ni_run = Vec::new();
864 // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
865 if is_NI(processing_classes[i]) || processing_classes[i] == BN {
866 // Consume a run of consecutive NI characters.
867 ni_run.push(i);
868 let mut next_class;
869 loop {
870 match indices.next() {
871 Some(j) => {
872 i = j;
873 next_class = processing_classes[j];
874 // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
875 if is_NI(next_class) || next_class == BN {
876 ni_run.push(i);
877 } else {
878 break;
879 }
880 }
881 None => {
882 next_class = sequence.eos;
883 break;
884 }
885 };
886 }
887 // N1-N2.
888 //
889 // <http://www.unicode.org/reports/tr9/#N1>
890 // <http://www.unicode.org/reports/tr9/#N2>
891 let new_class = match (prev_class, next_class) {
892 (L, L) => L,
893 (R, R) | (R, AN) | (R, EN) | (AN, R) | (AN, AN) | (AN, EN) | (EN, R) | (EN, AN) | (EN, EN) => R,
894 (_, _) => e,
895 };
896 for j in &ni_run {
897 processing_classes[*j] = new_class;
898 }
899 ni_run.clear();
900 }
901 prev_class = processing_classes[i];
902 }
903}
904
905struct BracketPair {
906 /// The text-relative index of the opening bracket.
907 start: usize,
908 /// The text-relative index of the closing bracket.
909 end: usize,
910 /// The index of the run (in the run sequence) that the opening bracket is in.
911 start_run: usize,
912 /// The index of the run (in the run sequence) that the closing bracket is in.
913 end_run: usize,
914}
915/// 3.1.3 Identifying Bracket Pairs
916///
917/// Returns all paired brackets in the source, as indices into the
918/// text source.
919///
920/// <https://www.unicode.org/reports/tr9/#BD16>
921fn identify_bracket_pairs(
922 run_sequence: &IsolatingRunSequence,
923 original_classes: &[BidiClass],
924 brackets: &HashMap<usize, char>,
925) -> Vec<BracketPair> {
926 let data_source = &unicode_bidi::HardcodedBidiData;
927
928 let mut ret = vec![];
929 let mut stack = vec![];
930
931 for (run_index, level_run) in run_sequence.runs.iter().enumerate() {
932 for i in 0..level_run.len() {
933 let actual_index = level_run.start + i;
934 // All bracket characters are ON.
935 // From BidiBrackets.txt:
936 // > The Unicode property value stability policy guarantees that characters
937 // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
938 if original_classes[actual_index] != BidiClass::ON {
939 continue;
940 }
941
942 if let Some(matched) = brackets
943 .get(&actual_index)
944 .and_then(|c| data_source.bidi_matched_opening_bracket(*c))
945 {
946 if matched.is_open {
947 // > If an opening paired bracket is found ...
948
949 // > ... and there is no room in the stack,
950 // > stop processing BD16 for the remainder of the isolating run sequence.
951 if stack.len() >= 63 {
952 break;
953 }
954 // > ... push its Bidi_Paired_Bracket property value and its text position onto the stack
955 stack.push((matched.opening, actual_index, run_index))
956 } else {
957 // > If a closing paired bracket is found, do the following
958
959 // > Declare a variable that holds a reference to the current stack element
960 // > and initialize it with the top element of the stack.
961 // AND
962 // > Else, if the current stack element is not at the bottom of the stack
963 for (stack_index, element) in stack.iter().enumerate().rev() {
964 // > Compare the closing paired bracket being inspected or its canonical
965 // > equivalent to the bracket in the current stack element.
966 if element.0 == matched.opening {
967 // > If the values match, meaning the two characters form a bracket pair, then
968
969 // > Append the text position in the current stack element together with the
970 // > text position of the closing paired bracket to the list.
971 let pair = BracketPair {
972 start: element.1,
973 end: actual_index,
974 start_run: element.2,
975 end_run: run_index,
976 };
977 ret.push(pair);
978
979 // > Pop the stack through the current stack element inclusively.
980 stack.truncate(stack_index);
981 break;
982 }
983 }
984 }
985 }
986 }
987 }
988 // > Sort the list of pairs of text positions in ascending order based on
989 // > the text position of the opening paired bracket.
990 ret.sort_by_key(|r| r.start);
991 ret
992}
993
994/// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI)
995///
996/// <http://www.unicode.org/reports/tr9/#NI>
997#[expect(non_snake_case)]
998fn is_NI(class: BidiClass) -> bool {
999 use BidiClass::*;
1000 matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI)
1001}
1002
1003/// 3.3.6 Resolving Implicit Levels
1004///
1005/// Returns the maximum embedding level in the paragraph.
1006///
1007/// <http://www.unicode.org/reports/tr9/#Resolving_Implicit_Levels>
1008pub(super) fn implicit_resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level {
1009 use BidiClass::*;
1010
1011 let mut max_level = Level::ltr();
1012 assert_eq!(original_classes.len(), levels.len());
1013 for i in 0..levels.len() {
1014 match (levels[i].is_rtl(), original_classes[i]) {
1015 (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"),
1016 (false, R) | (true, L) | (true, EN) | (true, AN) => levels[i].raise(1).expect("Level number error"),
1017 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here
1018 (_, _) => {}
1019 }
1020 max_level = std::cmp::max(max_level, levels[i]);
1021 }
1022
1023 max_level
1024}
1025
1026/// Assign levels to characters removed by rule X9.
1027///
1028/// The levels assigned to these characters are not specified by the algorithm. This function
1029/// assigns each one the level of the previous character, to avoid breaking level runs.
1030pub(super) fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) {
1031 for i in 0..levels.len() {
1032 if removed_by_x9(classes[i]) {
1033 levels[i] = if i > 0 { levels[i - 1] } else { para_level };
1034 }
1035 }
1036}