zng_ext_font/
unicode_bidi_util.rs

Help
1//! This module contains copies of `unicode_bidi` code, with the dependency on text strings removed.
2//!
3//! Text is only used in the source crate to handle character with multiple bytes, but we do bidi sorting at
4//! the "segment" level.
5
6use std::{collections::HashMap, ops::Range};
7
8use unicode_bidi::{BidiClass, BidiDataSource, Level};
9
10pub(super) fn visual_runs(
11    mut levels: Vec<unicode_bidi::Level>,
12    line_classes: Vec<unicode_bidi::BidiClass>,
13    para_level: unicode_bidi::Level,
14) -> (Vec<unicode_bidi::Level>, Vec<unicode_bidi::LevelRun>) {
15    use unicode_bidi::BidiClass::*;
16
17    let line_levels = &mut levels;
18
19    // Reset some whitespace chars to paragraph level.
20    // <http://www.unicode.org/reports/tr9/#L1>
21    let mut reset_from: Option<usize> = Some(0);
22    let mut reset_to: Option<usize> = None;
23    let mut prev_level = para_level;
24    for i in 0..line_classes.len() {
25        match line_classes[i] {
26            // Segment separator, Paragraph separator
27            B | S => {
28                assert_eq!(reset_to, None);
29                reset_to = Some(i + 1);
30                if reset_from.is_none() {
31                    reset_from = Some(i);
32                }
33            }
34            // Whitespace, isolate formatting
35            WS | FSI | LRI | RLI | PDI => {
36                if reset_from.is_none() {
37                    reset_from = Some(i);
38                }
39            }
40            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
41            // same as above + set the level
42            RLE | LRE | RLO | LRO | PDF | BN => {
43                if reset_from.is_none() {
44                    reset_from = Some(i);
45                }
46                // also set the level to previous
47                line_levels[i] = prev_level;
48            }
49            _ => {
50                reset_from = None;
51            }
52        }
53        if let (Some(from), Some(to)) = (reset_from, reset_to) {
54            for level in &mut line_levels[from..to] {
55                *level = para_level;
56            }
57            reset_from = None;
58            reset_to = None;
59        }
60        prev_level = line_levels[i];
61    }
62    if let Some(from) = reset_from {
63        for level in &mut line_levels[from..] {
64            *level = para_level;
65        }
66    }
67
68    // Find consecutive level runs.
69    let mut runs = Vec::new();
70    let mut start = 0;
71    let mut run_level = levels[start];
72    let mut min_level = run_level;
73    let mut max_level = run_level;
74
75    for (i, &new_level) in levels.iter().enumerate().skip(1) {
76        if new_level != run_level {
77            // End of the previous run, start of a new one.
78            runs.push(start..i);
79            start = i;
80            run_level = new_level;
81            min_level = std::cmp::min(run_level, min_level);
82            max_level = std::cmp::max(run_level, max_level);
83        }
84    }
85    runs.push(start..line_classes.len());
86
87    let run_count = runs.len();
88
89    // Re-order the odd runs.
90    // <http://www.unicode.org/reports/tr9/#L2>
91
92    // Stop at the lowest *odd* level.
93    min_level = min_level.new_lowest_ge_rtl().expect("Level error");
94
95    while max_level >= min_level {
96        // Look for the start of a sequence of consecutive runs of max_level or higher.
97        let mut seq_start = 0;
98        while seq_start < run_count {
99            if levels[runs[seq_start].start] < max_level {
100                seq_start += 1;
101                continue;
102            }
103
104            // Found the start of a sequence. Now find the end.
105            let mut seq_end = seq_start + 1;
106            while seq_end < run_count {
107                if levels[runs[seq_end].start] < max_level {
108                    break;
109                }
110                seq_end += 1;
111            }
112
113            // Reverse the runs within this sequence.
114            runs[seq_start..seq_end].reverse();
115
116            seq_start = seq_end;
117        }
118        max_level.lower(1).expect("Lowering embedding level below zero");
119    }
120
121    (levels, runs)
122}
123
124pub(super) fn explicit_compute(
125    para_level: Level,
126    original_classes: &[BidiClass],
127    levels: &mut [Level],
128    processing_classes: &mut [BidiClass],
129) {
130    // <http://www.unicode.org/reports/tr9/#X1>
131    let mut stack = DirectionalStatusStack::new();
132    stack.push(para_level, OverrideStatus::Neutral);
133
134    let mut overflow_isolate_count = 0u32;
135    let mut overflow_embedding_count = 0u32;
136    let mut valid_isolate_count = 0u32;
137
138    for i in 0..original_classes.len() {
139        use BidiClass::*;
140        match original_classes[i] {
141            // Rules X2-X5c
142            RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
143                let last_level = stack.last().level;
144
145                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
146                levels[i] = last_level;
147
148                // X5a-X5c: Isolate initiators get the level of the last entry on the stack.
149                let is_isolate = matches!(original_classes[i], RLI | LRI | FSI);
150                if is_isolate {
151                    // Redundant due to "Retaining explicit formatting characters" step.
152                    // levels[i] = last_level;
153                    match stack.last().status {
154                        OverrideStatus::RTL => processing_classes[i] = R,
155                        OverrideStatus::LTR => processing_classes[i] = L,
156                        _ => {}
157                    }
158                }
159
160                let new_level = if is_rtl(original_classes[i]) {
161                    last_level.new_explicit_next_rtl()
162                } else {
163                    last_level.new_explicit_next_ltr()
164                };
165
166                if new_level.is_ok() && overflow_isolate_count == 0 && overflow_embedding_count == 0 {
167                    let new_level = new_level.unwrap();
168                    stack.push(
169                        new_level,
170                        match original_classes[i] {
171                            RLO => OverrideStatus::RTL,
172                            LRO => OverrideStatus::LTR,
173                            RLI | LRI | FSI => OverrideStatus::Isolate,
174                            _ => OverrideStatus::Neutral,
175                        },
176                    );
177                    if is_isolate {
178                        valid_isolate_count += 1;
179                    } else {
180                        // The spec doesn't explicitly mention this step, but it is necessary.
181                        // See the reference implementations for comparison.
182                        levels[i] = new_level;
183                    }
184                } else if is_isolate {
185                    overflow_isolate_count += 1;
186                } else if overflow_isolate_count == 0 {
187                    overflow_embedding_count += 1;
188                }
189
190                if !is_isolate {
191                    // X9 +
192                    // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
193                    // (PDF handled below)
194                    processing_classes[i] = BN;
195                }
196            }
197
198            // <http://www.unicode.org/reports/tr9/#X6a>
199            PDI => {
200                if overflow_isolate_count > 0 {
201                    overflow_isolate_count -= 1;
202                } else if valid_isolate_count > 0 {
203                    overflow_embedding_count = 0;
204                    loop {
205                        // Pop everything up to and including the last Isolate status.
206                        match stack.vec.pop() {
207                            None
208                            | Some(Status {
209                                status: OverrideStatus::Isolate,
210                                ..
211                            }) => break,
212                            _ => continue,
213                        }
214                    }
215                    valid_isolate_count -= 1;
216                }
217                let last = stack.last();
218                levels[i] = last.level;
219                match last.status {
220                    OverrideStatus::RTL => processing_classes[i] = R,
221                    OverrideStatus::LTR => processing_classes[i] = L,
222                    _ => {}
223                }
224            }
225
226            // <http://www.unicode.org/reports/tr9/#X7>
227            PDF => {
228                if overflow_isolate_count > 0 {
229                    // do nothing
230                } else if overflow_embedding_count > 0 {
231                    overflow_embedding_count -= 1;
232                } else if stack.last().status != OverrideStatus::Isolate && stack.vec.len() >= 2 {
233                    stack.vec.pop();
234                }
235                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
236                levels[i] = stack.last().level;
237                // X9 part of retaining explicit formatting characters.
238                processing_classes[i] = BN;
239            }
240
241            // Nothing.
242            // BN case moved down to X6, see <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
243            B => {}
244
245            // <http://www.unicode.org/reports/tr9/#X6>
246            _ => {
247                let last = stack.last();
248                levels[i] = last.level;
249                // This condition is not in the spec, but I am pretty sure that is a spec bug.
250                // https://www.unicode.org/L2/L2023/23014-amd-to-uax9.pdf
251                if original_classes[i] != BN {
252                    match last.status {
253                        OverrideStatus::RTL => processing_classes[i] = R,
254                        OverrideStatus::LTR => processing_classes[i] = L,
255                        _ => {}
256                    }
257                }
258            }
259        }
260    }
261}
262
263pub(super) fn prepare_isolating_run_sequences(
264    para_level: Level,
265    original_classes: &[BidiClass],
266    levels: &[Level],
267) -> Vec<IsolatingRunSequence> {
268    let runs = level_runs(levels, original_classes);
269
270    // Compute the set of isolating run sequences.
271    // <http://www.unicode.org/reports/tr9/#BD13>
272    let mut sequences = Vec::with_capacity(runs.len());
273
274    // When we encounter an isolate initiator, we push the current sequence onto the
275    // stack so we can resume it after the matching PDI.
276    let mut stack = vec![Vec::new()];
277
278    use BidiClass::*;
279
280    for run in runs {
281        assert!(!run.is_empty());
282        assert!(!stack.is_empty());
283
284        let start_class = original_classes[run.start];
285        let end_class = original_classes[run.end - 1];
286
287        let mut sequence = if start_class == PDI && stack.len() > 1 {
288            // Continue a previous sequence interrupted by an isolate.
289            stack.pop().unwrap()
290        } else {
291            // Start a new sequence.
292            Vec::new()
293        };
294
295        sequence.push(run);
296
297        if let RLI | LRI | FSI = end_class {
298            // Resume this sequence after the isolate.
299            stack.push(sequence);
300        } else {
301            // This sequence is finished.
302            sequences.push(sequence);
303        }
304    }
305    // Pop any remaining sequences off the stack.
306    sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
307
308    // Determine the `sos` and `eos` class for each sequence.
309    // <http://www.unicode.org/reports/tr9/#X10>
310    sequences
311        .into_iter()
312        .map(|sequence: Vec<LevelRun>| {
313            assert!(!sequence.is_empty());
314
315            let mut result = IsolatingRunSequence {
316                runs: sequence,
317                sos: L,
318                eos: L,
319            };
320
321            let start_of_seq = result.runs[0].start;
322            let runs_len = result.runs.len();
323            let end_of_seq = result.runs[runs_len - 1].end;
324
325            // > (not counting characters removed by X9)
326            let seq_level = result
327                .iter_forwards_from(start_of_seq, 0)
328                .filter(|i| not_removed_by_x9(&original_classes[*i]))
329                .map(|i| levels[i])
330                .next()
331                .unwrap_or(levels[start_of_seq]);
332
333            let end_level = result
334                .iter_backwards_from(end_of_seq, runs_len - 1)
335                .filter(|i| not_removed_by_x9(&original_classes[*i]))
336                .map(|i| levels[i])
337                .next()
338                .unwrap_or(levels[end_of_seq - 1]);
339
340            #[cfg(test)]
341            for run in result.runs.clone() {
342                for idx in run {
343                    if not_removed_by_x9(&original_classes[idx]) {
344                        assert_eq!(seq_level, levels[idx]);
345                    }
346                }
347            }
348
349            // Get the level of the last non-removed char before the runs.
350            let pred_level = match original_classes[..start_of_seq].iter().rposition(not_removed_by_x9) {
351                Some(idx) => levels[idx],
352                None => para_level,
353            };
354
355            // Get the last non-removed character to check if it is an isolate initiator.
356            // The spec calls for an unmatched one, but matched isolate initiators
357            // will never be at the end of a level run (otherwise there would be more to the run).
358            // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
359            let last_non_removed = original_classes[..end_of_seq]
360                .iter()
361                .copied()
362                .rev()
363                .find(not_removed_by_x9)
364                .unwrap_or(BN);
365
366            // Get the level of the next non-removed char after the runs.
367            let succ_level = if let RLI | LRI | FSI = last_non_removed {
368                para_level
369            } else {
370                match original_classes[end_of_seq..].iter().position(not_removed_by_x9) {
371                    Some(idx) => levels[end_of_seq + idx],
372                    None => para_level,
373                }
374            };
375
376            result.sos = std::cmp::max(seq_level, pred_level).bidi_class();
377            result.eos = std::cmp::max(end_level, succ_level).bidi_class();
378            result
379        })
380        .collect()
381}
382
383/// Entries in the directional status stack:
384struct Status {
385    level: Level,
386    status: OverrideStatus,
387}
388
389#[derive(PartialEq)]
390#[expect(clippy::upper_case_acronyms)]
391enum OverrideStatus {
392    Neutral,
393    RTL,
394    LTR,
395    Isolate,
396}
397
398struct DirectionalStatusStack {
399    vec: Vec<Status>,
400}
401
402impl DirectionalStatusStack {
403    fn new() -> Self {
404        DirectionalStatusStack {
405            vec: Vec::with_capacity(Level::max_explicit_depth() as usize + 2),
406        }
407    }
408
409    fn push(&mut self, level: Level, status: OverrideStatus) {
410        self.vec.push(Status { level, status });
411    }
412
413    fn last(&self) -> &Status {
414        self.vec.last().unwrap()
415    }
416}
417
418fn is_rtl(bidi_class: BidiClass) -> bool {
419    use BidiClass::*;
420    matches!(bidi_class, RLE | RLO | RLI)
421}
422
423pub(super) type LevelRun = Range<usize>;
424
425pub(super) struct IsolatingRunSequence {
426    pub runs: Vec<LevelRun>,
427    pub sos: BidiClass, // Start-of-sequence type.
428    pub eos: BidiClass, // End-of-sequence type.
429}
430
431/// Should this character be ignored in steps after X9?
432///
433/// <http://www.unicode.org/reports/tr9/#X9>
434fn removed_by_x9(class: BidiClass) -> bool {
435    use BidiClass::*;
436    matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
437}
438
439// For use as a predicate for `position` / `r_position`
440fn not_removed_by_x9(class: &BidiClass) -> bool {
441    !removed_by_x9(*class)
442}
443
444impl IsolatingRunSequence {
445    /// Given a text-relative position `pos` and an index of the level run it is in,
446    /// produce an iterator of all characters after and pos (`pos..`) that are in this
447    /// run sequence
448    pub(crate) fn iter_forwards_from(&self, pos: usize, level_run_index: usize) -> impl Iterator<Item = usize> + '_ {
449        let runs = &self.runs[level_run_index..];
450
451        // Check that it is in range
452        // (we can't use contains() since we want an inclusive range)
453        debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
454
455        (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
456    }
457
458    /// Given a text-relative position `pos` and an index of the level run it is in,
459    /// produce an iterator of all characters before and excluding pos (`..pos`) that are in this
460    /// run sequence
461    pub(crate) fn iter_backwards_from(&self, pos: usize, level_run_index: usize) -> impl Iterator<Item = usize> + '_ {
462        let prev_runs = &self.runs[..level_run_index];
463        let current = &self.runs[level_run_index];
464
465        // Check that it is in range
466        // (we can't use contains() since we want an inclusive range)
467        debug_assert!(current.start <= pos && pos <= current.end);
468
469        (current.start..pos).rev().chain(prev_runs.iter().rev().flat_map(Clone::clone))
470    }
471}
472
473/// Finds the level runs in a paragraph.
474///
475/// <http://www.unicode.org/reports/tr9/#BD7>
476fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
477    assert_eq!(levels.len(), original_classes.len());
478
479    let mut runs = Vec::new();
480    if levels.is_empty() {
481        return runs;
482    }
483
484    let mut current_run_level = levels[0];
485    let mut current_run_start = 0;
486    for i in 1..levels.len() {
487        if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
488            // End the last run and start a new one.
489            runs.push(current_run_start..i);
490            current_run_level = levels[i];
491            current_run_start = i;
492        }
493    }
494    runs.push(current_run_start..levels.len());
495
496    runs
497}
498
499/// 3.3.4 Resolving Weak Types
500///
501/// <http://www.unicode.org/reports/tr9/#Resolving_Weak_Types>
502pub(super) fn implicit_resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) {
503    use BidiClass::*;
504
505    // Note: The spec treats these steps as individual passes that are applied one after the other
506    // on the entire IsolatingRunSequence at once. We instead collapse it into a single iteration,
507    // which is straightforward for rules that are based on the state of the current character, but not
508    // for rules that care about surrounding characters. To deal with them, we retain additional state
509    // about previous character classes that may have since been changed by later rules.
510
511    // The previous class for the purposes of rule W4/W6, not tracking changes made after or during W4.
512    let mut prev_class_before_w4 = sequence.sos;
513    // The previous class for the purposes of rule W5.
514    let mut prev_class_before_w5 = sequence.sos;
515    // The previous class for the purposes of rule W1, not tracking changes from any other rules.
516    let mut prev_class_before_w1 = sequence.sos;
517    let mut last_strong_is_al = false;
518    let mut et_run_indices = Vec::new(); // for W5
519    let mut bn_run_indices = Vec::new(); // for W5 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
520
521    for (run_index, level_run) in sequence.runs.iter().enumerate() {
522        for i in &mut level_run.clone() {
523            if processing_classes[i] == BN {
524                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
525                // Keeps track of BN runs for W5 in case we see an ET.
526                bn_run_indices.push(i);
527                // BNs aren't real, skip over them.
528                continue;
529            }
530
531            // Store the processing class of all rules before W2/W1.
532            // Used to keep track of the last strong character for W2. W3 is able to insert new strong
533            // characters, so we don't want to be misled by it.
534            let mut w2_processing_class = processing_classes[i];
535
536            // <http://www.unicode.org/reports/tr9/#W1>
537            //
538
539            if processing_classes[i] == NSM {
540                processing_classes[i] = match prev_class_before_w1 {
541                    RLI | LRI | FSI | PDI => ON,
542                    _ => prev_class_before_w1,
543                };
544                // W1 occurs before W2, update this.
545                w2_processing_class = processing_classes[i];
546            }
547
548            prev_class_before_w1 = processing_classes[i];
549
550            // <http://www.unicode.org/reports/tr9/#W2>
551            // <http://www.unicode.org/reports/tr9/#W3>
552            //
553            match processing_classes[i] {
554                EN => {
555                    if last_strong_is_al {
556                        // W2. If previous strong char was AL, change EN to AN.
557                        processing_classes[i] = AN;
558                    }
559                }
560                // W3.
561                AL => processing_classes[i] = R,
562                _ => {}
563            }
564
565            // update last_strong_is_al.
566            match w2_processing_class {
567                L | R => {
568                    last_strong_is_al = false;
569                }
570                AL => {
571                    last_strong_is_al = true;
572                }
573                _ => {}
574            }
575
576            let class_before_w456 = processing_classes[i];
577
578            // <http://www.unicode.org/reports/tr9/#W4>
579            // <http://www.unicode.org/reports/tr9/#W5>
580            // <http://www.unicode.org/reports/tr9/#W6> (separators only)
581            // (see below for W6 terminator code)
582            //
583            match processing_classes[i] {
584                // <http://www.unicode.org/reports/tr9/#W6>
585                EN => {
586                    // W5. If a run of ETs is adjacent to an EN, change the ETs to EN.
587                    for j in &et_run_indices {
588                        processing_classes[*j] = EN;
589                    }
590                    et_run_indices.clear();
591                }
592
593                // <http://www.unicode.org/reports/tr9/#W4>
594                // <http://www.unicode.org/reports/tr9/#W6>
595                ES | CS => {
596                    let mut next_class = sequence
597                        .iter_forwards_from(i, run_index)
598                        .map(|j| processing_classes[j])
599                        // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
600                        .find(not_removed_by_x9)
601                        .unwrap_or(sequence.eos);
602                    if next_class == EN && last_strong_is_al {
603                        // Apply W2 to next_class. We know that last_strong_is_al
604                        // has no chance of changing on this character so we can still assume its value
605                        // will be the same by the time we get to it.
606                        next_class = AN;
607                    }
608                    processing_classes[i] = match (prev_class_before_w4, processing_classes[i], next_class) {
609                        // W4
610                        (EN, ES, EN) | (EN, CS, EN) => EN,
611                        // W4
612                        (AN, CS, AN) => AN,
613                        // W6 (separators only)
614                        (_, _, _) => ON,
615                    };
616
617                    // W6 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
618                    // We have to do this before W5 gets its grubby hands on these characters and thinks
619                    // they're part of an ET run.
620                    // We check for ON to ensure that we had hit the W6 branch above, since this `ES | CS` match
621                    // arm handles both W4 and W6.
622                    if processing_classes[i] == ON {
623                        for idx in sequence.iter_backwards_from(i, run_index) {
624                            let class = &mut processing_classes[idx];
625                            if *class != BN {
626                                break;
627                            }
628                            *class = ON;
629                        }
630                        for idx in sequence.iter_forwards_from(i, run_index) {
631                            let class = &mut processing_classes[idx];
632                            if *class != BN {
633                                break;
634                            }
635                            *class = ON;
636                        }
637                    }
638                }
639                // <http://www.unicode.org/reports/tr9/#W5>
640                ET => {
641                    match prev_class_before_w5 {
642                        EN => processing_classes[i] = EN,
643                        _ => {
644                            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
645                            // If there was a BN run before this, that's now a part of this ET run.
646                            et_run_indices.extend(&bn_run_indices);
647
648                            // In case this is followed by an EN.
649                            et_run_indices.push(i);
650                        }
651                    }
652                }
653                _ => {}
654            }
655
656            // Common loop iteration code
657            //
658
659            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
660            // BN runs would have already continued the loop, clear them before we get to the next one.
661            bn_run_indices.clear();
662
663            // W6 above only deals with separators, so it doesn't change anything W5 cares about,
664            // so we still can update this after running that part of W6.
665            prev_class_before_w5 = processing_classes[i];
666
667            // <http://www.unicode.org/reports/tr9/#W6> (terminators only)
668            // (see above for W6 separator code)
669            //
670            if prev_class_before_w5 != ET {
671                // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
672                for j in &et_run_indices {
673                    processing_classes[*j] = ON;
674                }
675                et_run_indices.clear();
676            }
677
678            // We stashed this before W4/5/6 could get their grubby hands on it, and it's not
679            // used in the W6 terminator code below so we can update it now.
680            prev_class_before_w4 = class_before_w456;
681        }
682    }
683    // Rerun this check in case we ended with a sequence of BNs (i.e., we'd never
684    // hit the end of the for loop above).
685    // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
686    for j in &et_run_indices {
687        processing_classes[*j] = ON;
688    }
689    et_run_indices.clear();
690
691    // W7. If the previous strong char was L, change EN to L.
692    let mut last_strong_is_l = sequence.sos == L;
693    for run in &sequence.runs {
694        for i in run.clone() {
695            match processing_classes[i] {
696                EN if last_strong_is_l => {
697                    processing_classes[i] = L;
698                }
699                L => {
700                    last_strong_is_l = true;
701                }
702                R | AL => {
703                    last_strong_is_l = false;
704                }
705                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
706                // Already scanning past BN here.
707                _ => {}
708            }
709        }
710    }
711}
712
713/// 3.3.5 Resolving Neutral Types
714///
715/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
716pub(super) fn implicit_resolve_neutral(
717    sequence: &IsolatingRunSequence,
718    levels: &[Level],
719    original_classes: &[BidiClass],
720    processing_classes: &mut [BidiClass],
721    brackets: &HashMap<usize, char>,
722) {
723    use BidiClass::*;
724
725    // e = embedding direction
726    let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
727    let not_e = if e == BidiClass::L { BidiClass::R } else { BidiClass::L };
728    // N0. Process bracket pairs.
729
730    // > Identify the bracket pairs in the current isolating run sequence according to BD16.
731    // We use processing_classes, not original_classes, due to BD14/BD15
732    let bracket_pairs = identify_bracket_pairs(sequence, processing_classes, brackets);
733
734    // > For each bracket-pair element in the list of pairs of text positions
735    //
736    // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
737    // for indexing as it will include the opening bracket pair but not the closing one.
738    for pair in bracket_pairs {
739        debug_assert!(
740            pair.start < processing_classes.len(),
741            "identify_bracket_pairs returned a range that is out of bounds!"
742        );
743        debug_assert!(
744            pair.end < processing_classes.len(),
745            "identify_bracket_pairs returned a range that is out of bounds!"
746        );
747        let mut found_e = false;
748        let mut found_not_e = false;
749        let mut class_to_set = None;
750
751        // > Inspect the bidirectional types of the characters enclosed within the bracket pair.
752        //
753        // `pair` is [start, end) so we will end up processing the opening character but not the closing one.
754        //
755        for enclosed_i in sequence.iter_forwards_from(pair.start + 1, pair.start_run) {
756            if enclosed_i >= pair.end {
757                debug_assert!(enclosed_i == pair.end, "If we skipped past this, the iterator is broken");
758                break;
759            }
760            let class = processing_classes[enclosed_i];
761            if class == e {
762                found_e = true;
763            } else if class == not_e {
764                found_not_e = true;
765            } else if class == BidiClass::EN || class == BidiClass::AN {
766                // > Within this scope, bidirectional types EN and AN are treated as R.
767                if e == BidiClass::L {
768                    found_not_e = true;
769                } else {
770                    found_e = true;
771                }
772            }
773
774            // If we have found a character with the class of the embedding direction
775            // we can bail early.
776            if found_e {
777                break;
778            }
779        }
780        // > If any strong type (either L or R) matching the embedding direction is found
781        if found_e {
782            // > .. set the type for both brackets in the pair to match the embedding direction
783            class_to_set = Some(e);
784        // > Otherwise, if there is a strong type it must be opposite the embedding direction
785        } else if found_not_e {
786            // > Therefore, test for an established context with a preceding strong type by
787            // > checking backwards before the opening paired bracket
788            // > until the first strong type (L, R, or sos) is found.
789            // (see note above about processing_classes and character boundaries)
790            let mut previous_strong = sequence
791                .iter_backwards_from(pair.start, pair.start_run)
792                .map(|i| processing_classes[i])
793                .find(|class| *class == BidiClass::L || *class == BidiClass::R || *class == BidiClass::EN || *class == BidiClass::AN)
794                .unwrap_or(sequence.sos);
795
796            // > Within this scope, bidirectional types EN and AN are treated as R.
797            if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN {
798                previous_strong = BidiClass::R;
799            }
800
801            // > If the preceding strong type is also opposite the embedding direction,
802            // > context is established,
803            // > so set the type for both brackets in the pair to that direction.
804            // AND
805            // > Otherwise set the type for both brackets in the pair to the embedding direction.
806            // > Either way it gets set to previous_strong
807            //
808            // Both branches amount to setting the type to the strong type.
809            class_to_set = Some(previous_strong);
810        }
811
812        if let Some(class_to_set) = class_to_set {
813            // Update all processing classes corresponding to the start and end elements, as requested.
814            // We should include all bytes of the character, not the first one.
815            for class in &mut processing_classes[pair.start..pair.start + 1] {
816                *class = class_to_set;
817            }
818            for class in &mut processing_classes[pair.end..pair.end + 1] {
819                *class = class_to_set;
820            }
821            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
822            for idx in sequence.iter_backwards_from(pair.start, pair.start_run) {
823                let class = &mut processing_classes[idx];
824                if *class != BN {
825                    break;
826                }
827                *class = class_to_set;
828            }
829            // > Any number of characters that had original bidirectional character type NSM prior to the application of
830            // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.
831
832            // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
833            // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
834            let nsm_start = pair.start + 1;
835            for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) {
836                let class = original_classes[idx];
837                if class == BidiClass::NSM || processing_classes[idx] == BN {
838                    processing_classes[idx] = class_to_set;
839                } else {
840                    break;
841                }
842            }
843            let nsm_end = pair.end + 1;
844            for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) {
845                let class = original_classes[idx];
846                if class == BidiClass::NSM || processing_classes[idx] == BN {
847                    processing_classes[idx] = class_to_set;
848                } else {
849                    break;
850                }
851            }
852        }
853        // > Otherwise, there are no strong types within the bracket pair
854        // > Therefore, do not set the type for that bracket pair
855    }
856
857    // N1 and N2.
858    // Indices of every byte in this isolating run sequence
859    let mut indices = sequence.runs.iter().flat_map(Clone::clone);
860    let mut prev_class = sequence.sos;
861    while let Some(mut i) = indices.next() {
862        // Process sequences of NI characters.
863        let mut ni_run = Vec::new();
864        // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
865        if is_NI(processing_classes[i]) || processing_classes[i] == BN {
866            // Consume a run of consecutive NI characters.
867            ni_run.push(i);
868            let mut next_class;
869            loop {
870                match indices.next() {
871                    Some(j) => {
872                        i = j;
873                        next_class = processing_classes[j];
874                        // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
875                        if is_NI(next_class) || next_class == BN {
876                            ni_run.push(i);
877                        } else {
878                            break;
879                        }
880                    }
881                    None => {
882                        next_class = sequence.eos;
883                        break;
884                    }
885                };
886            }
887            // N1-N2.
888            //
889            // <http://www.unicode.org/reports/tr9/#N1>
890            // <http://www.unicode.org/reports/tr9/#N2>
891            let new_class = match (prev_class, next_class) {
892                (L, L) => L,
893                (R, R) | (R, AN) | (R, EN) | (AN, R) | (AN, AN) | (AN, EN) | (EN, R) | (EN, AN) | (EN, EN) => R,
894                (_, _) => e,
895            };
896            for j in &ni_run {
897                processing_classes[*j] = new_class;
898            }
899            ni_run.clear();
900        }
901        prev_class = processing_classes[i];
902    }
903}
904
905struct BracketPair {
906    /// The text-relative index of the opening bracket.
907    start: usize,
908    /// The text-relative index of the closing bracket.
909    end: usize,
910    /// The index of the run (in the run sequence) that the opening bracket is in.
911    start_run: usize,
912    /// The index of the run (in the run sequence) that the closing bracket is in.
913    end_run: usize,
914}
915/// 3.1.3 Identifying Bracket Pairs
916///
917/// Returns all paired brackets in the source, as indices into the
918/// text source.
919///
920/// <https://www.unicode.org/reports/tr9/#BD16>
921fn identify_bracket_pairs(
922    run_sequence: &IsolatingRunSequence,
923    original_classes: &[BidiClass],
924    brackets: &HashMap<usize, char>,
925) -> Vec<BracketPair> {
926    let data_source = &unicode_bidi::HardcodedBidiData;
927
928    let mut ret = vec![];
929    let mut stack = vec![];
930
931    for (run_index, level_run) in run_sequence.runs.iter().enumerate() {
932        for i in 0..level_run.len() {
933            let actual_index = level_run.start + i;
934            // All bracket characters are ON.
935            // From BidiBrackets.txt:
936            // > The Unicode property value stability policy guarantees that characters
937            // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
938            if original_classes[actual_index] != BidiClass::ON {
939                continue;
940            }
941
942            if let Some(matched) = brackets
943                .get(&actual_index)
944                .and_then(|c| data_source.bidi_matched_opening_bracket(*c))
945            {
946                if matched.is_open {
947                    // > If an opening paired bracket is found ...
948
949                    // > ... and there is no room in the stack,
950                    // > stop processing BD16 for the remainder of the isolating run sequence.
951                    if stack.len() >= 63 {
952                        break;
953                    }
954                    // > ... push its Bidi_Paired_Bracket property value and its text position onto the stack
955                    stack.push((matched.opening, actual_index, run_index))
956                } else {
957                    // > If a closing paired bracket is found, do the following
958
959                    // > Declare a variable that holds a reference to the current stack element
960                    // > and initialize it with the top element of the stack.
961                    // AND
962                    // > Else, if the current stack element is not at the bottom of the stack
963                    for (stack_index, element) in stack.iter().enumerate().rev() {
964                        // > Compare the closing paired bracket being inspected or its canonical
965                        // > equivalent to the bracket in the current stack element.
966                        if element.0 == matched.opening {
967                            // > If the values match, meaning the two characters form a bracket pair, then
968
969                            // > Append the text position in the current stack element together with the
970                            // > text position of the closing paired bracket to the list.
971                            let pair = BracketPair {
972                                start: element.1,
973                                end: actual_index,
974                                start_run: element.2,
975                                end_run: run_index,
976                            };
977                            ret.push(pair);
978
979                            // > Pop the stack through the current stack element inclusively.
980                            stack.truncate(stack_index);
981                            break;
982                        }
983                    }
984                }
985            }
986        }
987    }
988    // > Sort the list of pairs of text positions in ascending order based on
989    // > the text position of the opening paired bracket.
990    ret.sort_by_key(|r| r.start);
991    ret
992}
993
994/// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI)
995///
996/// <http://www.unicode.org/reports/tr9/#NI>
997#[expect(non_snake_case)]
998fn is_NI(class: BidiClass) -> bool {
999    use BidiClass::*;
1000    matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI)
1001}
1002
1003/// 3.3.6 Resolving Implicit Levels
1004///
1005/// Returns the maximum embedding level in the paragraph.
1006///
1007/// <http://www.unicode.org/reports/tr9/#Resolving_Implicit_Levels>
1008pub(super) fn implicit_resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level {
1009    use BidiClass::*;
1010
1011    let mut max_level = Level::ltr();
1012    assert_eq!(original_classes.len(), levels.len());
1013    for i in 0..levels.len() {
1014        match (levels[i].is_rtl(), original_classes[i]) {
1015            (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"),
1016            (false, R) | (true, L) | (true, EN) | (true, AN) => levels[i].raise(1).expect("Level number error"),
1017            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here
1018            (_, _) => {}
1019        }
1020        max_level = std::cmp::max(max_level, levels[i]);
1021    }
1022
1023    max_level
1024}
1025
1026/// Assign levels to characters removed by rule X9.
1027///
1028/// The levels assigned to these characters are not specified by the algorithm. This function
1029/// assigns each one the level of the previous character, to avoid breaking level runs.
1030pub(super) fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) {
1031    for i in 0..levels.len() {
1032        if removed_by_x9(classes[i]) {
1033            levels[i] = if i > 0 { levels[i - 1] } else { para_level };
1034        }
1035    }
1036}
zng_ext_font/unicode_bidi_util.rs

zng_ext_font/
unicode_bidi_util.rs