1use std::{collections::HashMap, ops};
2
3use crate::emoji_util;
4
5use super::Txt;
6use unicode_bidi::{BidiDataSource as _, BidiInfo};
7
8use zng_layout::context::LayoutDirection;
9pub use zng_layout::context::TextSegmentKind;
10
11pub use unicode_bidi::Level as BidiLevel;
12
/// A contiguous run of the segmented text that has a single kind and a single bidi level.
///
/// A segment only stores where it ends; it starts where the previous segment ends,
/// or at `0` when it is the first segment.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub struct TextSegment {
    /// Kind of the segment.
    pub kind: TextSegmentKind,
    /// Bidi level of the segment.
    pub level: BidiLevel,

    /// Exclusive end of the segment, as a byte index in the segmented text.
    pub end: usize,
}
27impl TextSegment {
28 pub fn direction(self) -> LayoutDirection {
33 from_unic_level(self.level)
34 }
35}
36
/// A text string paired with the segments it was split into.
///
/// The segments are stored in text order, each one ends at a byte index of `text`.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct SegmentedText {
    // The full text.
    text: Txt,
    // Segments of `text`, in text order.
    segments: Vec<TextSegment>,
    // Base direction that was used to compute the bidi levels of the segments.
    base_direction: LayoutDirection,
}
47impl SegmentedText {
48 pub fn new(text: impl Into<Txt>, base_direction: LayoutDirection) -> Self {
50 Self::new_text(text.into(), base_direction)
51 }
52 fn new_text(text: Txt, base_direction: LayoutDirection) -> Self {
53 let mut segs: Vec<TextSegment> = vec![];
54 let text_str: &str = &text;
55 let bidi = BidiInfo::new(text_str, Some(into_unic_level(base_direction)));
56
57 for (offset, kind) in unicode_linebreak::linebreaks(text_str) {
58 if let unicode_linebreak::BreakOpportunity::Mandatory = kind {
60 let start = segs.last().map(|s| s.end).unwrap_or(0);
62
63 let seg = &text_str[start..offset];
66
67 let break_start = if seg.ends_with("\r\n") {
68 offset - 2
70 } else if seg.ends_with('\n') || seg.ends_with('\r') || seg.ends_with('\u{85}') {
71 offset - 1
73 } else {
74 debug_assert_eq!(offset, text_str.len());
76 offset
77 };
78
79 if break_start > start {
80 Self::push_seg(text_str, &bidi, &mut segs, break_start);
82 }
83 if break_start < offset {
84 segs.push(TextSegment {
86 kind: TextSegmentKind::LineBreak,
87 end: offset,
88 level: bidi.levels[break_start],
89 })
90 }
91 }
92 }
94 SegmentedText {
95 text,
96 segments: segs,
97 base_direction,
98 }
99 }
100
101 fn push_seg(text: &str, bidi: &BidiInfo, segs: &mut Vec<TextSegment>, end: usize) {
102 let start = segs.last().map(|s| s.end).unwrap_or(0);
103
104 let mut char_indices = text[start..end].char_indices().peekable();
105
106 let mut kind = TextSegmentKind::LeftToRight;
107 let mut level = BidiLevel::ltr();
108 for (i, c) in &mut char_indices {
109 const ZWJ: char = '\u{200D}'; const VS16: char = '\u{FE0F}'; const CEK: char = '\u{20E3}'; let is_emoji = (kind == TextSegmentKind::Emoji && (
115 c == VS16 || c == CEK || c == ZWJ || emoji_util::is_modifier(c) || emoji_util::is_component(c) ))
121 || (emoji_util::maybe_emoji(c) && (emoji_util::definitely_emoji(c) || (text[start+i..].chars().nth(1).map(|c| c == VS16 || emoji_util::is_modifier(c)).unwrap_or(false))));
125
126 let (c_kind, c_level) = if is_emoji {
127 (TextSegmentKind::Emoji, level)
128 } else {
129 let k = match TextSegmentKind::from(bidi.original_classes[start + i]) {
130 TextSegmentKind::OtherNeutral if unicode_bidi::HardcodedBidiData.bidi_matched_opening_bracket(c).is_some() => {
131 TextSegmentKind::Bracket(c)
132 }
133 k => k,
134 };
135 (k, bidi.levels[start + i])
136 };
137
138 if c_kind != kind || c_level != level || !c_kind.can_merge() {
139 if i > 0 {
140 segs.push(TextSegment {
141 kind,
142 end: i + start,
143 level,
144 });
145 }
146 level = c_level;
147 kind = c_kind;
148 }
149 }
150 segs.push(TextSegment { kind, end, level });
151 }
152
    /// The full text.
    pub fn text(&self) -> &Txt {
        &self.text
    }
157
    /// The text segments, in text order.
    pub fn segs(&self) -> &[TextSegment] {
        &self.segments
    }
162
163 pub fn seg_from_char(&self, from: usize) -> usize {
165 match self.segments.binary_search_by_key(&from, |s| s.end) {
166 Ok(e) => e + 1,
167 Err(s) => s,
168 }
169 }
170
    /// The base direction stored with the text.
    ///
    /// When the text is created by [`Self::new`] this is the direction that was used to
    /// compute the bidi levels of the segments.
    pub fn base_direction(&self) -> LayoutDirection {
        self.base_direction
    }
178
179 pub fn is_bidi(&self) -> bool {
181 for seg in self.segments.iter() {
182 if seg.direction() != self.base_direction {
183 return true;
184 }
185 }
186 false
187 }
188
189 pub fn get(&self, index: usize) -> Option<(&str, TextSegment)> {
191 if let Some(&seg) = self.segments.get(index) {
192 let text = if index == 0 {
193 &self.text[..seg.end]
194 } else {
195 &self.text[self.segments[index - 1].end..seg.end]
196 };
197
198 Some((text, seg))
199 } else {
200 None
201 }
202 }
203
204 pub fn get_clone(&self, index: usize) -> Option<SegmentedText> {
206 self.get(index).map(|(txt, seg)| SegmentedText {
207 text: txt.to_owned().into(),
208 segments: vec![TextSegment { end: txt.len(), ..seg }],
209 base_direction: self.base_direction,
210 })
211 }
212
    /// Returns `true` if there are no segments.
    pub fn is_empty(&self) -> bool {
        self.segments.is_empty()
    }
217
218 pub fn into_parts(self) -> (Txt, Vec<TextSegment>, LayoutDirection) {
220 (self.text, self.segments, self.base_direction)
221 }
222
223 pub fn from_parts(text: Txt, segments: Vec<TextSegment>, base_direction: LayoutDirection) -> Self {
232 assert_eq!(text.is_empty(), segments.is_empty());
233 if !text.is_empty() {
234 assert!(segments.last().unwrap().end == text.len());
235 }
236
237 SegmentedText {
238 text,
239 segments,
240 base_direction,
241 }
242 }
243
244 pub fn iter(&self) -> SegmentedTextIter<'_> {
256 SegmentedTextIter {
257 text: &self.text,
258 start: 0,
259 segs_iter: self.segments.iter(),
260 }
261 }
262
263 pub fn text_range(&self, segs_range: ops::Range<usize>) -> ops::Range<usize> {
265 let start = if segs_range.start == 0 {
266 0
267 } else {
268 self.segments[segs_range.start - 1].end
269 };
270 let end = self.segments[..segs_range.end].last().map(|s| s.end).unwrap_or(0);
271 start..end
272 }
273
274 pub fn reorder_line_to_ltr(&self, segs_range: ops::Range<usize>) -> Vec<usize> {
278 let mut r = Vec::with_capacity(segs_range.len());
279 let offset = segs_range.start;
280 unicode_bidi_sort(
281 self.base_direction,
282 self.segments[segs_range].iter().map(|s| (s.kind, s.level)),
283 offset,
284 &mut r,
285 );
286 r
287 }
288
289 pub fn snap_char_boundary(&self, i: usize) -> usize {
294 if i >= self.text.len() {
295 self.text.len()
296 } else {
297 let mut next = i;
298 while !self.text.is_char_boundary(next) {
299 next += 1;
300 }
301 next
302 }
303 }
304
305 pub fn snap_grapheme_boundary(&self, i: usize) -> usize {
310 let i = self.snap_char_boundary(i);
311 if i == self.text.len() {
312 i
313 } else {
314 let mut seg_start = 0;
315 for seg in self.segments.iter() {
316 if seg.end > i {
317 break;
318 }
319 seg_start = seg.end;
320 }
321 let s = &self.text[seg_start..];
322
323 let seg_i = i - seg_start;
324 let mut best_before = 0;
325 let mut best_after = s.len();
326 for (i, _) in unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true) {
327 if i > seg_i {
328 best_after = i;
329 break;
330 }
331 best_before = i;
332 }
333
334 let best = if best_after - seg_i > seg_i - best_before {
335 best_before
336 } else {
337 best_after
338 };
339 seg_start + best
340 }
341 }
342
343 pub fn next_insert_index(&self, from: usize) -> usize {
351 if from == self.text.len() {
352 from
353 } else {
354 let s = &self.text.as_str()[from..];
355 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true).map(|(i, _)| i + from);
356 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
357 iter.next().unwrap_or(self.text.len())
358 }
359 }
360
361 pub fn prev_insert_index(&self, from: usize) -> usize {
369 if from == self.text.len() {
370 let s = &self.text.as_str()[..from];
371 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
372 .map(|(i, _)| i)
373 .rev();
374 iter.next().unwrap_or(0)
375 } else {
376 let s = self.text.as_str();
377
378 let inclusive_from = s[from..].char_indices().nth(1).map(|(b, _)| from + b).unwrap_or_else(|| s.len());
380
381 let s = &self.text.as_str()[..inclusive_from];
382 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
383 .map(|(i, _)| i)
384 .rev();
385 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
386 iter.next().unwrap_or(0)
387 }
388 }
389
390 pub fn next_word_index(&self, from: usize) -> usize {
394 let mut segs = self.segments[self.seg_from_char(from)..].iter();
395
396 if let Some(seg) = segs.next() {
397 if seg.kind.is_line_break() {
398 return seg.end;
399 }
400 let mut start = seg.end;
401 for seg in segs {
402 if seg.kind.is_word() || seg.kind.is_line_break() {
403 return start;
404 }
405 start = seg.end;
406 }
407 }
408 self.text.len()
409 }
410
411 pub fn next_word_end_index(&self, from: usize) -> usize {
415 let mut segs = self.segments[self.seg_from_char(from)..].iter();
416 if let Some(seg) = segs.next() {
417 if seg.kind.is_word() || seg.kind.is_line_break() {
418 return seg.end;
419 }
420 for seg in segs {
421 if seg.kind.is_word() || seg.kind.is_line_break() {
422 return seg.end;
423 }
424 }
425 }
426 self.text.len()
427 }
428
    /// Searches backwards from `from` for the start of a word or the end of a line break.
    ///
    /// The segments are visited in reverse, starting at the segment that contains `from`
    /// (or the last segment). Returns the end of the first segment that ends before `from`
    /// and is either followed by a word segment or is itself a line break. If `from` is exactly
    /// the end of a line break segment the start of that line break is returned.
    /// Returns `0` if nothing matches.
    pub fn prev_word_index(&self, from: usize) -> usize {
        let seg_i = self.seg_from_char(from);
        let mut segs = if seg_i < self.segments.len() {
            self.segments[..=seg_i].iter().rev()
        } else {
            self.segs().iter().rev()
        };
        // kind of the segment visited in the previous iteration, that is,
        // the segment that comes right after `seg` in text order.
        let mut seg_kind = TextSegmentKind::Space;
        for seg in &mut segs {
            if seg.end < from {
                // `seg.end` is the start of the segment visited before
                if seg_kind.is_word() || seg.kind.is_line_break() {
                    return seg.end;
                }
                seg_kind = seg.kind;
                // same check for all remaining segments, the iterator is consumed
                // here so the outer loop must not continue after this
                for seg in segs {
                    if seg_kind.is_word() || seg.kind.is_line_break() {
                        return seg.end;
                    }
                    seg_kind = seg.kind;
                }
                break;
            } else if seg.end == from && seg.kind.is_line_break() {
                // `from` is right after a line break, return the start of the line break
                return segs.next().map(|p| p.end).unwrap_or(0);
            }
            seg_kind = seg.kind;
        }
        0
    }
463
464 pub fn line_start_index(&self, from: usize) -> usize {
470 let line_break = self.text.as_str()[..from]
471 .char_indices()
472 .rev()
473 .find(|(_, c)| "\n\r\u{85}".contains(*c));
474
475 match line_break {
476 Some((i, _)) => i + 1,
477 None => 0,
478 }
479 }
480
481 pub fn line_end_index(&self, from: usize) -> usize {
487 if from == self.text.len() {
488 return from;
489 }
490
491 let line_break = self.text.as_str()[from..].char_indices().find(|(_, c)| "\n\r\u{85}".contains(*c));
492
493 match line_break {
494 Some((i, _)) => from + i,
495 None => self.text.len(),
496 }
497 }
498
499 pub fn delete_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
510 let mut end = from;
511 for _ in 0..count {
512 let e = self.next_insert_index(end);
513 if e == end {
514 break;
515 }
516 end = e;
517 }
518
519 from..end
520 }
521
522 pub fn backspace_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
532 let mut start = from;
533 for _ in 0..count {
534 let s = self.backspace_start(start);
535 if s == start {
536 break;
537 }
538 start = s;
539 }
540 start..from
541 }
542 fn backspace_start(&self, from: usize) -> usize {
543 let text = &self.text[..from];
544 let mut start = from;
545 for (i, c) in text.char_indices().rev() {
546 start = i;
547 match c {
548 '\u{200D}' => continue, '\n' => {
550 if text[..i].ends_with('\r') {
551 start = i - 1;
552 }
553 }
554 c if c == '\u{FE0F}' || emoji_util::is_modifier(c) => {
555 if let Some((i, c)) = text[..i].char_indices().next_back()
557 && emoji_util::maybe_emoji(c)
558 {
559 start = i;
560 }
561 }
562 _ => {}
563 }
564 break;
565 }
566 start
567 }
568
569 pub fn backspace_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
573 let mut start = from;
574 for _ in 0..count {
575 let s = self.prev_word_index(start);
576 if s == start {
577 break;
578 }
579 start = s;
580 }
581 start..from
582 }
583
584 pub fn delete_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
586 let mut end = from;
587 for _ in 0..count {
588 let e = self.next_word_end_index(end);
589 if e == end {
590 break;
591 }
592 end = e;
593 }
594
595 from..end
596 }
597}
598
599pub fn unicode_bidi_levels(base_direction: LayoutDirection, line: impl Iterator<Item = TextSegmentKind>, levels: &mut Vec<BidiLevel>) {
603 let mut original_classes = Vec::with_capacity(line.size_hint().0);
604 let mut brackets = HashMap::default();
605 for (i, k) in line.enumerate() {
606 original_classes.push(k.into());
607 if let TextSegmentKind::Bracket(c) = k {
608 brackets.insert(i, c);
609 }
610 }
611
612 unicode_bidi_levels_impl(levels, base_direction, original_classes, brackets);
613}
614fn unicode_bidi_levels_impl(
615 levels: &mut Vec<BidiLevel>,
616 base_direction: LayoutDirection,
617 original_classes: Vec<unicode_bidi::BidiClass>,
618 brackets: HashMap<usize, char>,
619) {
620 levels.clear();
621 let para_level = into_unic_level(base_direction);
622 levels.resize(original_classes.len(), para_level);
623
624 if !original_classes.is_empty() {
625 let mut processing_classes = original_classes.clone();
626
627 super::unicode_bidi_util::explicit_compute(para_level, &original_classes, levels, &mut processing_classes);
628
629 let sequences = super::unicode_bidi_util::prepare_isolating_run_sequences(para_level, &original_classes, levels);
630 for sequence in &sequences {
631 super::unicode_bidi_util::implicit_resolve_weak(sequence, &mut processing_classes);
632 super::unicode_bidi_util::implicit_resolve_neutral(sequence, levels, &original_classes, &mut processing_classes, &brackets);
633 }
634 super::unicode_bidi_util::implicit_resolve_levels(&processing_classes, levels);
635
636 super::unicode_bidi_util::assign_levels_to_removed_chars(para_level, &original_classes, levels);
637 }
638}
639
640pub fn unicode_bidi_sort(
644 base_direction: LayoutDirection,
645 line: impl Iterator<Item = (TextSegmentKind, BidiLevel)>,
646 idx_offset: usize,
647 sort_map: &mut Vec<usize>,
648) {
649 sort_map.clear();
650
651 let cap = line.size_hint().0;
652 let mut line_classes = Vec::with_capacity(cap);
653 let mut levels = Vec::with_capacity(cap);
654 for (kind, level) in line {
655 line_classes.push(kind.into());
656 levels.push(level);
657 }
658
659 if !levels.is_empty() {
660 let (directions, vis_ranges) = super::unicode_bidi_util::visual_runs(levels, line_classes, into_unic_level(base_direction));
661
662 for vis_range in vis_ranges {
663 if directions[vis_range.start].is_rtl() {
664 for i in vis_range.rev() {
665 sort_map.push(idx_offset + i);
666 }
667 } else {
668 for i in vis_range {
669 sort_map.push(idx_offset + i);
670 }
671 }
672 }
673 }
674}
675
/// Iterator over the segments of a [`SegmentedText`].
///
/// Each item is the text slice of the segment and the segment.
pub struct SegmentedTextIter<'a> {
    // The full text.
    text: &'a str,
    // Byte index where the next segment starts.
    start: usize,
    // Remaining segments.
    segs_iter: std::slice::Iter<'a, TextSegment>,
}
684impl<'a> Iterator for SegmentedTextIter<'a> {
685 type Item = (&'a str, TextSegment);
686 fn next(&mut self) -> Option<Self::Item> {
687 if let Some(&seg) = self.segs_iter.next() {
688 let r = Some((&self.text[self.start..seg.end], seg));
689 self.start = seg.end;
690 r
691 } else {
692 None
693 }
694 }
695}
696
697fn from_unic_level(d: unicode_bidi::Level) -> LayoutDirection {
698 if d.is_ltr() { LayoutDirection::LTR } else { LayoutDirection::RTL }
699}
700fn into_unic_level(d: LayoutDirection) -> unicode_bidi::Level {
701 match d {
702 LayoutDirection::LTR => unicode_bidi::Level::ltr(),
703 LayoutDirection::RTL => unicode_bidi::Level::rtl(),
704 }
705}
706
707#[cfg(test)]
708mod tests {
709 use zng_layout::context::{LayoutDirection, TextSegmentKind};
710 use zng_txt::ToTxt;
711
712 use crate::{BidiLevel, SegmentedText, TextSegment};
713
714 #[test]
715 fn segments() {
716 let test = "a\nb\r\nc\td ";
717 let actual = SegmentedText::new(test, LayoutDirection::LTR);
718
719 fn seg(kind: TextSegmentKind, end: usize) -> TextSegment {
720 TextSegment {
721 kind,
722 end,
723 level: BidiLevel::ltr(),
724 }
725 }
726 use TextSegmentKind::*;
727
728 let expected = SegmentedText {
729 text: test.to_txt(),
730 segments: vec![
731 seg(LeftToRight, 1),
732 seg(LineBreak, 2),
733 seg(LeftToRight, 3),
734 seg(LineBreak, 5),
735 seg(LeftToRight, 6),
736 seg(Tab, 7),
737 seg(LeftToRight, 8),
738 seg(Space, 9),
739 ],
740 base_direction: LayoutDirection::LTR,
741 };
742
743 assert_eq!(expected, actual);
744 }
745
746 #[test]
747 fn reorder_line() {
748 let test = "0 2 4";
749 let txt = SegmentedText::new(test, LayoutDirection::RTL);
750
751 let expected = vec![4, 3, 2, 1, 0];
752 let actual = txt.reorder_line_to_ltr(0..test.len());
753
754 assert_eq!(expected, actual);
755 }
756
757 #[test]
758 fn reorder_line_issue() {
759 let test = " المادة 1";
760 let txt = SegmentedText::new(test, LayoutDirection::RTL);
761
762 let expected = vec![3, 2, 1, 0];
763 let actual = txt.reorder_line_to_ltr(0..4);
764
765 assert_eq!(expected, actual);
766 }
767
    /// Checks the segment kinds of a text that mixes quotes, an emoji sequence,
    /// plain `1#` chars and keycap sequences.
    #[test]
    fn emoji_seg() {
        let test = "'🙎🏻♀️'1# 1️⃣#️⃣";
        let txt = SegmentedText::new(test, LayoutDirection::LTR);
        let k: Vec<_> = txt.segs().iter().map(|s| s.kind).collect();

        assert_eq!(
            vec![
                // opening quote
                TextSegmentKind::OtherNeutral,
                // the emoji sequence between the quotes
                TextSegmentKind::Emoji,
                // closing quote
                TextSegmentKind::OtherNeutral,
                // plain `1`
                TextSegmentKind::EuropeanNumber,
                // plain `#`
                TextSegmentKind::EuropeanTerminator,
                // space
                TextSegmentKind::Space,
                // the two keycap sequences, merged in one segment
                TextSegmentKind::Emoji,
            ],
            k
        );
    }
787
    /// Every segment of the flag text must be of the `Emoji` kind.
    #[test]
    fn emoji_issues() {
        let test = "🏴";
        let txt = SegmentedText::new(test, LayoutDirection::LTR);
        for (t, seg) in txt.iter() {
            // the segment text is included in the failure message
            assert_eq!(seg.kind, TextSegmentKind::Emoji, "text: {t:?}");
        }
    }
796}