1use std::{collections::HashMap, ops};
2
3use crate::emoji_util;
4
5use super::Txt;
6use unicode_bidi::{BidiDataSource as _, BidiInfo};
7
8use zng_layout::context::LayoutDirection;
9pub use zng_layout::context::TextSegmentKind;
10
11pub use unicode_bidi::Level as BidiLevel;
12
13#[derive(Clone, Copy, Debug, PartialEq, Eq)]
15#[non_exhaustive]
16pub struct TextSegment {
17 pub kind: TextSegmentKind,
19 pub level: BidiLevel,
21
22 pub end: usize,
26}
27impl TextSegment {
28 pub fn direction(self) -> LayoutDirection {
33 from_unic_level(self.level)
34 }
35}
36
37#[derive(Default, Debug, Clone, PartialEq, Eq)]
42pub struct SegmentedText {
43 text: Txt,
44 segments: Vec<TextSegment>,
45 base_direction: LayoutDirection,
46}
47impl SegmentedText {
48 pub fn new(text: impl Into<Txt>, base_direction: LayoutDirection) -> Self {
50 Self::new_text(text.into(), base_direction)
51 }
52 fn new_text(text: Txt, base_direction: LayoutDirection) -> Self {
53 let mut segs: Vec<TextSegment> = vec![];
54 let text_str: &str = &text;
55 let bidi = BidiInfo::new(text_str, Some(into_unic_level(base_direction)));
56
57 for (offset, kind) in unicode_linebreak::linebreaks(text_str) {
58 if let unicode_linebreak::BreakOpportunity::Mandatory = kind {
60 let start = segs.last().map(|s| s.end).unwrap_or(0);
62
63 let seg = &text_str[start..offset];
66
67 let break_start = if seg.ends_with("\r\n") {
68 offset - 2
70 } else if seg.ends_with('\n') || seg.ends_with('\r') || seg.ends_with('\u{85}') {
71 offset - 1
73 } else {
74 debug_assert_eq!(offset, text_str.len());
76 offset
77 };
78
79 if break_start > start {
80 Self::push_seg(text_str, &bidi, &mut segs, break_start);
82 }
83 if break_start < offset {
84 segs.push(TextSegment {
86 kind: TextSegmentKind::LineBreak,
87 end: offset,
88 level: bidi.levels[break_start],
89 })
90 }
91 }
92 }
94 SegmentedText {
95 text,
96 segments: segs,
97 base_direction,
98 }
99 }
100
101 fn push_seg(text: &str, bidi: &BidiInfo, segs: &mut Vec<TextSegment>, end: usize) {
102 let start = segs.last().map(|s| s.end).unwrap_or(0);
103
104 let mut char_indices = text[start..end].char_indices().peekable();
105
106 let mut kind = TextSegmentKind::LeftToRight;
107 let mut level = BidiLevel::ltr();
108 for (i, c) in &mut char_indices {
109 const ZWJ: char = '\u{200D}'; const VS16: char = '\u{FE0F}'; const CEK: char = '\u{20E3}'; let is_emoji = (kind == TextSegmentKind::Emoji && (
115 c == VS16 || c == CEK || c == ZWJ || emoji_util::is_modifier(c) || emoji_util::is_component(c) ))
121 || (emoji_util::maybe_emoji(c) && (emoji_util::definitely_emoji(c) || (text[start+i..].chars().nth(1).map(|c| c == VS16 || emoji_util::is_modifier(c)).unwrap_or(false))));
125
126 let (c_kind, c_level) = if is_emoji {
127 (TextSegmentKind::Emoji, level)
128 } else {
129 let raw_k = TextSegmentKind::from(bidi.original_classes[start + i]);
130 if i > 0
131 && let TextSegmentKind::NonSpacingMark = raw_k
132 {
133 (kind, level)
137 } else {
138 let mut k = raw_k;
139 if let TextSegmentKind::OtherNeutral = raw_k
140 && unicode_bidi::HardcodedBidiData.bidi_matched_opening_bracket(c).is_some()
141 {
142 k = TextSegmentKind::Bracket(c);
143 }
144 (k, bidi.levels[start + i])
145 }
146 };
147
148 if c_kind != kind || c_level != level || !c_kind.can_merge() {
149 if i > 0 {
150 segs.push(TextSegment {
151 kind,
152 end: i + start,
153 level,
154 });
155 }
156 level = c_level;
157 kind = c_kind;
158 }
159 }
160 segs.push(TextSegment { kind, end, level });
161 }
162
163 pub fn text(&self) -> &Txt {
165 &self.text
166 }
167
168 pub fn segs(&self) -> &[TextSegment] {
170 &self.segments
171 }
172
173 pub fn seg_from_char(&self, from: usize) -> usize {
175 match self.segments.binary_search_by_key(&from, |s| s.end) {
176 Ok(e) => e + 1,
177 Err(s) => s,
178 }
179 }
180
181 pub fn base_direction(&self) -> LayoutDirection {
186 self.base_direction
187 }
188
189 pub fn is_bidi(&self) -> bool {
191 for seg in self.segments.iter() {
192 if seg.direction() != self.base_direction {
193 return true;
194 }
195 }
196 false
197 }
198
199 pub fn get(&self, index: usize) -> Option<(&str, TextSegment)> {
201 if let Some(&seg) = self.segments.get(index) {
202 let text = if index == 0 {
203 &self.text[..seg.end]
204 } else {
205 &self.text[self.segments[index - 1].end..seg.end]
206 };
207
208 Some((text, seg))
209 } else {
210 None
211 }
212 }
213
214 pub fn get_clone(&self, index: usize) -> Option<SegmentedText> {
216 self.get(index).map(|(txt, seg)| SegmentedText {
217 text: txt.to_owned().into(),
218 segments: vec![TextSegment { end: txt.len(), ..seg }],
219 base_direction: self.base_direction,
220 })
221 }
222
223 pub fn is_empty(&self) -> bool {
225 self.segments.is_empty()
226 }
227
228 pub fn into_parts(self) -> (Txt, Vec<TextSegment>, LayoutDirection) {
230 (self.text, self.segments, self.base_direction)
231 }
232
233 pub fn from_parts(text: Txt, segments: Vec<TextSegment>, base_direction: LayoutDirection) -> Self {
242 assert_eq!(text.is_empty(), segments.is_empty());
243 if !text.is_empty() {
244 assert!(segments.last().unwrap().end == text.len());
245 }
246
247 SegmentedText {
248 text,
249 segments,
250 base_direction,
251 }
252 }
253
254 pub fn iter(&self) -> SegmentedTextIter<'_> {
266 SegmentedTextIter {
267 text: &self.text,
268 start: 0,
269 segs_iter: self.segments.iter(),
270 }
271 }
272
273 pub fn text_range(&self, segs_range: ops::Range<usize>) -> ops::Range<usize> {
275 let start = if segs_range.start == 0 {
276 0
277 } else {
278 self.segments[segs_range.start - 1].end
279 };
280 let end = self.segments[..segs_range.end].last().map(|s| s.end).unwrap_or(0);
281 start..end
282 }
283
284 pub fn reorder_line_to_ltr(&self, segs_range: ops::Range<usize>) -> Vec<usize> {
288 let mut r = Vec::with_capacity(segs_range.len());
289 let offset = segs_range.start;
290 unicode_bidi_sort(
291 self.base_direction,
292 self.segments[segs_range].iter().map(|s| (s.kind, s.level)),
293 offset,
294 &mut r,
295 );
296 r
297 }
298
299 pub fn snap_char_boundary(&self, i: usize) -> usize {
304 if i >= self.text.len() {
305 self.text.len()
306 } else {
307 let mut next = i;
308 while !self.text.is_char_boundary(next) {
309 next += 1;
310 }
311 next
312 }
313 }
314
315 pub fn snap_grapheme_boundary(&self, i: usize) -> usize {
320 let i = self.snap_char_boundary(i);
321 if i == self.text.len() {
322 i
323 } else {
324 let mut seg_start = 0;
325 for seg in self.segments.iter() {
326 if seg.end > i {
327 break;
328 }
329 seg_start = seg.end;
330 }
331 let s = &self.text[seg_start..];
332
333 let seg_i = i - seg_start;
334 let mut best_before = 0;
335 let mut best_after = s.len();
336 for (i, _) in unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true) {
337 if i > seg_i {
338 best_after = i;
339 break;
340 }
341 best_before = i;
342 }
343
344 let best = if best_after - seg_i > seg_i - best_before {
345 best_before
346 } else {
347 best_after
348 };
349 seg_start + best
350 }
351 }
352
353 pub fn next_insert_index(&self, from: usize) -> usize {
361 if from == self.text.len() {
362 from
363 } else {
364 let s = &self.text.as_str()[from..];
365 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true).map(|(i, _)| i + from);
366 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
367 iter.next().unwrap_or(self.text.len())
368 }
369 }
370
371 pub fn prev_insert_index(&self, from: usize) -> usize {
379 if from == self.text.len() {
380 let s = &self.text.as_str()[..from];
381 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
382 .map(|(i, _)| i)
383 .rev();
384 iter.next().unwrap_or(0)
385 } else {
386 let s = self.text.as_str();
387
388 let inclusive_from = s[from..].char_indices().nth(1).map(|(b, _)| from + b).unwrap_or_else(|| s.len());
390
391 let s = &self.text.as_str()[..inclusive_from];
392 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
393 .map(|(i, _)| i)
394 .rev();
395 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
396 iter.next().unwrap_or(0)
397 }
398 }
399
400 pub fn next_word_index(&self, from: usize) -> usize {
404 let mut segs = self.segments[self.seg_from_char(from)..].iter();
405
406 if let Some(seg) = segs.next() {
407 if seg.kind.is_line_break() {
408 return seg.end;
409 }
410 let mut start = seg.end;
411 for seg in segs {
412 if seg.kind.is_word() || seg.kind.is_line_break() {
413 return start;
414 }
415 start = seg.end;
416 }
417 }
418 self.text.len()
419 }
420
421 pub fn next_word_end_index(&self, from: usize) -> usize {
425 let mut segs = self.segments[self.seg_from_char(from)..].iter();
426 if let Some(seg) = segs.next() {
427 if seg.kind.is_word() || seg.kind.is_line_break() {
428 return seg.end;
429 }
430 for seg in segs {
431 if seg.kind.is_word() || seg.kind.is_line_break() {
432 return seg.end;
433 }
434 }
435 }
436 self.text.len()
437 }
438
439 pub fn prev_word_index(&self, from: usize) -> usize {
443 let seg_i = self.seg_from_char(from);
444 let mut segs = if seg_i < self.segments.len() {
445 self.segments[..=seg_i].iter().rev()
446 } else {
447 self.segs().iter().rev()
448 };
449 let mut seg_kind = TextSegmentKind::Space;
450 for seg in &mut segs {
451 if seg.end < from {
452 if seg_kind.is_word() || seg.kind.is_line_break() {
453 return seg.end;
455 }
456 seg_kind = seg.kind;
457 for seg in segs {
458 if seg_kind.is_word() || seg.kind.is_line_break() {
459 return seg.end;
461 }
462 seg_kind = seg.kind;
463 }
464 break;
465 } else if seg.end == from && seg.kind.is_line_break() {
466 return segs.next().map(|p| p.end).unwrap_or(0);
468 }
469 seg_kind = seg.kind;
470 }
471 0
472 }
473
474 pub fn line_start_index(&self, from: usize) -> usize {
480 let line_break = self.text.as_str()[..from]
481 .char_indices()
482 .rev()
483 .find(|(_, c)| "\n\r\u{85}".contains(*c));
484
485 match line_break {
486 Some((i, _)) => i + 1,
487 None => 0,
488 }
489 }
490
491 pub fn line_end_index(&self, from: usize) -> usize {
497 if from == self.text.len() {
498 return from;
499 }
500
501 let line_break = self.text.as_str()[from..].char_indices().find(|(_, c)| "\n\r\u{85}".contains(*c));
502
503 match line_break {
504 Some((i, _)) => from + i,
505 None => self.text.len(),
506 }
507 }
508
509 pub fn delete_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
520 let mut end = from;
521 for _ in 0..count {
522 let e = self.next_insert_index(end);
523 if e == end {
524 break;
525 }
526 end = e;
527 }
528
529 from..end
530 }
531
532 pub fn backspace_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
542 let mut start = from;
543 for _ in 0..count {
544 let s = self.backspace_start(start);
545 if s == start {
546 break;
547 }
548 start = s;
549 }
550 start..from
551 }
552 fn backspace_start(&self, from: usize) -> usize {
553 let text = &self.text[..from];
554 let mut start = from;
555 for (i, c) in text.char_indices().rev() {
556 start = i;
557 match c {
558 '\u{200D}' => continue, '\n' => {
560 if text[..i].ends_with('\r') {
561 start = i - 1;
562 }
563 }
564 c if c == '\u{FE0F}' || emoji_util::is_modifier(c) => {
565 if let Some((i, c)) = text[..i].char_indices().next_back()
567 && emoji_util::maybe_emoji(c)
568 {
569 start = i;
570 }
571 }
572 _ => {}
573 }
574 break;
575 }
576 start
577 }
578
579 pub fn backspace_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
583 let mut start = from;
584 for _ in 0..count {
585 let s = self.prev_word_index(start);
586 if s == start {
587 break;
588 }
589 start = s;
590 }
591 start..from
592 }
593
594 pub fn delete_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
596 let mut end = from;
597 for _ in 0..count {
598 let e = self.next_word_end_index(end);
599 if e == end {
600 break;
601 }
602 end = e;
603 }
604
605 from..end
606 }
607}
608
609pub fn unicode_bidi_levels(base_direction: LayoutDirection, line: impl Iterator<Item = TextSegmentKind>, levels: &mut Vec<BidiLevel>) {
613 let mut original_classes = Vec::with_capacity(line.size_hint().0);
614 let mut brackets = HashMap::default();
615 for (i, k) in line.enumerate() {
616 original_classes.push(k.into());
617 if let TextSegmentKind::Bracket(c) = k {
618 brackets.insert(i, c);
619 }
620 }
621
622 unicode_bidi_levels_impl(levels, base_direction, original_classes, brackets);
623}
624fn unicode_bidi_levels_impl(
625 levels: &mut Vec<BidiLevel>,
626 base_direction: LayoutDirection,
627 original_classes: Vec<unicode_bidi::BidiClass>,
628 brackets: HashMap<usize, char>,
629) {
630 levels.clear();
631 let para_level = into_unic_level(base_direction);
632 levels.resize(original_classes.len(), para_level);
633
634 if !original_classes.is_empty() {
635 let mut processing_classes = original_classes.clone();
636
637 super::unicode_bidi_util::explicit_compute(para_level, &original_classes, levels, &mut processing_classes);
638
639 let sequences = super::unicode_bidi_util::prepare_isolating_run_sequences(para_level, &original_classes, levels);
640 for sequence in &sequences {
641 super::unicode_bidi_util::implicit_resolve_weak(sequence, &mut processing_classes);
642 super::unicode_bidi_util::implicit_resolve_neutral(sequence, levels, &original_classes, &mut processing_classes, &brackets);
643 }
644 super::unicode_bidi_util::implicit_resolve_levels(&processing_classes, levels);
645
646 super::unicode_bidi_util::assign_levels_to_removed_chars(para_level, &original_classes, levels);
647 }
648}
649
650pub fn unicode_bidi_sort(
654 base_direction: LayoutDirection,
655 line: impl Iterator<Item = (TextSegmentKind, BidiLevel)>,
656 idx_offset: usize,
657 sort_map: &mut Vec<usize>,
658) {
659 sort_map.clear();
660
661 let cap = line.size_hint().0;
662 let mut line_classes = Vec::with_capacity(cap);
663 let mut levels = Vec::with_capacity(cap);
664 for (kind, level) in line {
665 line_classes.push(kind.into());
666 levels.push(level);
667 }
668
669 if !levels.is_empty() {
670 let (directions, vis_ranges) = super::unicode_bidi_util::visual_runs(levels, line_classes, into_unic_level(base_direction));
671
672 for vis_range in vis_ranges {
673 if directions[vis_range.start].is_rtl() {
674 for i in vis_range.rev() {
675 sort_map.push(idx_offset + i);
676 }
677 } else {
678 for i in vis_range {
679 sort_map.push(idx_offset + i);
680 }
681 }
682 }
683 }
684}
685
686pub struct SegmentedTextIter<'a> {
690 text: &'a str,
691 start: usize,
692 segs_iter: std::slice::Iter<'a, TextSegment>,
693}
694impl<'a> Iterator for SegmentedTextIter<'a> {
695 type Item = (&'a str, TextSegment);
696 fn next(&mut self) -> Option<Self::Item> {
697 if let Some(&seg) = self.segs_iter.next() {
698 let r = Some((&self.text[self.start..seg.end], seg));
699 self.start = seg.end;
700 r
701 } else {
702 None
703 }
704 }
705}
706
707fn from_unic_level(d: unicode_bidi::Level) -> LayoutDirection {
708 if d.is_ltr() { LayoutDirection::LTR } else { LayoutDirection::RTL }
709}
710fn into_unic_level(d: LayoutDirection) -> unicode_bidi::Level {
711 match d {
712 LayoutDirection::LTR => unicode_bidi::Level::ltr(),
713 LayoutDirection::RTL => unicode_bidi::Level::rtl(),
714 }
715}
716
717#[cfg(test)]
718mod tests {
719 use zng_layout::context::{LayoutDirection, TextSegmentKind};
720 use zng_txt::ToTxt;
721
722 use crate::{BidiLevel, SegmentedText, TextSegment};
723
/// Line breaks, tabs and spaces each produce their own segment around single-letter words.
#[test]
fn segments() {
    use TextSegmentKind::*;

    let test = "a\nb\r\nc\td ";
    let actual = SegmentedText::new(test, LayoutDirection::LTR);

    // (kind, exclusive end byte offset) of every expected segment, all at the LTR level.
    let segments = [
        (LeftToRight, 1),
        (LineBreak, 2),
        (LeftToRight, 3),
        (LineBreak, 5),
        (LeftToRight, 6),
        (Tab, 7),
        (LeftToRight, 8),
        (Space, 9),
    ]
    .into_iter()
    .map(|(kind, end)| TextSegment {
        kind,
        end,
        level: BidiLevel::ltr(),
    })
    .collect();

    let expected = SegmentedText {
        text: test.to_txt(),
        segments,
        base_direction: LayoutDirection::LTR,
    };

    assert_eq!(expected, actual);
}
755
/// With an RTL base direction the five segments of `"0 2 4"` are reordered back to front.
#[test]
fn reorder_line() {
    let test = "0 2 4";
    let txt = SegmentedText::new(test, LayoutDirection::RTL);

    let expected = vec![4, 3, 2, 1, 0];
    // `reorder_line_to_ltr` takes a range of segment indexes, so use the segment count
    // rather than the byte length of the text (the two only happen to match here).
    let actual = txt.reorder_line_to_ltr(0..txt.segs().len());

    assert_eq!(expected, actual);
}
766
/// Reordering the first four segments of an RTL text that ends in a number.
#[test]
fn reorder_line_issue() {
    let txt = SegmentedText::new(" المادة 1", LayoutDirection::RTL);

    let actual = txt.reorder_line_to_ltr(0..4);

    let expected = vec![3, 2, 1, 0];
    assert_eq!(expected, actual);
}
777
/// Emoji sequences (modifiers, variation selectors, keycaps) collapse into single `Emoji` segments.
#[test]
fn emoji_seg() {
    use TextSegmentKind::*;

    let test = "'🙎🏻♀️'1# 1️⃣#️⃣";
    let txt = SegmentedText::new(test, LayoutDirection::LTR);
    let kinds: Vec<_> = txt.segs().iter().map(|seg| seg.kind).collect();

    let expected = vec![
        OtherNeutral,       // opening quote
        Emoji,              // emoji sequence between the quotes
        OtherNeutral,       // closing quote
        EuropeanNumber,     // plain `1`
        EuropeanTerminator, // plain `#`
        Space,
        Emoji, // keycap sequences
    ];
    assert_eq!(expected, kinds);
}
797
/// Every segment produced for the test string must be classified as `Emoji`.
#[test]
fn emoji_issues() {
    let txt = SegmentedText::new("🏴", LayoutDirection::LTR);

    for (t, seg) in txt.iter() {
        assert_eq!(seg.kind, TextSegmentKind::Emoji, "text: {t:?}");
    }
}
806}