1use std::{collections::HashMap, ops};
2
3use crate::emoji_util;
4
5use super::Txt;
6use unicode_bidi::{BidiDataSource as _, BidiInfo};
7
8use zng_layout::context::LayoutDirection;
9pub use zng_layout::context::TextSegmentKind;
10
11pub use unicode_bidi::Level as BidiLevel;
12
13#[derive(Clone, Copy, Debug, PartialEq, Eq)]
15pub struct TextSegment {
16 pub kind: TextSegmentKind,
18 pub level: BidiLevel,
20
21 pub end: usize,
25}
26impl TextSegment {
27 pub fn direction(self) -> LayoutDirection {
32 from_unic_level(self.level)
33 }
34}
35
36#[derive(Default, Debug, Clone, PartialEq, Eq)]
41pub struct SegmentedText {
42 text: Txt,
43 segments: Vec<TextSegment>,
44 base_direction: LayoutDirection,
45}
46impl SegmentedText {
47 pub fn new(text: impl Into<Txt>, base_direction: LayoutDirection) -> Self {
49 Self::new_text(text.into(), base_direction)
50 }
51 fn new_text(text: Txt, base_direction: LayoutDirection) -> Self {
52 let mut segs: Vec<TextSegment> = vec![];
53 let text_str: &str = &text;
54 let bidi = BidiInfo::new(text_str, Some(into_unic_level(base_direction)));
55
56 for (offset, kind) in unicode_linebreak::linebreaks(text_str) {
57 if let unicode_linebreak::BreakOpportunity::Mandatory = kind {
59 let start = segs.last().map(|s| s.end).unwrap_or(0);
61
62 let seg = &text_str[start..offset];
65
66 let break_start = if seg.ends_with("\r\n") {
67 offset - 2
69 } else if seg.ends_with('\n') || seg.ends_with('\r') || seg.ends_with('\u{85}') {
70 offset - 1
72 } else {
73 debug_assert_eq!(offset, text_str.len());
75 offset
76 };
77
78 if break_start > start {
79 Self::push_seg(text_str, &bidi, &mut segs, break_start);
81 }
82 if break_start < offset {
83 segs.push(TextSegment {
85 kind: TextSegmentKind::LineBreak,
86 end: offset,
87 level: bidi.levels[break_start],
88 })
89 }
90 }
91 }
93 SegmentedText {
94 text,
95 segments: segs,
96 base_direction,
97 }
98 }
99
100 fn push_seg(text: &str, bidi: &BidiInfo, segs: &mut Vec<TextSegment>, end: usize) {
101 let start = segs.last().map(|s| s.end).unwrap_or(0);
102
103 let mut char_indices = text[start..end].char_indices().peekable();
104
105 let mut kind = TextSegmentKind::LeftToRight;
106 let mut level = BidiLevel::ltr();
107 for (i, c) in &mut char_indices {
108 const ZWJ: char = '\u{200D}'; const VS16: char = '\u{FE0F}'; const CEK: char = '\u{20E3}'; let is_emoji = (kind == TextSegmentKind::Emoji && (
114 c == VS16 || c == CEK || c == ZWJ || emoji_util::is_modifier(c) || emoji_util::is_component(c) ))
120 || (emoji_util::maybe_emoji(c) && (emoji_util::definitely_emoji(c) || (text[start+i..].chars().nth(1).map(|c| c == VS16 || emoji_util::is_modifier(c)).unwrap_or(false))));
124
125 let (c_kind, c_level) = if is_emoji {
126 (TextSegmentKind::Emoji, level)
127 } else {
128 let k = match TextSegmentKind::from(bidi.original_classes[start + i]) {
129 TextSegmentKind::OtherNeutral if unicode_bidi::HardcodedBidiData.bidi_matched_opening_bracket(c).is_some() => {
130 TextSegmentKind::Bracket(c)
131 }
132 k => k,
133 };
134 (k, bidi.levels[start + i])
135 };
136
137 if c_kind != kind || c_level != level || !c_kind.can_merge() {
138 if i > 0 {
139 segs.push(TextSegment {
140 kind,
141 end: i + start,
142 level,
143 });
144 }
145 level = c_level;
146 kind = c_kind;
147 }
148 }
149 segs.push(TextSegment { kind, end, level });
150 }
151
152 pub fn text(&self) -> &Txt {
154 &self.text
155 }
156
157 pub fn segs(&self) -> &[TextSegment] {
159 &self.segments
160 }
161
162 pub fn seg_from_char(&self, from: usize) -> usize {
164 match self.segments.binary_search_by_key(&from, |s| s.end) {
165 Ok(e) => e + 1,
166 Err(s) => s,
167 }
168 }
169
170 pub fn base_direction(&self) -> LayoutDirection {
175 self.base_direction
176 }
177
178 pub fn is_bidi(&self) -> bool {
180 for seg in self.segments.iter() {
181 if seg.direction() != self.base_direction {
182 return true;
183 }
184 }
185 false
186 }
187
188 pub fn get(&self, index: usize) -> Option<(&str, TextSegment)> {
190 if let Some(&seg) = self.segments.get(index) {
191 let text = if index == 0 {
192 &self.text[..seg.end]
193 } else {
194 &self.text[self.segments[index - 1].end..seg.end]
195 };
196
197 Some((text, seg))
198 } else {
199 None
200 }
201 }
202
203 pub fn get_clone(&self, index: usize) -> Option<SegmentedText> {
205 self.get(index).map(|(txt, seg)| SegmentedText {
206 text: txt.to_owned().into(),
207 segments: vec![TextSegment { end: txt.len(), ..seg }],
208 base_direction: self.base_direction,
209 })
210 }
211
212 pub fn is_empty(&self) -> bool {
214 self.segments.is_empty()
215 }
216
217 pub fn into_parts(self) -> (Txt, Vec<TextSegment>, LayoutDirection) {
219 (self.text, self.segments, self.base_direction)
220 }
221
222 pub fn from_parts(text: Txt, segments: Vec<TextSegment>, base_direction: LayoutDirection) -> Self {
231 assert_eq!(text.is_empty(), segments.is_empty());
232 if !text.is_empty() {
233 assert!(segments.last().unwrap().end == text.len());
234 }
235
236 SegmentedText {
237 text,
238 segments,
239 base_direction,
240 }
241 }
242
243 pub fn iter(&self) -> SegmentedTextIter {
255 SegmentedTextIter {
256 text: &self.text,
257 start: 0,
258 segs_iter: self.segments.iter(),
259 }
260 }
261
262 pub fn text_range(&self, segs_range: ops::Range<usize>) -> ops::Range<usize> {
264 let start = if segs_range.start == 0 {
265 0
266 } else {
267 self.segments[segs_range.start - 1].end
268 };
269 let end = self.segments[..segs_range.end].last().map(|s| s.end).unwrap_or(0);
270 start..end
271 }
272
273 pub fn reorder_line_to_ltr(&self, segs_range: ops::Range<usize>) -> Vec<usize> {
277 let mut r = Vec::with_capacity(segs_range.len());
278 let offset = segs_range.start;
279 unicode_bidi_sort(
280 self.base_direction,
281 self.segments[segs_range].iter().map(|s| (s.kind, s.level)),
282 offset,
283 &mut r,
284 );
285 r
286 }
287
288 pub fn snap_char_boundary(&self, i: usize) -> usize {
293 if i >= self.text.len() {
294 self.text.len()
295 } else {
296 let mut next = i;
297 while !self.text.is_char_boundary(next) {
298 next += 1;
299 }
300 next
301 }
302 }
303
304 pub fn snap_grapheme_boundary(&self, i: usize) -> usize {
309 let i = self.snap_char_boundary(i);
310 if i == self.text.len() {
311 i
312 } else {
313 let mut seg_start = 0;
314 for seg in self.segments.iter() {
315 if seg.end > i {
316 break;
317 }
318 seg_start = seg.end;
319 }
320 let s = &self.text[seg_start..];
321
322 let seg_i = i - seg_start;
323 let mut best_before = 0;
324 let mut best_after = s.len();
325 for (i, _) in unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true) {
326 if i > seg_i {
327 best_after = i;
328 break;
329 }
330 best_before = i;
331 }
332
333 let best = if best_after - seg_i > seg_i - best_before {
334 best_before
335 } else {
336 best_after
337 };
338 seg_start + best
339 }
340 }
341
342 pub fn next_insert_index(&self, from: usize) -> usize {
350 if from == self.text.len() {
351 from
352 } else {
353 let s = &self.text.as_str()[from..];
354 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true).map(|(i, _)| i + from);
355 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
356 iter.next().unwrap_or(self.text.len())
357 }
358 }
359
360 pub fn prev_insert_index(&self, from: usize) -> usize {
368 if from == self.text.len() {
369 let s = &self.text.as_str()[..from];
370 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
371 .map(|(i, _)| i)
372 .rev();
373 iter.next().unwrap_or(0)
374 } else {
375 let s = self.text.as_str();
376
377 let inclusive_from = s[from..].char_indices().nth(1).map(|(b, _)| from + b).unwrap_or_else(|| s.len());
379
380 let s = &self.text.as_str()[..inclusive_from];
381 let mut iter = unicode_segmentation::UnicodeSegmentation::grapheme_indices(s, true)
382 .map(|(i, _)| i)
383 .rev();
384 assert_eq!(iter.next(), Some(from), "`from` was not a grapheme boundary");
385 iter.next().unwrap_or(0)
386 }
387 }
388
389 pub fn next_word_index(&self, from: usize) -> usize {
393 let mut segs = self.segments[self.seg_from_char(from)..].iter();
394
395 if let Some(seg) = segs.next() {
396 if seg.kind.is_line_break() {
397 return seg.end;
398 }
399 let mut start = seg.end;
400 for seg in segs {
401 if seg.kind.is_word() || seg.kind.is_line_break() {
402 return start;
403 }
404 start = seg.end;
405 }
406 }
407 self.text.len()
408 }
409
410 pub fn next_word_end_index(&self, from: usize) -> usize {
414 let mut segs = self.segments[self.seg_from_char(from)..].iter();
415 if let Some(seg) = segs.next() {
416 if seg.kind.is_word() || seg.kind.is_line_break() {
417 return seg.end;
418 }
419 for seg in segs {
420 if seg.kind.is_word() || seg.kind.is_line_break() {
421 return seg.end;
422 }
423 }
424 }
425 self.text.len()
426 }
427
428 pub fn prev_word_index(&self, from: usize) -> usize {
432 let seg_i = self.seg_from_char(from);
433 let mut segs = if seg_i < self.segments.len() {
434 self.segments[..=seg_i].iter().rev()
435 } else {
436 self.segs().iter().rev()
437 };
438 let mut seg_kind = TextSegmentKind::Space;
439 for seg in &mut segs {
440 if seg.end < from {
441 if seg_kind.is_word() || seg.kind.is_line_break() {
442 return seg.end;
444 }
445 seg_kind = seg.kind;
446 for seg in segs {
447 if seg_kind.is_word() || seg.kind.is_line_break() {
448 return seg.end;
450 }
451 seg_kind = seg.kind;
452 }
453 break;
454 } else if seg.end == from && seg.kind.is_line_break() {
455 return segs.next().map(|p| p.end).unwrap_or(0);
457 }
458 seg_kind = seg.kind;
459 }
460 0
461 }
462
463 pub fn line_start_index(&self, from: usize) -> usize {
469 let line_break = self.text.as_str()[..from]
470 .char_indices()
471 .rev()
472 .find(|(_, c)| "\n\r\u{85}".contains(*c));
473
474 match line_break {
475 Some((i, _)) => i + 1,
476 None => 0,
477 }
478 }
479
480 pub fn line_end_index(&self, from: usize) -> usize {
486 if from == self.text.len() {
487 return from;
488 }
489
490 let line_break = self.text.as_str()[from..].char_indices().find(|(_, c)| "\n\r\u{85}".contains(*c));
491
492 match line_break {
493 Some((i, _)) => from + i,
494 None => self.text.len(),
495 }
496 }
497
498 pub fn delete_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
509 let mut end = from;
510 for _ in 0..count {
511 let e = self.next_insert_index(end);
512 if e == end {
513 break;
514 }
515 end = e;
516 }
517
518 from..end
519 }
520
521 pub fn backspace_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
531 let mut start = from;
532 for _ in 0..count {
533 let s = self.backspace_start(start);
534 if s == start {
535 break;
536 }
537 start = s;
538 }
539 start..from
540 }
541 fn backspace_start(&self, from: usize) -> usize {
542 let text = &self.text[..from];
543 let mut start = from;
544 for (i, c) in text.char_indices().rev() {
545 start = i;
546 match c {
547 '\u{200D}' => continue, '\n' => {
549 if text[..i].ends_with('\r') {
550 start = i - 1;
551 }
552 }
553 c if c == '\u{FE0F}' || emoji_util::is_modifier(c) => {
554 if let Some((i, c)) = text[..i].char_indices().next_back() {
556 if emoji_util::maybe_emoji(c) {
557 start = i;
558 }
559 }
560 }
561 _ => {}
562 }
563 break;
564 }
565 start
566 }
567
568 pub fn backspace_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
572 let mut start = from;
573 for _ in 0..count {
574 let s = self.prev_word_index(start);
575 if s == start {
576 break;
577 }
578 start = s;
579 }
580 start..from
581 }
582
583 pub fn delete_word_range(&self, from: usize, count: u32) -> std::ops::Range<usize> {
585 let mut end = from;
586 for _ in 0..count {
587 let e = self.next_word_end_index(end);
588 if e == end {
589 break;
590 }
591 end = e;
592 }
593
594 from..end
595 }
596}
597
598pub fn unicode_bidi_levels(base_direction: LayoutDirection, line: impl Iterator<Item = TextSegmentKind>, levels: &mut Vec<BidiLevel>) {
602 let mut original_classes = Vec::with_capacity(line.size_hint().0);
603 let mut brackets = HashMap::default();
604 for (i, k) in line.enumerate() {
605 original_classes.push(k.into());
606 if let TextSegmentKind::Bracket(c) = k {
607 brackets.insert(i, c);
608 }
609 }
610
611 unicode_bidi_levels_impl(levels, base_direction, original_classes, brackets);
612}
613fn unicode_bidi_levels_impl(
614 levels: &mut Vec<BidiLevel>,
615 base_direction: LayoutDirection,
616 original_classes: Vec<unicode_bidi::BidiClass>,
617 brackets: HashMap<usize, char>,
618) {
619 levels.clear();
620 let para_level = into_unic_level(base_direction);
621 levels.resize(original_classes.len(), para_level);
622
623 if !original_classes.is_empty() {
624 let mut processing_classes = original_classes.clone();
625
626 super::unicode_bidi_util::explicit_compute(para_level, &original_classes, levels, &mut processing_classes);
627
628 let sequences = super::unicode_bidi_util::prepare_isolating_run_sequences(para_level, &original_classes, levels);
629 for sequence in &sequences {
630 super::unicode_bidi_util::implicit_resolve_weak(sequence, &mut processing_classes);
631 super::unicode_bidi_util::implicit_resolve_neutral(sequence, levels, &original_classes, &mut processing_classes, &brackets);
632 }
633 super::unicode_bidi_util::implicit_resolve_levels(&processing_classes, levels);
634
635 super::unicode_bidi_util::assign_levels_to_removed_chars(para_level, &original_classes, levels);
636 }
637}
638
639pub fn unicode_bidi_sort(
643 base_direction: LayoutDirection,
644 line: impl Iterator<Item = (TextSegmentKind, BidiLevel)>,
645 idx_offset: usize,
646 sort_map: &mut Vec<usize>,
647) {
648 sort_map.clear();
649
650 let cap = line.size_hint().0;
651 let mut line_classes = Vec::with_capacity(cap);
652 let mut levels = Vec::with_capacity(cap);
653 for (kind, level) in line {
654 line_classes.push(kind.into());
655 levels.push(level);
656 }
657
658 if !levels.is_empty() {
659 let (directions, vis_ranges) = super::unicode_bidi_util::visual_runs(levels, line_classes, into_unic_level(base_direction));
660
661 for vis_range in vis_ranges {
662 if directions[vis_range.start].is_rtl() {
663 for i in vis_range.rev() {
664 sort_map.push(idx_offset + i);
665 }
666 } else {
667 for i in vis_range {
668 sort_map.push(idx_offset + i);
669 }
670 }
671 }
672 }
673}
674
675pub struct SegmentedTextIter<'a> {
679 text: &'a str,
680 start: usize,
681 segs_iter: std::slice::Iter<'a, TextSegment>,
682}
683impl<'a> Iterator for SegmentedTextIter<'a> {
684 type Item = (&'a str, TextSegment);
685 fn next(&mut self) -> Option<Self::Item> {
686 if let Some(&seg) = self.segs_iter.next() {
687 let r = Some((&self.text[self.start..seg.end], seg));
688 self.start = seg.end;
689 r
690 } else {
691 None
692 }
693 }
694}
695
696fn from_unic_level(d: unicode_bidi::Level) -> LayoutDirection {
697 if d.is_ltr() { LayoutDirection::LTR } else { LayoutDirection::RTL }
698}
699fn into_unic_level(d: LayoutDirection) -> unicode_bidi::Level {
700 match d {
701 LayoutDirection::LTR => unicode_bidi::Level::ltr(),
702 LayoutDirection::RTL => unicode_bidi::Level::rtl(),
703 }
704}
705
706#[cfg(test)]
707mod tests {
708 use zng_layout::context::{LayoutDirection, TextSegmentKind};
709 use zng_txt::ToTxt;
710
711 use crate::{BidiLevel, SegmentedText, TextSegment};
712
// Line-breaks, tabs and spaces are split into segments of their own.
#[test]
fn segments() {
    use TextSegmentKind::*;

    let test = "a\nb\r\nc\td ";

    // every segment of this text is at the LTR level.
    let ltr = |kind: TextSegmentKind, end: usize| TextSegment {
        kind,
        end,
        level: BidiLevel::ltr(),
    };

    let expected = SegmentedText {
        text: test.to_txt(),
        segments: vec![
            ltr(LeftToRight, 1),
            ltr(LineBreak, 2),
            ltr(LeftToRight, 3),
            ltr(LineBreak, 5),
            ltr(LeftToRight, 6),
            ltr(Tab, 7),
            ltr(LeftToRight, 8),
            ltr(Space, 9),
        ],
        base_direction: LayoutDirection::LTR,
    };
    let actual = SegmentedText::new(test, LayoutDirection::LTR);

    assert_eq!(expected, actual);
}
744
// A line of numbers and spaces in an RTL text is fully reversed.
#[test]
fn reorder_line() {
    let test = "0 2 4";
    let txt = SegmentedText::new(test, LayoutDirection::RTL);

    // NOTE(review): the range is in segments, `test.len()` is only used here because the
    // expected result has one segment per byte of this text.
    let actual = txt.reorder_line_to_ltr(0..test.len());

    assert_eq!(vec![4, 3, 2, 1, 0], actual);
}
755
// Regression test, an RTL line that ends with a number is fully reversed.
#[test]
fn reorder_line_issue() {
    let test = " المادة 1";
    let txt = SegmentedText::new(test, LayoutDirection::RTL);

    let actual = txt.reorder_line_to_ltr(0..4);

    assert_eq!(vec![3, 2, 1, 0], actual);
}
766
// Emoji sequences are a single segment, a digit or `#` that is not followed by the
// emoji selectors is not part of one.
#[test]
fn emoji_seg() {
    let test = "'🙎🏻♀️'1# 1️⃣#️⃣";
    let txt = SegmentedText::new(test, LayoutDirection::LTR);

    let expected = vec![
        TextSegmentKind::OtherNeutral,
        TextSegmentKind::Emoji,
        TextSegmentKind::OtherNeutral,
        TextSegmentKind::EuropeanNumber,
        TextSegmentKind::EuropeanTerminator,
        TextSegmentKind::Space,
        TextSegmentKind::Emoji,
    ];
    let actual: Vec<_> = txt.segs().iter().map(|seg| seg.kind).collect();

    assert_eq!(expected, actual);
}
786
// Regression test, every segment of the text must be an emoji segment.
#[test]
fn emoji_issues() {
    let test = "🏴";
    let txt = SegmentedText::new(test, LayoutDirection::LTR);

    txt.iter()
        .for_each(|(t, seg)| assert_eq!(seg.kind, TextSegmentKind::Emoji, "text: {t:?}"));
}
795}