diff --git a/src/core_editor/graphemes.rs b/src/core_editor/graphemes.rs new file mode 100644 index 000000000..808c614dd --- /dev/null +++ b/src/core_editor/graphemes.rs @@ -0,0 +1,141 @@ +use unicode_segmentation::UnicodeSegmentation; + +/// Byte index just past the grapheme starting at `pos`, i.e. the first boundary after `pos`. +/// +/// Returns `buf.len()` if that grapheme is the last one or `pos` is already at the end. +/// +/// # Panics +/// +/// Panics if `pos` is past the end of `buf` or not on a UTF-8 character boundary. +pub fn next_grapheme_boundary(buf: &str, pos: usize) -> usize { + buf[pos..] + .grapheme_indices(true) + .nth(1) + .map(|(i, _)| pos + i) + .unwrap_or(buf.len()) +} + +/// Byte index where the last grapheme of `buf[..pos]` starts, i.e. the first boundary before `pos`. +/// +/// Returns `0` if `pos` is `0` (there is no grapheme before it). +/// +/// # Panics +/// +/// Panics if `pos` is past the end of `buf` or not on a UTF-8 character boundary. +pub fn prev_grapheme_boundary(buf: &str, pos: usize) -> usize { + buf[..pos] + .grapheme_indices(true) + .next_back() + .map(|(i, _)| i) + .unwrap_or(0) +} + +#[cfg(test)] +mod tests { + use super::*; + + // --- next_grapheme_boundary --------------------------------------------- + + #[test] + fn next_advances_one_ascii_char() { + assert_eq!(next_grapheme_boundary("abc", 0), 1); + } + + #[test] + fn next_returns_buf_len_when_at_end() { + assert_eq!(next_grapheme_boundary("abc", 3), 3); + } + + #[test] + fn next_on_empty_buffer_returns_zero() { + assert_eq!(next_grapheme_boundary("", 0), 0); + } + + #[test] + fn next_skips_two_byte_utf8_grapheme() { + assert_eq!(next_grapheme_boundary("café!", 3), 5); + } + + #[test] + fn next_at_end_returns_buf_len() { + let buf = "café"; + assert_eq!(next_grapheme_boundary(buf, 3), buf.len()); + } + + #[test] + fn next_treats_combining_mark_as_single_grapheme() { + assert_eq!(next_grapheme_boundary("e\u{0301}", 0), 3); + } + + #[test] + fn next_advances_one_cjk_char() { + assert_eq!(next_grapheme_boundary("日本", 0), 3); + } + + #[test] + fn next_skips_zwj_emoji_sequence_as_one() { + // family-emoji + 
`!`. From 0, skip the whole 18-byte sequence and land on `!` + let prefix = "👨‍👩‍👧"; + assert_eq!(next_grapheme_boundary("👨‍👩‍👧!", 0), prefix.len()); + } + + // --- prev_grapheme_boundary --------------------------------------------- + + #[test] + fn prev_retreats_one_ascii_char() { + assert_eq!(prev_grapheme_boundary("abc", 2), 1); + } + + #[test] + fn prev_at_zero_returns_zero() { + assert_eq!(prev_grapheme_boundary("abc", 0), 0); + } + + #[test] + fn prev_retreats_past_two_byte_utf8_grapheme() { + // from byte 5 (end of "café") retreat past `é` to byte 3 (its start) + let buf = "café"; + assert_eq!(prev_grapheme_boundary(buf, buf.len()), 3); + } + + #[test] + fn prev_retreats_past_combining_mark() { + // 'a' + combined 'é' (3 bytes). From end, retreat past combined grapheme to byte 1 + let buf = "ae\u{0301}"; + assert_eq!(prev_grapheme_boundary(buf, buf.len()), 1); + } + + #[test] + fn prev_retreats_past_zwj_emoji_sequence() { + // 'a' + family-emoji (18 bytes). From end, retreat past the family to byte 1 + let buf = "a👨‍👩‍👧"; + assert_eq!(prev_grapheme_boundary(buf, buf.len()), 1); + } + + // --- round-trip ---------------------------------------------------------- + + #[test] + fn next_then_prev_returns_to_origin_for_ascii() { + let buf = "abc"; + for (pos, _) in buf.grapheme_indices(true) { + assert_eq!( + prev_grapheme_boundary(buf, next_grapheme_boundary(buf, pos)), + pos, + "round-trip failed at pos {pos}" + ); + } + } + + #[test] + fn next_then_prev_returns_to_origin_for_unicode() { + // mix ASCII, multi-byte, combining mark, and ZWJ emoji + let buf = "a日e\u{0301}👨‍👩‍👧"; + for (pos, _) in buf.grapheme_indices(true) { + assert_eq!( + prev_grapheme_boundary(buf, next_grapheme_boundary(buf, pos)), + pos, + "round-trip failed at pos {pos}" + ); + } + } +} diff --git a/src/core_editor/line_buffer.rs b/src/core_editor/line_buffer.rs index 416e13fe6..1582d4960 100644 --- a/src/core_editor/line_buffer.rs +++ b/src/core_editor/line_buffer.rs @@ -1,4 +1,5 @@ use { 
+ crate::core_editor::graphemes::{next_grapheme_boundary, prev_grapheme_boundary}, itertools::Itertools, std::{convert::From, ops::Range}, unicode_segmentation::UnicodeSegmentation, @@ -174,20 +175,12 @@ impl LineBuffer { /// Cursor position *behind* the next unicode grapheme to the right from the given position pub fn grapheme_right_index_from_pos(&self, pos: usize) -> usize { - self.lines[pos..] - .grapheme_indices(true) - .nth(1) - .map(|(i, _)| pos + i) - .unwrap_or_else(|| self.lines.len()) + next_grapheme_boundary(&self.lines, pos) } /// Cursor position *behind* the previous unicode grapheme to the left from the given position pub(crate) fn grapheme_left_index_from_pos(&self, pos: usize) -> usize { - self.lines[..pos] - .grapheme_indices(true) - .next_back() - .map(|(i, _)| i) - .unwrap_or(0) + prev_grapheme_boundary(&self.lines, pos) } /// Cursor position *behind* the next word to the right @@ -223,13 +216,7 @@ impl LineBuffer { .map(|x| self.insertion_point + x.0 + i) .filter(|x| !is_whitespace_str(word) && *x != self.insertion_point) }) - .unwrap_or_else(|| { - self.lines - .grapheme_indices(true) - .next_back() - .map(|x| x.0) - .unwrap_or(0) - }) + .unwrap_or_else(|| prev_grapheme_boundary(&self.lines, self.lines.len())) } /// Cursor position *at end of* the next WORD to the right @@ -248,13 +235,7 @@ impl LineBuffer { None } }) - .unwrap_or_else(|| { - self.lines - .grapheme_indices(true) - .next_back() - .map(|x| x.0) - .unwrap_or(0) - }) + .unwrap_or_else(|| prev_grapheme_boundary(&self.lines, self.lines.len())) } /// Cursor position *in front of* the next word to the right diff --git a/src/core_editor/mod.rs b/src/core_editor/mod.rs index b30009bb2..721db3239 100644 --- a/src/core_editor/mod.rs +++ b/src/core_editor/mod.rs @@ -1,6 +1,7 @@ mod clip_buffer; mod edit_stack; mod editor; +mod graphemes; mod line_buffer; #[cfg(feature = "system_clipboard")]