convert_case/
segmentation.rs

1#[cfg(test)]
2use strum_macros::EnumIter;
3
4use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor};
5
/// A boundary defines how a string is split into words.  Some boundaries, `Hyphen`, `Underscore`,
/// and `Space`, consume the character they split on, whereas the other boundaries
/// do not.
///
/// The enum offers methods that return `Vec`s containing useful groups of boundaries.  It also
/// contains the [`list_from`](Boundary::list_from) method which will generate a list of boundaries
/// based on a string slice.
///
/// Note that all boundaries are distinct and do not share functionality.  That is, there is no
/// `DigitLetter` variant, because that would be equivalent to the current `DigitUpper` and
/// `DigitLower` variants taken together.  For common functionality, consider using
/// the provided functions that return a list of boundaries.
///
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum Boundary {
    /// Splits on `-`, consuming the character on segmentation.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`, consuming the character on segmentation.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Splits on a space, consuming the character on segmentation.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Splits where an uppercase letter is followed by a lowercase letter.  This is seldom used,
    /// and is not included in the [defaults](Boundary::defaults).
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Splits where a lowercase letter is followed by an uppercase letter.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Splits where a digit is followed by an uppercase letter.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Splits where an uppercase letter is followed by a digit.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Splits where a digit is followed by a lowercase letter.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Splits where a lowercase letter is followed by a digit.
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
    /// The word boundary is between the two uppercase letters.  For example, "HTTPRequest"
    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
    ///
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}
140
141impl Boundary {
142    /// Returns a list of all boundaries that are identified within the given string.
143    /// Could be a short of writing out all the boundaries in a list directly.  This will not
144    /// identify boundary `UpperLower` if it also used as part of `Acronym`.
145    ///
146    /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
147    /// character.
148    /// ```
149    /// use convert_case::Boundary;
150    /// use Boundary::*;
151    /// assert_eq!(
152    ///     vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower],
153    ///     Boundary::list_from("aA8a -")
154    /// );
155    /// assert_eq!(
156    ///     vec![Underscore, LowerUpper, DigitUpper, Acronym],
157    ///     Boundary::list_from("bD:0B:_:AAa")
158    /// );
159    /// ```
160    pub fn list_from(s: &str) -> Vec<Self> {
161        Boundary::all().iter().filter(|boundary| {
162            let left_iter = s.graphemes(true);
163            let mid_iter = s.graphemes(true).skip(1);
164            let right_iter = s.graphemes(true).skip(2);
165
166            let mut one_iter = left_iter.clone();
167
168            // Also capture when the previous pair was both uppercase, so we don't
169            // match the UpperLower boundary in the case of Acronym
170            let two_iter = left_iter.clone().zip(mid_iter.clone());
171            let mut two_iter_and_upper = two_iter.clone()
172                .zip(std::iter::once(false).chain(
173                        two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b))
174                ));
175
176            let mut three_iter = left_iter.zip(mid_iter).zip(right_iter);
177
178            one_iter.any(|a| boundary.detect_one(a))
179                || two_iter_and_upper.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro)
180                || three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c))
181        }).copied().collect()
182    }
183
184    /// The default list of boundaries used when `Casing::to_case` is called directly
185    /// and in a `Converter` generated from `Converter::new()`.  This includes
186    /// all the boundaries except the `UpperLower` boundary.
187    /// ```
188    /// use convert_case::Boundary;
189    /// use Boundary::*;
190    /// assert_eq!(
191    ///     vec![
192    ///         Underscore, Hyphen, Space, LowerUpper, UpperDigit, 
193    ///         DigitUpper, DigitLower, LowerDigit, Acronym,
194    ///     ],
195    ///     Boundary::defaults()
196    /// );
197    /// ```
198    pub fn defaults() -> Vec<Self> {
199        use Boundary::*;
200        vec![
201            Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit,
202            Acronym,
203        ]
204    }
205
206    /// Returns the boundaries that split around single characters: `Hyphen`,
207    /// `Underscore`, and `Space`.
208    /// ```
209    /// use convert_case::Boundary;
210    /// use Boundary::*;
211    /// assert_eq!(
212    ///     vec![Hyphen, Underscore, Space],
213    ///     Boundary::delims()
214    /// );
215    /// ```
216    pub fn delims() -> Vec<Self> {
217        use Boundary::*;
218        vec![Hyphen, Underscore, Space]
219    }
220
221    /// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and
222    /// `LowerDigit`.
223    /// ```
224    /// use convert_case::Boundary;
225    /// use Boundary::*;
226    /// assert_eq!(
227    ///     vec![DigitUpper, UpperDigit, DigitLower, LowerDigit],
228    ///     Boundary::digits()
229    /// );
230    /// ```
231    pub fn digits() -> Vec<Self> {
232        use Boundary::*;
233        vec![DigitUpper, UpperDigit, DigitLower, LowerDigit]
234    }
235
236    /// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`.
237    /// ```
238    /// use convert_case::Boundary;
239    /// use Boundary::*;
240    /// assert_eq!(
241    ///     vec![UpperDigit, LowerDigit],
242    ///     Boundary::letter_digit()
243    /// );
244    /// ```
245    pub fn letter_digit() -> Vec<Self> {
246        use Boundary::*;
247        vec![UpperDigit, LowerDigit]
248    }
249
250    /// Returns the boundaries that are digits followed by letters: `DigitUpper` and
251    /// `DigitLower`.
252    /// ```
253    /// use convert_case::Boundary;
254    /// use Boundary::*;
255    /// assert_eq!(
256    ///     vec![DigitUpper, DigitLower],
257    ///     Boundary::digit_letter()
258    /// );
259    /// ```
260    pub fn digit_letter() -> Vec<Self> {
261        use Boundary::*;
262        vec![DigitUpper, DigitLower]
263    }
264
265    /// Returns all boundaries.  Note that this includes the `UpperLower` variant which
266    /// might be unhelpful.  Please look at [`Boundary::defaults`].
267    /// ```
268    /// use convert_case::Boundary;
269    /// use Boundary::*;
270    /// assert_eq!(
271    ///     vec![
272    ///         Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper,
273    ///         UpperDigit, DigitLower, LowerDigit, Acronym,
274    ///     ],
275    ///     Boundary::all()
276    /// );
277    /// ```
278    pub fn all() -> Vec<Self> {
279        use Boundary::*;
280        vec![
281            Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit, 
282            DigitLower, LowerDigit, Acronym
283        ]
284    }
285
286    fn detect_one(&self, c: &str) -> bool {
287        use Boundary::*;
288        match self {
289            Hyphen => c == "-",
290            Underscore => c == "_",
291            Space => c == " ",
292            _ => false,
293        }
294    }
295
296    fn detect_two(&self, c: &str, d: &str) -> bool {
297        use Boundary::*;
298        match self {
299            UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d),
300            LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d),
301            DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d),
302            UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d),
303            DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d),
304            LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d),
305            _ => false,
306        }
307    }
308
309    fn detect_three(&self, c: &str, d: &str, e: &str) -> bool {
310        use Boundary::*;
311        if let Acronym = self {
312            grapheme_is_uppercase(c)
313                && grapheme_is_uppercase(d)
314                && grapheme_is_lowercase(e)
315        } else {
316            false
317        }
318    }
319}
320
/// Returns `true` when `c` is non-empty and consists solely of ASCII digits (`0`-`9`).
///
/// The emptiness check is explicit because `Iterator::all` is vacuously true for an empty
/// string; without it `""` would be classified as a digit, unlike the uppercase and lowercase
/// predicates, which both reject the empty string.
fn grapheme_is_digit(c: &str) -> bool {
    !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit())
}
324
/// Returns `true` when `c` is cased (its uppercase and lowercase forms differ) and is already
/// equal to its uppercase form.
fn grapheme_is_uppercase(c: &str) -> bool {
    let upper = c.to_uppercase();
    upper == c && upper != c.to_lowercase()
}
328
/// Returns `true` when `c` is cased (its uppercase and lowercase forms differ) and is already
/// equal to its lowercase form.
fn grapheme_is_lowercase(c: &str) -> bool {
    let lower = c.to_lowercase();
    lower == c && lower != c.to_uppercase()
}
332
333pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String>
334where
335    T: AsRef<str>,
336{
337    use std::iter::once;
338    // create split_points function that counts off by graphemes into list
339    
340    let s = s.as_ref();
341
342    // Some<bool> means the following
343    // None: no split
344    // Some(false): split between characters
345    // Some(true): split consuming characters
346
347    let left_iter = s.graphemes(true);
348    let mid_iter = s.graphemes(true).skip(1);
349    let right_iter = s.graphemes(true).skip(2);
350
351    let singles = left_iter.clone();
352    let doubles = left_iter.clone().zip(mid_iter.clone());
353    let triples = left_iter.zip(mid_iter).zip(right_iter);
354
355    let singles = singles
356        .map(|c| boundaries.iter().any(|b| b.detect_one(c)))
357        .map(|split| if split {Some(true)} else {None});
358    let doubles = doubles
359        .map(|(c,d)| boundaries.iter().any(|b| b.detect_two(c, d)))
360        .map(|split| if split {Some(false)} else {None});
361    let triples = triples
362        .map(|((c,d),e)| boundaries.iter().any(|b| b.detect_three(c, d, e)))
363        .map(|split| if split {Some(false)} else {None});
364
365    let split_points = singles
366        .zip(once(None).chain(doubles))
367        .zip(once(None).chain(triples).chain(once(None)))
368        .map(|((s, d), t)| s.or(d).or(t));
369
370    let mut words = Vec::new();
371    let mut word = String::new();
372    for (c, split) in s.graphemes(true).zip(split_points) {
373        match split {
374            // no split here
375            None => word.push_str(c),
376            // split here, consume letter
377            Some(true) => words.push(std::mem::take(&mut word)),
378            // split here, keep letter
379            Some(false) => {
380                words.push(std::mem::take(&mut word));
381                word.push_str(c);
382            }
383        }
384    }
385    words.push(word);
386
387    /*
388    let mut words = Vec::new();
389    let mut left_idx = 0;
390    let mut total_chars = 0;
391    let mut skip = 0;
392    let mut cur = GraphemeCursor::new(left_idx, s.len(), true);
393
394    for (right_idx, split) in split_points.enumerate() {
395        match split {
396            // no split here
397            None => {},
398            // split here, consume letter
399            Some(true) => {
400                let mut right_bound = left_bound;
401                for _ in 0..total_chars {
402                    right_bound = cur.next_boundary(s, skip).unwrap().unwrap();
403                }
404                words.push(&s[left_bound..right_bound])
405            }
406            // split here, keep letter
407            Some(false) => {
408            }
409            // dont push an empty string, do nothing
410            _ => {}
411        }
412    }
413    */
414
415    words.into_iter().filter(|s| !s.is_empty()).collect()
416}
417
#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    /// Every enum variant must be listed by `Boundary::all`.
    #[test]
    fn all_boundaries_in_iter() {
        let listed = Boundary::all();
        assert!(Boundary::iter().all(|boundary| listed.contains(&boundary)));
    }

    /// Delimiter boundaries split the string and drop the delimiter itself.
    #[test]
    fn split_on_delims() {
        let words = split("my_word-list separated-by_delims", &Boundary::delims());
        assert_eq!(
            vec!["my", "word", "list", "separated", "by", "delims"],
            words
        )
    }

    /// `Boundary::list_from` reports the boundaries present in a pattern string.
    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;
        let cases = vec![
            (".Aaaa", vec![UpperLower]),
            ("a8.Aa.aA", vec![LowerUpper, UpperLower, LowerDigit]),
            ("b1B1b", Boundary::digits()),
            ("AAa -_", vec![Hyphen, Underscore, Space, Acronym]),
        ];
        for (pattern, expected) in cases {
            assert_eq!(expected, Boundary::list_from(pattern));
        }
    }
}
459}