rust_analyzer/
line_index.rs

//! Enhances `ide::LineIndex` with additional info required to convert offsets
//! into LSP positions.
//!
//! We maintain the invariant that all internal strings use `\n` as the line
//! separator. This module does line ending conversion and detection (so that
//! we can convert back to `\r\n` on the way out).
7
8use ide_db::line_index::WideEncoding;
9use memchr::memmem;
10use triomphe::Arc;
11
12#[derive(Clone, Copy)]
13pub enum PositionEncoding {
14    Utf8,
15    Wide(WideEncoding),
16}
17
18pub(crate) struct LineIndex {
19    pub(crate) index: Arc<ide::LineIndex>,
20    pub(crate) endings: LineEndings,
21    pub(crate) encoding: PositionEncoding,
22}
23
/// Line-ending style of a piece of text.
///
/// The derived ordering follows declaration order, so `Unix < Dos`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum LineEndings {
    /// Lines are separated by `\n`.
    Unix,
    /// Lines are separated by `\r\n`.
    Dos,
}
29
30impl LineEndings {
31    /// Replaces `\r\n` with `\n` in-place in `src`.
32    pub(crate) fn normalize(src: String) -> (String, LineEndings) {
33        // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
34        // While we *can* call `as_mut_vec` and do surgery on the live string
35        // directly, let's rather steal the contents of `src`. This makes the code
36        // safe even if a panic occurs.
37
38        let mut buf = src.into_bytes();
39        let mut gap_len = 0;
40        let mut tail = buf.as_mut_slice();
41        let mut crlf_seen = false;
42
43        let finder = memmem::Finder::new(b"\r\n");
44
45        loop {
46            let idx = match finder.find(&tail[gap_len..]) {
47                None if crlf_seen => tail.len(),
48                // SAFETY: buf is unchanged and therefore still contains utf8 data
49                None => return (unsafe { String::from_utf8_unchecked(buf) }, LineEndings::Unix),
50                Some(idx) => {
51                    crlf_seen = true;
52                    idx + gap_len
53                }
54            };
55            tail.copy_within(gap_len..idx, 0);
56            tail = &mut tail[idx - gap_len..];
57            if tail.len() == gap_len {
58                break;
59            }
60            gap_len += 1;
61        }
62
63        // Account for removed `\r`.
64        // After `set_len`, `buf` is guaranteed to contain utf-8 again.
65        let new_len = buf.len() - gap_len;
66        let src = unsafe {
67            buf.set_len(new_len);
68            String::from_utf8_unchecked(buf)
69        };
70        (src, LineEndings::Dos)
71    }
72}
73
#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes `input` and checks the detected line-ending style first,
    /// then the resulting text.
    #[track_caller]
    fn check(input: &str, expected_text: &str, expected_endings: LineEndings) {
        let (actual_text, actual_endings) = LineEndings::normalize(input.to_owned());
        assert_eq!(actual_endings, expected_endings);
        assert_eq!(actual_text, expected_text);
    }

    /// Text containing only `\n` is returned unchanged and reported as Unix.
    #[test]
    fn unix() {
        let text = "a\nb\nc\n\n\n\n";
        check(text, text, LineEndings::Unix);
    }

    /// Every `\r\n` collapses to `\n`, including leading and consecutive ones.
    #[test]
    fn dos() {
        check(
            "\r\na\r\n\r\nb\r\nc\r\n\r\n\r\n\r\n",
            "\na\n\nb\nc\n\n\n\n",
            LineEndings::Dos,
        );
    }

    /// A mix of `\r\n` and bare `\n` is reported as Dos and fully normalized.
    #[test]
    fn mixed() {
        check("a\r\nb\r\nc\r\n\n\r\n\n", "a\nb\nc\n\n\n\n", LineEndings::Dos);
    }

    /// Text without any line break is returned unchanged and reported as Unix.
    #[test]
    fn none() {
        let text = "abc";
        check(text, text, LineEndings::Unix);
    }
}